From 1a4a89b090f9e3eb234c29fa227bbba2df9f2438 Mon Sep 17 00:00:00 2001
From: pborcin
Date: Fri, 8 Nov 2024 16:21:57 +0100
Subject: [PATCH] add rgb565 acceleration

---
 .../simd/lv_color_blend_to_argb8888_esp32s3.S |   4 +-
 .../simd/lv_color_blend_to_rgb565_esp32s3.S   | 260 +++++++++++++++++-
 .../esp_lvgl_port/test_apps/simd/README.md    |  11 +
 .../simd/main/test_lv_fill_functionality.c    |   4 +-
 4 files changed, 272 insertions(+), 7 deletions(-)

diff --git a/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_argb8888_esp32s3.S b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_argb8888_esp32s3.S
index 4d9f84f1..bb3956e6 100644
--- a/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_argb8888_esp32s3.S
+++ b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_argb8888_esp32s3.S
@@ -184,8 +184,6 @@ lv_color_blend_to_argb8888_esp:
     // dest_w (a4) - 4-byte multiple

-    mov a13, a3
-
     ee.zero.q q1                            // clear q1
     ee.orq q1, q1, q0                       // copy q0 to q1
     sub a6, a6, a11                         // dest_stride = dest_stride - dest_w_bytes
@@ -242,7 +240,7 @@ lv_color_blend_to_argb8888_esp:
     ee.vst.128.ip q2, a3, 16                // store 16 bytes from q0 to dest_buff a3
     ._main_loop_unaligned_by_1byte:

-    // Firstly check mod 0 and mod 1 - correcting the aligned memory access
+    // First check mod 2 and mod 1 - correcting the aligned memory access
     // Go back in one Byte, allow to correct after ee.vst.128.ip aligned access
     addi a3, a3, -4

diff --git a/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb565_esp32s3.S b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb565_esp32s3.S
index 07b5aa11..ee9f8a9c 100644
--- a/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb565_esp32s3.S
+++ b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb565_esp32s3.S
@@ -4,7 +4,7 @@
  * SPDX-License-Identifier: Apache-2.0
  */

-// This is LVGL RGB565 simple fill for ESP32 processor
+// This is LVGL RGB565 simple fill for ESP32S3 processor

.section .text
.align 4
@@ -31,7 +31,8 @@ lv_color_blend_to_rgb565_esp:

-    entry a1, 32
+    entry a1, 32
+    ee.zero.q q0                            // dummy TIE instruction, to enable the TIE

     l32i.n a3, a2, 4                        // a3 - dest_buff
     l32i.n a4, a2, 8                        // a4 - dest_w in uint16_t
@@ -63,6 +64,261 @@ lv_color_blend_to_rgb565_esp:
     and a10, a10, a13
     or a10, a10, a12                        // a10 = 32-bit color (16bit + (16bit << 16))

+    // Check for short lengths
+    // dest_w should be at least 16, otherwise it's not worth using the esp32s3 TIE
+    bgei a4, 16, _esp32s3_implementation    // Branch if dest_w is greater than or equal to 16
+    j .lv_color_blend_to_rgb565_esp32_body  // Jump to the esp32 implementation
+
+    _esp32s3_implementation:
+
+    ee.movi.32.q q0, a10, 0                 // fill q0 register from a10 by 32 bits
+    ee.movi.32.q q0, a10, 1
+    ee.movi.32.q q0, a10, 2
+    ee.movi.32.q q0, a10, 3
+
+    // Check dest_buff alignment
+    movi.n a7, 0xf                          // 0xf alignment mask (16-byte alignment)
+    and a15, a7, a3                         // 16-byte alignment mask AND dest_buff pointer
+    bnez a15, _unaligned_by_4byte           // branch if a15 is not zero
+
+    // Check dest_stride alignment
+    and a15, a7, a6                         // 16-byte alignment mask AND dest_stride
+    bnez a15, _unaligned_by_4byte           // branch if a15 is not zero
+
+    // Check dest_w_bytes alignment
+    and a15, a7, a11                        // 16-byte alignment mask AND dest_w_bytes
+    bnez a15, _unaligned_by_4byte           // branch if a15 is not zero
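The three alignment checks above select among the store paths that follow: a fully aligned ideal case, a 4-byte aligned case, and a byte-unaligned case. As an illustrative C sketch of the same decision (not part of the patch; the function name and types are invented):

```c
#include <stdint.h>

// Rough C model of the path selection: dest_w below 16 pixels falls back to
// the plain esp32 body, full 16-byte alignment takes the ideal path, 4-byte
// alignment of pointer and stride takes the middle path, anything else the
// byte-unaligned path.
static int pick_fill_path(const void *dest_buff, int32_t dest_w,
                          int32_t dest_stride, int32_t dest_w_bytes)
{
    if (dest_w < 16) {
        return 0;       // esp32 body
    }
    uintptr_t all = (uintptr_t)dest_buff | (uintptr_t)dest_stride | (uintptr_t)dest_w_bytes;
    if ((all & 0xf) == 0) {
        return 16;      // 16-byte aligned path
    }
    if ((((uintptr_t)dest_buff | (uintptr_t)dest_stride) & 0x3) == 0) {
        return 4;       // 4-byte aligned path
    }
    return 1;           // byte-unaligned path
}
```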
+
+//**********************************************************************************************************************
+
+    // all aligned, the most ideal case
+
+    // dest_buff (a3) - 16-byte aligned
+    // dest_stride (a6) - 16-byte multiple
+    // dest_w (a4) - 16-byte multiple
+
+    srli a9, a4, 3                          // a9 - loop_len = dest_w / 8
+    sub a6, a6, a11                         // dest_stride = dest_stride - dest_w_bytes
+
+    .outer_loop_aligned:
+
+    loopnez a9, ._main_loop_aligned         // 16 bytes (8 rgb565) in one loop
+    ee.vst.128.ip q0, a3, 16                // store 16 bytes from q0 to dest_buff a3
+    ._main_loop_aligned:
+
+    add a3, a3, a6                          // dest_buff + dest_stride
+    addi.n a5, a5, -1                       // decrease the outer loop counter
+    bnez a5, .outer_loop_aligned
+
+    movi.n a2, 1                            // return LV_RESULT_OK = 1
+    retw.n                                  // return
+
+    _unaligned_by_4byte:
+
+    // Check dest_buff alignment
+    movi.n a7, 0x3                          // 0x3 alignment mask (4-byte alignment)
+    and a15, a7, a3                         // 4-byte alignment mask AND dest_buff pointer
+    bnez a15, _unaligned_by_1byte           // branch if a15 is not zero
+
+    // Check dest_stride alignment
+    and a15, a7, a6                         // 4-byte alignment mask AND dest_stride pointer
+    bnez a15, _unaligned_by_1byte           // branch if a15 is not zero
+
+//**********************************************************************************************************************
+
+    // either dest_buff or dest_stride is not 16-byte aligned
+    // dest_w is always a 4-byte multiple
+    // all of the following are 4-byte aligned
+
+    // dest_buff (a3) - 16-byte, or 4-byte aligned
+    // dest_stride (a6) - 16-byte, or 4-byte multiple
+    // dest_w (a4) - 4-byte multiple
+
+    sub a6, a6, a11                         // dest_stride = dest_stride - dest_w_bytes
+    movi.n a7, 0xf                          // 0xf alignment mask
+
+    .outer_loop_aligned_by_4byte:
+
+    // alignment check
+    and a15, a7, a3                         // 0xf (alignment mask) AND dest_buff pointer
+    mov a12, a11                            // a12 - local_dest_w_bytes = dest_w_bytes
+    beqz a15, _dest_buff_aligned_by_4byte   // branch if a15 is zero
+
+    movi.n a14, 16                          // a14 = 16
+    sub a15, a14, a15                       // a15 = 16 - unalignment (lower 4 bits of dest_buff address)
+    sub a12, a12, a15                       // local_dest_w_bytes = dest_w_bytes - (16 - unalignment)
+
+    // keep storing until dest_buff is aligned
+    // Check bit 3 of the unalignment; if set, store 8 bytes
+    bbci a15, 3, _aligning_mod_8_check_4byte    // branch if bit 3 of unalignment a15 is clear
+    s32i.n a10, a3, 0                       // save 32 bits from a10 to dest_buff a3, offset 0 bytes
+    s32i.n a10, a3, 4                       // save 32 bits from a10 to dest_buff a3, offset 4 bytes
+    addi.n a3, a3, 8                        // increment dest_buff pointer by 8 bytes
+    _aligning_mod_8_check_4byte:
+
+    // Check bit 2 of the unalignment; if set, store 4 bytes
+    bbci a15, 2, _aligning_mod_4_check_4byte    // branch if bit 2 of unalignment a15 is clear
+    s32i.n a10, a3, 0                       // save 32 bits from a10 to dest_buff a3, offset 0 bytes
+    addi.n a3, a3, 4                        // increment dest_buff pointer by 4 bytes
+    _aligning_mod_4_check_4byte:
+
+    _dest_buff_aligned_by_4byte:
+    // Calculate main loop_len
+    srli a9, a12, 4                         // a9 - loop_len = local_dest_w_bytes / 16
+
+    // Main loop
+    loopnez a9, ._main_loop_unaligned_by_4byte  // 16 bytes (8 rgb565) in one loop
+    ee.vst.128.ip q0, a3, 16                // store 16 bytes from q0 to dest_buff a3
+    ._main_loop_unaligned_by_4byte:
+
+    // Check bit 3 of local_dest_w_bytes; if set, store 8 bytes
+    bbci a12, 3, _aligned_mod_8_check_4byte // branch if bit 3 of local_dest_w_bytes a12 is clear
+    ee.vst.l.64.ip q0, a3, 8                // save lower 64 bits from q0 to dest_buff a3, increment dest_buff pointer by 8 bytes
+    _aligned_mod_8_check_4byte:
+
+    // Check bit 2 of local_dest_w_bytes; if set, store 4 bytes
+    bbci a12, 2, _aligned_mod_4_check_4byte // branch if bit 2 of local_dest_w_bytes a12 is clear
+    s32i.n a10, a3, 0                       // save 32 bits from a10 to dest_buff a3, offset 0 bytes
+    addi.n a3, a3, 4                        // increment dest_buff pointer by 4 bytes
+    _aligned_mod_4_check_4byte:
+
+    // Check bit 1 of local_dest_w_bytes; if set, store 2 bytes
+    bbci a12, 1, _aligned_mod_2_check_4byte // branch if bit 1 of local_dest_w_bytes a12 is clear
+    s16i a10, a3, 0                         // save 16 bits from a10 to dest_buff a3, offset 0 bytes
+    addi.n a3, a3, 2                        // increment dest_buff pointer by 2 bytes
+    _aligned_mod_2_check_4byte:
+
+    add a3, a3, a6                          // dest_buff + dest_stride
+    addi.n a5, a5, -1                       // decrease the outer loop counter
+    bnez a5, .outer_loop_aligned_by_4byte
+
+    movi.n a2, 1                            // return LV_RESULT_OK = 1
+    retw.n                                  // return
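The 4-byte path above follows a classic head/body/tail pattern: small stores until the pointer reaches a 16-byte boundary, `ee.vst.128.ip` for the bulk, then mod-8/4/2 stores for the remainder. A rough C model of one row, with `memcpy` standing in for the TIE stores (a sketch under those assumptions, not the shipped code):

```c
#include <stdint.h>
#include <string.h>

// C model of one row in the 4-byte path. Assumes w_bytes >= 16, which the
// dispatch guarantees. Because dest is at least 4-byte aligned here, the
// head length is a multiple of 4, so the 4-byte-periodic pattern stays in
// phase across the head/body/tail stores.
static void fill_row_model(uint8_t *dest, int32_t w_bytes, uint32_t color32)
{
    uint8_t pattern[16];
    for (int i = 0; i < 4; i++) {
        memcpy(pattern + 4 * i, &color32, 4);       // replicate the color, like q0
    }

    int head = (int)((uintptr_t)dest & 0xf);
    if (head != 0) {
        head = 16 - head;                           // bytes to the next 16-byte boundary
        memcpy(dest, pattern, (size_t)head);        // head: the s32i/s16i stores
        dest += head;
        w_bytes -= head;
    }
    for (int32_t i = 0; i < w_bytes / 16; i++) {    // main loop: ee.vst.128.ip
        memcpy(dest, pattern, 16);
        dest += 16;
    }
    memcpy(dest, pattern, (size_t)(w_bytes & 0xf)); // tail: mod-8/4/2 stores
}
```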
+
+    _unaligned_by_1byte:
+
+//**********************************************************************************************************************
+
+    // either dest_buff or dest_stride is not 4-byte aligned
+    // dest_w is always a 4-byte multiple
+
+    // dest_buff (a3) - 4-byte, or 1-byte aligned
+    // dest_stride (a6) - 4-byte, or 1-byte multiple
+    // dest_w (a4) - 4-byte multiple
+
+    ee.zero.q q1                            // clear q1
+    ee.orq q1, q1, q0                       // copy q0 to q1
+    sub a6, a6, a11                         // dest_stride = dest_stride - dest_w_bytes
+    movi.n a7, 0xf                          // 0xf alignment mask
+
+    .outer_loop_aligned_by_1byte:
+
+    // alignment check
+    and a15, a7, a3                         // 0xf (alignment mask) AND dest_buff pointer
+    mov a12, a11                            // a12 - local_dest_w_bytes = dest_w_bytes
+    beqz a15, _dest_buff_aligned_by_1byte   // branch if a15 is zero
+
+    movi.n a14, 16                          // a14 = 16
+    sub a15, a14, a15                       // a15 = 16 - unalignment (lower 4 bits of dest_buff address)
+    sub a12, a12, a15                       // local_dest_w_bytes = dest_w_bytes - (16 - unalignment)
+
+    // keep storing until dest_buff is aligned
+    // Check bit 3 of the unalignment; if set, store 8 bytes
+    bbci a15, 3, _aligning_mod_8_check_1byte    // branch if bit 3 of unalignment a15 is clear
+    s32i.n a10, a3, 0                       // save 32 bits from a10 to dest_buff a3, offset 0 bytes
+    s32i.n a10, a3, 4                       // save 32 bits from a10 to dest_buff a3, offset 4 bytes
+    addi.n a3, a3, 8                        // increment dest_buff pointer by 8 bytes
+    _aligning_mod_8_check_1byte:
+
+    // Check bit 2 of the unalignment; if set, store 4 bytes
+    bbci a15, 2, _aligning_mod_4_check_1byte    // branch if bit 2 of unalignment a15 is clear
+    s32i.n a10, a3, 0                       // save 32 bits from a10 to dest_buff a3, offset 0 bytes
+    addi.n a3, a3, 4                        // increment dest_buff pointer by 4 bytes
+    _aligning_mod_4_check_1byte:
+
+    // Check bits 1 and 0 of the unalignment
+    // modulo 2 and modulo 1 require the same action
+    bbci a15, 1, _aligning_mod_2_check_1byte    // branch if bit 1 of unalignment a15 is clear
+    s16i a10, a3, 0                         // save 16 bits from a10 to dest_buff a3, offset 0 bytes
+    addi.n a3, a3, 2                        // increment dest_buff pointer by 2 bytes
+    _aligning_mod_2_check_1byte:
+
+    bbci a15, 0, _dest_buff_aligned_by_1byte    // branch if bit 0 of unalignment a15 is clear
+    s16i a10, a3, 0                         // save 16 bits from a10 to dest_buff a3, offset 0 bytes
+    addi.n a3, a3, 2                        // increment dest_buff pointer by 2 bytes
+    _dest_buff_aligned_by_1byte:
+
+    // Shift the q register, allowing stores to 16-byte unaligned data
+    wur.sar_byte a15                        // apply the unalignment to SAR_BYTE
+    ee.src.q q2, q0, q1                     // shift the concatenation of q0 and q1 into q2 by the SAR_BYTE amount
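The `wur.sar_byte`/`ee.src.q` pair is what keeps the byte-unaligned case cheap: rather than issuing unaligned stores, the fill pattern itself is shifted so that aligned 16-byte stores land the pixels with the right phase. A sketch of the assumed semantics (since q0 and q1 hold the same pattern, the shift reduces to a byte rotation):

```c
#include <stdint.h>
#include <string.h>

// Assumed model of wur.sar_byte + ee.src.q: q2 becomes a 16-byte window
// into the 32-byte concatenation of two q registers, offset by the byte
// unalignment, so the subsequent aligned ee.vst.128.ip stores emit the
// pattern with the correct phase.
static void make_shifted_pattern(uint8_t q2[16], const uint8_t q0[16], unsigned sar_byte)
{
    uint8_t concat[32];
    memcpy(concat, q0, 16);                     // q0
    memcpy(concat + 16, q0, 16);                // q1 (same pattern as q0)
    memcpy(q2, concat + (sar_byte & 0xf), 16);  // rotate by the unalignment
}
```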
+
+    // Calculate main loop_len
+    srli a9, a12, 4                         // a9 - loop_len = local_dest_w_bytes / 16
+
+    // Main loop
+    loopnez a9, ._main_loop_unaligned_by_1byte  // 16 bytes (8 rgb565) in one loop
+    ee.vst.128.ip q2, a3, 16                // store 16 bytes from q2 to dest_buff a3
+    ._main_loop_unaligned_by_1byte:
+
+    // First check mod 2 and mod 1 - correcting the aligned memory access
+    // Step back 4 bytes, allowing corrections after the aligned ee.vst.128.ip access
+    addi a3, a3, -4
+
+    // Check bit 1 of local_dest_w_bytes; if set, store 2 bytes
+    // set SSSS in 0xSSSS0000
+    bbci a12, 1, _aligned_mod_2_check_1byte_corr    // branch if bit 1 of local_dest_w_bytes a12 is clear
+    srli a14, a10, 16                       // shift a10 right by 16, allowing s16i (saving of the lower 16 bits)
+    s16i a14, a3, 2                         // save 16 bits from a14 to dest_buff a3, offset 2 bytes
+
+    // Check bit 0 of local_dest_w_bytes; if set, store 1 byte
+    // additionally set SS in 0x0000SS00
+    bbci a12, 0, _aligned_end               // branch if bit 0 of local_dest_w_bytes a12 is clear
+    srli a14, a10, 8                        // shift a10 right by 8, allowing s8i
+    s8i a14, a3, 1                          // save 8 bits from a14 to dest_buff a3, offset 1 byte
+    j _aligned_end
+    _aligned_mod_2_check_1byte_corr:
+
+    // Check bit 0 of local_dest_w_bytes; if set, store 1 byte
+    // set SS in 0xSS000000
+    bbci a12, 0, _aligned_end               // branch if bit 0 of local_dest_w_bytes a12 is clear
+    srli a14, a10, 24                       // shift a10 right by 24, allowing s8i (saving of the lower 8 bits)
+    s8i a14, a3, 3                          // save 8 bits from a14 to dest_buff a3, offset 3 bytes
+    _aligned_end:
+
+    addi a3, a3, 4                          // move the pointer forward again, correcting the addi a3, a3, -4 above
+
+    // Check bit 3 of local_dest_w_bytes; if set, store 8 bytes
+    bbci a12, 3, _aligned_mod_8_check_1byte // branch if bit 3 of local_dest_w_bytes a12 is clear
+    s32i.n a10, a3, 0                       // save 32 bits from a10 to dest_buff a3, offset 0 bytes
+    s32i.n a10, a3, 4                       // save 32 bits from a10 to dest_buff a3, offset 4 bytes
+    addi.n a3, a3, 8                        // increment dest_buff pointer by 8 bytes
+    _aligned_mod_8_check_1byte:
+
+    // Check bit 2 of local_dest_w_bytes; if set, store 4 bytes
+    bbci a12, 2, _aligned_mod_4_check_1byte // branch if bit 2 of local_dest_w_bytes a12 is clear
+    s32i.n a10, a3, 0                       // save 32 bits from a10 to dest_buff a3, offset 0 bytes
+    addi.n a3, a3, 4                        // increment dest_buff pointer by 4 bytes
+    _aligned_mod_4_check_1byte:
+
+    // Check bit 1 of local_dest_w_bytes; if set, store 2 bytes
+    bbci a12, 1, _aligned_mod_2_check_1byte // branch if bit 1 of local_dest_w_bytes a12 is clear
+    s16i a10, a3, 0                         // save 16 bits from a10 to dest_buff a3, offset 0 bytes
+    addi.n a3, a3, 2                        // increment dest_buff pointer by 2 bytes
+    _aligned_mod_2_check_1byte:
+
+    add a3, a3, a6                          // dest_buff + dest_stride
+    addi.n a5, a5, -1                       // decrease the outer loop counter
+    bnez a5, .outer_loop_aligned_by_1byte
+
+    movi.n a2, 1                            // return LV_RESULT_OK = 1
+    retw.n                                  // return
+
+    .lv_color_blend_to_rgb565_esp32_body:
+
     movi.n a8, 0x3                          // a8 = 0x3, dest_buff align mask
     sub a6, a6, a11                         // dest_stride = dest_stride - dest_w_bytes
diff --git a/components/esp_lvgl_port/test_apps/simd/README.md b/components/esp_lvgl_port/test_apps/simd/README.md
index 7c579c4f..d319e2e3 100644
--- a/components/esp_lvgl_port/test_apps/simd/README.md
+++ b/components/esp_lvgl_port/test_apps/simd/README.md
@@ -4,6 +4,17 @@ Test app accommodates two types of tests: [`functionality test`](#Functionality-
 Assembly source files could be found in the [`lvgl_port`](../../src/lvgl9/simd/) component.
 Header file with the assembly function prototypes is provided into the LVGL using Kconfig option `LV_DRAW_SW_ASM_CUSTOM_INCLUDE` and can be found in the [`lvgl_port/include`](../../include/esp_lvgl_port_lv_blend.h)
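For orientation, `LV_DRAW_SW_ASM_CUSTOM_INCLUDE` works by letting the named header override LVGL's software-blend macros. That header is not shown in this patch, but the hook for the routine added here plausibly looks like the following sketch (the macro name follows LVGL9's `lv_draw_sw_blend` conventions; the exact shape of the override is an assumption):

```c
// Hypothetical excerpt in the spirit of esp_lvgl_port_lv_blend.h:
// LVGL's RGB565 software-blend path invokes this macro first; returning
// LV_RESULT_OK (1, as the assembly above does) marks the fill as handled,
// while LV_RESULT_INVALID makes LVGL fall back to its ANSI C loop.
extern int lv_color_blend_to_rgb565_esp(void *dsc);

#define LV_DRAW_SW_COLOR_BLEND_TO_RGB565(dsc) \
    lv_color_blend_to_rgb565_esp(dsc)
```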
+
+## Benchmark results
+
+| Color format | Matrix size | Memory alignment | ASM version (cycles per sample) | ANSI C version (cycles per sample) |
+| :----------- | :---------- | :--------------- | :------------------------------ | :--------------------------------- |
+| ARGB8888     | 128x128     | 16-byte          | 0.327                           | 1.600                               |
+|              | 127x127     | 1-byte           | 0.488                           | 1.597                               |
+| RGB565       | 128x128     | 16-byte          | 0.196                           | 1.146                               |
+|              | 127x127     | 1-byte           | 0.497                           | 1.124                               |
+* The data was obtained by running the [benchmark tests](#benchmark-test) on a 128x128, 16-byte aligned matrix (ideal case) and on a 127x127, 1-byte aligned matrix (worst case)
+* The values represent CPU cycles per sample needed to perform a simple fill of the matrix on the ESP32-S3
+
 ## Functionality test
 * Tests, whether the HW accelerated assembly version of an LVGL function provides the same results as the ANSI version
 * A top-level flow of the functionality test:
diff --git a/components/esp_lvgl_port/test_apps/simd/main/test_lv_fill_functionality.c b/components/esp_lvgl_port/test_apps/simd/main/test_lv_fill_functionality.c
index 972f8edf..5bf29558 100644
--- a/components/esp_lvgl_port/test_apps/simd/main/test_lv_fill_functionality.c
+++ b/components/esp_lvgl_port/test_apps/simd/main/test_lv_fill_functionality.c
@@ -126,9 +126,9 @@ TEST_CASE("Test fill functionality ARGB8888", "[fill][functionality][ARGB8888]")
 TEST_CASE("Test fill functionality RGB565", "[fill][functionality][RGB565]")
 {
     test_matrix_params_t test_matrix = {
-        .min_w = 8,             // 8 is the lower limit for the esp32s3 asm implementation, otherwise esp32 is executed
+        .min_w = 16,            // 16 is the lower limit for the esp32s3 asm implementation, otherwise esp32 is executed
         .min_h = 1,
-        .max_w = 16,
+        .max_w = 32,
         .max_h = 16,
         .min_unalign_byte = 0,
         .max_unalign_byte = 16,
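A note on the benchmark figures added to the README above: cycles per sample is presumably measured by wrapping the fill call in a CPU cycle counter, along these lines (a minimal sketch using ESP-IDF's `esp_cpu_get_cycle_count()`; the `fill_fn_t` signature is invented for illustration):

```c
#include <stdint.h>
#include "esp_cpu.h"

// Assumed measurement methodology; the real harness lives in the
// benchmark test app.
typedef void (*fill_fn_t)(uint16_t *dest, int32_t w, int32_t h,
                          int32_t stride, uint16_t color);

static float cycles_per_sample(fill_fn_t fill, uint16_t *dest, int32_t w,
                               int32_t h, int32_t stride, uint16_t color)
{
    uint32_t start = esp_cpu_get_cycle_count();
    fill(dest, w, h, stride, color);
    uint32_t end = esp_cpu_get_cycle_count();
    return (float)(end - start) / (float)(w * h);   // one sample = one pixel
}
```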