From 1a4a89b090f9e3eb234c29fa227bbba2df9f2438 Mon Sep 17 00:00:00 2001
From: pborcin
Date: Fri, 8 Nov 2024 16:21:57 +0100
Subject: [PATCH] add rgb565 acceleration

---
 .../simd/lv_color_blend_to_argb8888_esp32s3.S |   4 +-
 .../simd/lv_color_blend_to_rgb565_esp32s3.S   | 260 +++++++++++++++++-
 .../esp_lvgl_port/test_apps/simd/README.md    |  11 +
 .../simd/main/test_lv_fill_functionality.c    |   4 +-
 4 files changed, 272 insertions(+), 7 deletions(-)

diff --git a/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_argb8888_esp32s3.S b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_argb8888_esp32s3.S
index 4d9f84f1..bb3956e6 100644
--- a/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_argb8888_esp32s3.S
+++ b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_argb8888_esp32s3.S
@@ -184,8 +184,6 @@ lv_color_blend_to_argb8888_esp:
     // dest_w (a4) - 4-byte multiple

-    mov a13, a3
-
     ee.zero.q q1                            // clear q1
     ee.orq q1, q1, q0                       // copy q0 to q1
     sub a6, a6, a11                         // dest_stride = dest_stride - dest_w_bytes
@@ -242,7 +240,7 @@ lv_color_blend_to_argb8888_esp:
     ee.vst.128.ip q2, a3, 16                // store 16 bytes from q0 to dest_buff a3
     ._main_loop_unaligned_by_1byte:

-    // Firstly check mod 0 and mod 1 - correcting the aligned memory access
+    // First check mod 2 and mod 1 - correcting the aligned memory access
     // Go back in one Byte, allow to correct after ee.vst.128.ip aligned access
     addi a3, a3, -4

diff --git a/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb565_esp32s3.S b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb565_esp32s3.S
index 07b5aa11..ee9f8a9c 100644
--- a/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb565_esp32s3.S
+++ b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb565_esp32s3.S
@@ -4,7 +4,7 @@
  * SPDX-License-Identifier: Apache-2.0
  */

-// This is LVGL RGB565 simple fill for ESP32 processor
+// This is LVGL RGB565 simple fill for ESP32S3 processor

.section .text
.align 4
@@ -31,7 +31,8 @@ lv_color_blend_to_rgb565_esp:

-    entry a1, 32
+    entry a1, 32
+    ee.zero.q q0                            // dummy TIE instruction, to enable the TIE

     l32i.n a3, a2, 4                        // a3 - dest_buff
     l32i.n a4, a2, 8                        // a4 - dest_w in uint16_t
@@ -63,6 +64,261 @@ lv_color_blend_to_rgb565_esp:
     and a10, a10, a13
     or a10, a10, a12                        // a10 = 32-bit color (16bit + (16bit << 16))

+    // Check for short lengths
+    // dest_w should be at least 16, otherwise it's not worth using the esp32s3 TIE
+    bgei a4, 16, _esp32s3_implementation    // Branch if dest_w is greater than or equal to 16
+    j .lv_color_blend_to_rgb565_esp32_body  // Jump to the esp32 implementation
+
+    _esp32s3_implementation:
+
+    ee.movi.32.q q0, a10, 0                 // fill q0 register from a10 by 32 bits
+    ee.movi.32.q q0, a10, 1
+    ee.movi.32.q q0, a10, 2
+    ee.movi.32.q q0, a10, 3
+
+    // Check dest_buff alignment
+    movi.n a7, 0xf                          // 0xf alignment mask (16-byte alignment)
+    and a15, a7, a3                         // 16-byte alignment mask AND dest_buff pointer
+    bnez a15, _unaligned_by_4byte           // branch if a15 is not zero
+
+    // Check dest_stride alignment
+    and a15, a7, a6                         // 16-byte alignment mask AND dest_stride
+    bnez a15, _unaligned_by_4byte           // branch if a15 is not zero
+
+    // Check dest_w_bytes alignment
+    and a15, a7, a11                        // 16-byte alignment mask AND dest_w_bytes
+    bnez a15, _unaligned_by_4byte           // branch if a15 is not zero
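The three alignment checks above select among the store paths that follow: a fully aligned ideal case, a 4-byte aligned case, and a byte-unaligned case. As an illustrative C sketch of the same decision (not part of the patch; the function name and types are invented):

```c
#include <stdint.h>

// Rough C model of the path selection: dest_w below 16 pixels falls back to
// the plain esp32 body, full 16-byte alignment takes the ideal path, 4-byte
// alignment of pointer and stride takes the middle path, anything else the
// byte-unaligned path.
static int pick_fill_path(const void *dest_buff, int32_t dest_w,
                          int32_t dest_stride, int32_t dest_w_bytes)
{
    if (dest_w < 16) {
        return 0;       // esp32 body
    }
    uintptr_t all = (uintptr_t)dest_buff | (uintptr_t)dest_stride | (uintptr_t)dest_w_bytes;
    if ((all & 0xf) == 0) {
        return 16;      // 16-byte aligned path
    }
    if ((((uintptr_t)dest_buff | (uintptr_t)dest_stride) & 0x3) == 0) {
        return 4;       // 4-byte aligned path
    }
    return 1;           // byte-unaligned path
}
```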
+
+//**********************************************************************************************************************
+
+    // all aligned, the most ideal case
+
+    // dest_buff (a3) - 16-byte aligned
+    // dest_stride (a6) - 16-byte multiple
+    // dest_w (a4) - 16-byte multiple
+
+    srli a9, a4, 3                          // a9 - loop_len = dest_w / 8
+    sub a6, a6, a11                         // dest_stride = dest_stride - dest_w_bytes
+
+    .outer_loop_aligned:
+
+    loopnez a9, ._main_loop_aligned         // 16 bytes (8 rgb565) in one loop
+    ee.vst.128.ip q0, a3, 16                // store 16 bytes from q0 to dest_buff a3
+    ._main_loop_aligned:
+
+    add a3, a3, a6                          // dest_buff + dest_stride
+    addi.n a5, a5, -1                       // decrease the outer loop counter
+    bnez a5, .outer_loop_aligned
+
+    movi.n a2, 1                            // return LV_RESULT_OK = 1
+    retw.n                                  // return
+
+    _unaligned_by_4byte:
+
+    // Check dest_buff alignment
+    movi.n a7, 0x3                          // 0x3 alignment mask (4-byte alignment)
+    and a15, a7, a3                         // 4-byte alignment mask AND dest_buff pointer
+    bnez a15, _unaligned_by_1byte           // branch if a15 is not zero
+
+    // Check dest_stride alignment
+    and a15, a7, a6                         // 4-byte alignment mask AND dest_stride pointer
+    bnez a15, _unaligned_by_1byte           // branch if a15 is not zero
+
+//**********************************************************************************************************************
+
+    // either dest_buff or dest_stride is not 16-byte aligned
+    // dest_w is always a 4-byte multiple
+    // all of the following are 4-byte aligned
+
+    // dest_buff (a3) - 16-byte, or 4-byte aligned
+    // dest_stride (a6) - 16-byte, or 4-byte multiple
+    // dest_w (a4) - 4-byte multiple
+
+    sub a6, a6, a11                         // dest_stride = dest_stride - dest_w_bytes
+    movi.n a7, 0xf                          // 0xf alignment mask
+
+    .outer_loop_aligned_by_4byte:
+
+    // alignment check
+    and a15, a7, a3                         // 0xf (alignment mask) AND dest_buff pointer
+    mov a12, a11                            // a12 - local_dest_w_bytes = dest_w_bytes
+    beqz a15, _dest_buff_aligned_by_4byte   // branch if a15 is zero
+
+    movi.n a14, 16                          // a14 = 16
+    sub a15, a14, a15                       // a15 = 16 - unalignment (lower 4 bits of dest_buff address)
+    sub a12, a12, a15                       // local_dest_w_bytes = dest_w_bytes - (16 - unalignment)
+
+    // keep storing until dest_buff is aligned
+    // Check bit 3 of the unalignment; if set, store 8 bytes
+    bbci a15, 3, _aligning_mod_8_check_4byte    // branch if bit 3 of unalignment a15 is clear
+    s32i.n a10, a3, 0                       // save 32 bits from a10 to dest_buff a3, offset 0 bytes
+    s32i.n a10, a3, 4                       // save 32 bits from a10 to dest_buff a3, offset 4 bytes
+    addi.n a3, a3, 8                        // increment dest_buff pointer by 8 bytes
+    _aligning_mod_8_check_4byte:
+
+    // Check bit 2 of the unalignment; if set, store 4 bytes
+    bbci a15, 2, _aligning_mod_4_check_4byte    // branch if bit 2 of unalignment a15 is clear
+    s32i.n a10, a3, 0                       // save 32 bits from a10 to dest_buff a3, offset 0 bytes
+    addi.n a3, a3, 4                        // increment dest_buff pointer by 4 bytes
+    _aligning_mod_4_check_4byte:
+
+    _dest_buff_aligned_by_4byte:
+    // Calculate main loop_len
+    srli a9, a12, 4                         // a9 - loop_len = local_dest_w_bytes / 16
+
+    // Main loop
+    loopnez a9, ._main_loop_unaligned_by_4byte  // 16 bytes (8 rgb565) in one loop
+    ee.vst.128.ip q0, a3, 16                // store 16 bytes from q0 to dest_buff a3
+    ._main_loop_unaligned_by_4byte:
+
+    // Check bit 3 of local_dest_w_bytes; if set, store 8 bytes
+    bbci a12, 3, _aligned_mod_8_check_4byte // branch if bit 3 of local_dest_w_bytes a12 is clear
+    ee.vst.l.64.ip q0, a3, 8                // save lower 64 bits from q0 to dest_buff a3, increment dest_buff pointer by 8 bytes
+    _aligned_mod_8_check_4byte:
+
+    // Check bit 2 of local_dest_w_bytes; if set, store 4 bytes
+    bbci a12, 2, _aligned_mod_4_check_4byte // branch if bit 2 of local_dest_w_bytes a12 is clear
+    s32i.n a10, a3, 0                       // save 32 bits from a10 to dest_buff a3, offset 0 bytes
+    addi.n a3, a3, 4                        // increment dest_buff pointer by 4 bytes
+    _aligned_mod_4_check_4byte:
+
+    // Check bit 1 of local_dest_w_bytes; if set, store 2 bytes
+    bbci a12, 1, _aligned_mod_2_check_4byte // branch if bit 1 of local_dest_w_bytes a12 is clear
+    s16i a10, a3, 0                         // save 16 bits from a10 to dest_buff a3, offset 0 bytes
+    addi.n a3, a3, 2                        // increment dest_buff pointer by 2 bytes
+    _aligned_mod_2_check_4byte:
+
+    add a3, a3, a6                          // dest_buff + dest_stride
+    addi.n a5, a5, -1                       // decrease the outer loop counter
+    bnez a5, .outer_loop_aligned_by_4byte
+
+    movi.n a2, 1                            // return LV_RESULT_OK = 1
+    retw.n                                  // return
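The 4-byte path above follows a classic head/body/tail pattern: small stores until the pointer reaches a 16-byte boundary, `ee.vst.128.ip` for the bulk, then mod-8/4/2 stores for the remainder. A rough C model of one row, with `memcpy` standing in for the TIE stores (a sketch under those assumptions, not the shipped code):

```c
#include <stdint.h>
#include <string.h>

// C model of one row in the 4-byte path. Assumes w_bytes >= 16, which the
// dispatch guarantees. Because dest is at least 4-byte aligned here, the
// head length is a multiple of 4, so the 4-byte-periodic pattern stays in
// phase across the head/body/tail stores.
static void fill_row_model(uint8_t *dest, int32_t w_bytes, uint32_t color32)
{
    uint8_t pattern[16];
    for (int i = 0; i < 4; i++) {
        memcpy(pattern + 4 * i, &color32, 4);       // replicate the color, like q0
    }

    int head = (int)((uintptr_t)dest & 0xf);
    if (head != 0) {
        head = 16 - head;                           // bytes to the next 16-byte boundary
        memcpy(dest, pattern, (size_t)head);        // head: the s32i/s16i stores
        dest += head;
        w_bytes -= head;
    }
    for (int32_t i = 0; i < w_bytes / 16; i++) {    // main loop: ee.vst.128.ip
        memcpy(dest, pattern, 16);
        dest += 16;
    }
    memcpy(dest, pattern, (size_t)(w_bytes & 0xf)); // tail: mod-8/4/2 stores
}
```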
+
+    _unaligned_by_1byte:
+
+//**********************************************************************************************************************
+
+    // either dest_buff or dest_stride is not 4-byte aligned
+    // dest_w is always a 4-byte multiple
+
+    // dest_buff (a3) - 4-byte, or 1-byte aligned
+    // dest_stride (a6) - 4-byte, or 1-byte multiple
+    // dest_w (a4) - 4-byte multiple
+
+    ee.zero.q q1                            // clear q1
+    ee.orq q1, q1, q0                       // copy q0 to q1
+    sub a6, a6, a11                         // dest_stride = dest_stride - dest_w_bytes
+    movi.n a7, 0xf                          // 0xf alignment mask
+
+    .outer_loop_aligned_by_1byte:
+
+    // alignment check
+    and a15, a7, a3                         // 0xf (alignment mask) AND dest_buff pointer
+    mov a12, a11                            // a12 - local_dest_w_bytes = dest_w_bytes
+    beqz a15, _dest_buff_aligned_by_1byte   // branch if a15 is zero
+
+    movi.n a14, 16                          // a14 = 16
+    sub a15, a14, a15                       // a15 = 16 - unalignment (lower 4 bits of dest_buff address)
+    sub a12, a12, a15                       // local_dest_w_bytes = dest_w_bytes - (16 - unalignment)
+
+    // keep storing until dest_buff is aligned
+    // Check bit 3 of the unalignment; if set, store 8 bytes
+    bbci a15, 3, _aligning_mod_8_check_1byte    // branch if bit 3 of unalignment a15 is clear
+    s32i.n a10, a3, 0                       // save 32 bits from a10 to dest_buff a3, offset 0 bytes
+    s32i.n a10, a3, 4                       // save 32 bits from a10 to dest_buff a3, offset 4 bytes
+    addi.n a3, a3, 8                        // increment dest_buff pointer by 8 bytes
+    _aligning_mod_8_check_1byte:
+
+    // Check bit 2 of the unalignment; if set, store 4 bytes
+    bbci a15, 2, _aligning_mod_4_check_1byte    // branch if bit 2 of unalignment a15 is clear
+    s32i.n a10, a3, 0                       // save 32 bits from a10 to dest_buff a3, offset 0 bytes
+    addi.n a3, a3, 4                        // increment dest_buff pointer by 4 bytes
+    _aligning_mod_4_check_1byte:
+
+    // Check bits 1 and 0 of the unalignment
+    // modulo 2 and modulo 1 require the same action
+    bbci a15, 1, _aligning_mod_2_check_1byte    // branch if bit 1 of unalignment a15 is clear
+    s16i a10, a3, 0                         // save 16 bits from a10 to dest_buff a3, offset 0 bytes
+    addi.n a3, a3, 2                        // increment dest_buff pointer by 2 bytes
+    _aligning_mod_2_check_1byte:
+
+    bbci a15, 0, _dest_buff_aligned_by_1byte    // branch if bit 0 of unalignment a15 is clear
+    s16i a10, a3, 0                         // save 16 bits from a10 to dest_buff a3, offset 0 bytes
+    addi.n a3, a3, 2                        // increment dest_buff pointer by 2 bytes
+    _dest_buff_aligned_by_1byte:
+
+    // Shift the q register, allowing stores to 16-byte unaligned data
+    wur.sar_byte a15                        // apply the unalignment to SAR_BYTE
+    ee.src.q q2, q0, q1                     // shift the concatenation of q0 and q1 into q2 by the SAR_BYTE amount
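The `wur.sar_byte`/`ee.src.q` pair is what keeps the byte-unaligned case cheap: rather than issuing unaligned stores, the fill pattern itself is shifted so that aligned 16-byte stores land the pixels with the right phase. A sketch of the assumed semantics (since q0 and q1 hold the same pattern, the shift reduces to a byte rotation):

```c
#include <stdint.h>
#include <string.h>

// Assumed model of wur.sar_byte + ee.src.q: q2 becomes a 16-byte window
// into the 32-byte concatenation of two q registers, offset by the byte
// unalignment, so the subsequent aligned ee.vst.128.ip stores emit the
// pattern with the correct phase.
static void make_shifted_pattern(uint8_t q2[16], const uint8_t q0[16], unsigned sar_byte)
{
    uint8_t concat[32];
    memcpy(concat, q0, 16);                     // q0
    memcpy(concat + 16, q0, 16);                // q1 (same pattern as q0)
    memcpy(q2, concat + (sar_byte & 0xf), 16);  // rotate by the unalignment
}
```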
+
+    // Calculate main loop_len
+    srli a9, a12, 4                         // a9 - loop_len = local_dest_w_bytes / 16
+
+    // Main loop
+    loopnez a9, ._main_loop_unaligned_by_1byte  // 16 bytes (8 rgb565) in one loop
+    ee.vst.128.ip q2, a3, 16                // store 16 bytes from q2 to dest_buff a3
+    ._main_loop_unaligned_by_1byte:
+
+    // First check mod 2 and mod 1 - correcting the aligned memory access
+    // Step back 4 bytes, allowing corrections after the aligned ee.vst.128.ip access
+    addi a3, a3, -4
+
+    // Check bit 1 of local_dest_w_bytes; if set, store 2 bytes
+    // set SSSS in 0xSSSS0000
+    bbci a12, 1, _aligned_mod_2_check_1byte_corr    // branch if bit 1 of local_dest_w_bytes a12 is clear
+    srli a14, a10, 16                       // shift a10 right by 16, allowing s16i (saving of the lower 16 bits)
+    s16i a14, a3, 2                         // save 16 bits from a14 to dest_buff a3, offset 2 bytes
+
+    // Check bit 0 of local_dest_w_bytes; if set, store 1 byte
+    // additionally set SS in 0x0000SS00
+    bbci a12, 0, _aligned_end               // branch if bit 0 of local_dest_w_bytes a12 is clear
+    srli a14, a10, 8                        // shift a10 right by 8, allowing s8i
+    s8i a14, a3, 1                          // save 8 bits from a14 to dest_buff a3, offset 1 byte
+    j _aligned_end
+    _aligned_mod_2_check_1byte_corr:
+
+    // Check bit 0 of local_dest_w_bytes; if set, store 1 byte
+    // set SS in 0xSS000000
+    bbci a12, 0, _aligned_end               // branch if bit 0 of local_dest_w_bytes a12 is clear
+    srli a14, a10, 24                       // shift a10 right by 24, allowing s8i (saving of the lower 8 bits)
+    s8i a14, a3, 3                          // save 8 bits from a14 to dest_buff a3, offset 3 bytes
+    _aligned_end:
+
+    addi a3, a3, 4                          // move the pointer forward again, correcting the addi a3, a3, -4 above
+
+    // Check bit 3 of local_dest_w_bytes; if set, store 8 bytes
+    bbci a12, 3, _aligned_mod_8_check_1byte // branch if bit 3 of local_dest_w_bytes a12 is clear
+    s32i.n a10, a3, 0                       // save 32 bits from a10 to dest_buff a3, offset 0 bytes
+    s32i.n a10, a3, 4                       // save 32 bits from a10 to dest_buff a3, offset 4 bytes
+    addi.n a3, a3, 8                        // increment dest_buff pointer by 8 bytes
+    _aligned_mod_8_check_1byte:
+
+    // Check bit 2 of local_dest_w_bytes; if set, store 4 bytes
+    bbci a12, 2, _aligned_mod_4_check_1byte // branch if bit 2 of local_dest_w_bytes a12 is clear
+    s32i.n a10, a3, 0                       // save 32 bits from a10 to dest_buff a3, offset 0 bytes
+    addi.n a3, a3, 4                        // increment dest_buff pointer by 4 bytes
+    _aligned_mod_4_check_1byte:
+
+    // Check bit 1 of local_dest_w_bytes; if set, store 2 bytes
+    bbci a12, 1, _aligned_mod_2_check_1byte // branch if bit 1 of local_dest_w_bytes a12 is clear
+    s16i a10, a3, 0                         // save 16 bits from a10 to dest_buff a3, offset 0 bytes
+    addi.n a3, a3, 2                        // increment dest_buff pointer by 2 bytes
+    _aligned_mod_2_check_1byte:
+
+    add a3, a3, a6                          // dest_buff + dest_stride
+    addi.n a5, a5, -1                       // decrease the outer loop counter
+    bnez a5, .outer_loop_aligned_by_1byte
+
+    movi.n a2, 1                            // return LV_RESULT_OK = 1
+    retw.n                                  // return
+
+    .lv_color_blend_to_rgb565_esp32_body:
+
     movi.n a8, 0x3                          // a8 = 0x3, dest_buff align mask
     sub a6, a6, a11                         // dest_stride = dest_stride - dest_w_bytes
diff --git a/components/esp_lvgl_port/test_apps/simd/README.md b/components/esp_lvgl_port/test_apps/simd/README.md
index 7c579c4f..d319e2e3 100644
--- a/components/esp_lvgl_port/test_apps/simd/README.md
+++ b/components/esp_lvgl_port/test_apps/simd/README.md
@@ -4,6 +4,17 @@ Test app accommodates two types of tests: [`functionality test`](#Functionality-
 Assembly source files could be found in the [`lvgl_port`](../../src/lvgl9/simd/) component.
 Header file with the assembly function prototypes is provided into the LVGL using Kconfig option `LV_DRAW_SW_ASM_CUSTOM_INCLUDE` and can be found in the [`lvgl_port/include`](../../include/esp_lvgl_port_lv_blend.h)
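For orientation, `LV_DRAW_SW_ASM_CUSTOM_INCLUDE` works by letting the named header override LVGL's software-blend macros. That header is not shown in this patch, but the hook for the routine added here plausibly looks like the following sketch (the macro name follows LVGL9's `lv_draw_sw_blend` conventions; the exact shape of the override is an assumption):

```c
// Hypothetical excerpt in the spirit of esp_lvgl_port_lv_blend.h:
// LVGL's RGB565 software-blend path invokes this macro first; returning
// LV_RESULT_OK (1, as the assembly above does) marks the fill as handled,
// while LV_RESULT_INVALID makes LVGL fall back to its ANSI C loop.
extern int lv_color_blend_to_rgb565_esp(void *dsc);

#define LV_DRAW_SW_COLOR_BLEND_TO_RGB565(dsc) \
    lv_color_blend_to_rgb565_esp(dsc)
```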
+
+## Benchmark results
+
+| Color format | Matrix size | Memory alignment | ASM version (cycles per sample) | ANSI C version (cycles per sample) |
+| :----------- | :---------- | :--------------- | :------------------------------ | :--------------------------------- |
+| ARGB8888     | 128x128     | 16-byte          | 0.327                           | 1.600                               |
+|              | 127x127     | 1-byte           | 0.488                           | 1.597                               |
+| RGB565       | 128x128     | 16-byte          | 0.196                           | 1.146                               |
+|              | 127x127     | 1-byte           | 0.497                           | 1.124                               |
+* The data was obtained by running the [benchmark tests](#benchmark-test) on a 128x128, 16-byte aligned matrix (ideal case) and on a 127x127, 1-byte aligned matrix (worst case)
+* The values represent CPU cycles per sample needed to perform a simple fill of the matrix on the ESP32-S3
+
 ## Functionality test
 * Tests, whether the HW accelerated assembly version of an LVGL function provides the same results as the ANSI version
 * A top-level flow of the functionality test:
diff --git a/components/esp_lvgl_port/test_apps/simd/main/test_lv_fill_functionality.c b/components/esp_lvgl_port/test_apps/simd/main/test_lv_fill_functionality.c
index 972f8edf..5bf29558 100644
--- a/components/esp_lvgl_port/test_apps/simd/main/test_lv_fill_functionality.c
+++ b/components/esp_lvgl_port/test_apps/simd/main/test_lv_fill_functionality.c
@@ -126,9 +126,9 @@ TEST_CASE("Test fill functionality ARGB8888", "[fill][functionality][ARGB8888]")
 TEST_CASE("Test fill functionality RGB565", "[fill][functionality][RGB565]")
 {
     test_matrix_params_t test_matrix = {
-        .min_w = 8,             // 8 is the lower limit for the esp32s3 asm implementation, otherwise esp32 is executed
+        .min_w = 16,            // 16 is the lower limit for the esp32s3 asm implementation, otherwise esp32 is executed
         .min_h = 1,
-        .max_w = 16,
+        .max_w = 32,
         .max_h = 16,
         .min_unalign_byte = 0,
         .max_unalign_byte = 16,
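A note on the benchmark figures added to the README above: cycles per sample is presumably measured by wrapping the fill call in a CPU cycle counter, along these lines (a minimal sketch using ESP-IDF's `esp_cpu_get_cycle_count()`; the `fill_fn_t` signature is invented for illustration):

```c
#include <stdint.h>
#include "esp_cpu.h"

// Assumed measurement methodology; the real harness lives in the
// benchmark test app.
typedef void (*fill_fn_t)(uint16_t *dest, int32_t w, int32_t h,
                          int32_t stride, uint16_t color);

static float cycles_per_sample(fill_fn_t fill, uint16_t *dest, int32_t w,
                               int32_t h, int32_t stride, uint16_t color)
{
    uint32_t start = esp_cpu_get_cycle_count();
    fill(dest, w, h, stride, color);
    uint32_t end = esp_cpu_get_cycle_count();
    return (float)(end - start) / (float)(w * h);   // one sample = one pixel
}
```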