Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feature(lvgl_port): Add SIMD RGB565 support #438

Merged
merged 1 commit into from
Nov 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -184,8 +184,6 @@ lv_color_blend_to_argb8888_esp:
// dest_w (a4) - 4-byte multiple


mov a13, a3

ee.zero.q q1 // clear q1
ee.orq q1, q1, q0 // copy q0 to q1
sub a6, a6, a11 // dest_stride = dest_stride - dest_w_bytes
Expand Down Expand Up @@ -242,7 +240,7 @@ lv_color_blend_to_argb8888_esp:
ee.vst.128.ip q2, a3, 16 // store 16 bytes from q2 to dest_buff a3
._main_loop_unaligned_by_1byte:

// Firstly check mod 0 and mod 1 - correcting the aligned memory access
// Firstly check mod 1 and mod 2 - correcting the aligned memory access
// Go back in one Byte, allow to correct after ee.vst.128.ip aligned access
addi a3, a3, -4

Expand Down
pborcin marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
* SPDX-License-Identifier: Apache-2.0
*/

// This is LVGL RGB565 simple fill for ESP32 processor
// This is LVGL RGB565 simple fill for ESP32S3 processor

.section .text
.align 4
Expand All @@ -31,7 +31,8 @@

lv_color_blend_to_rgb565_esp:

entry a1, 32
entry a1, 32
ee.zero.q q0 // dummy TIE instruction, to enable the TIE

l32i.n a3, a2, 4 // a3 - dest_buff
l32i.n a4, a2, 8 // a4 - dest_w in uint16_t
Expand Down Expand Up @@ -63,6 +64,261 @@ lv_color_blend_to_rgb565_esp:
and a10, a10, a13
or a10, a10, a12 // a10 = 32-bit color (16bit + (16bit << 16))

// Check for short lengths
// dest_w should be at least 16, otherwise it's not worth using esp32s3 TIE
bgei a4, 16, _esp32s3_implementation // Branch if dest_w is greater than or equal to 16
pborcin marked this conversation as resolved.
Show resolved Hide resolved
j .lv_color_blend_to_rgb565_esp32_body // Jump to esp32 implementation

_esp32s3_implementation:

ee.movi.32.q q0, a10, 0 // fill q0 register from a10 by 32 bits
ee.movi.32.q q0, a10, 1
ee.movi.32.q q0, a10, 2
ee.movi.32.q q0, a10, 3

// Check dest_buff alignment
movi.n a7, 0xf // 0xf alignment mask (16-byte alignment)
and a15, a7, a3 // 16-byte alignment mask AND dest_buff pointer
bnez a15, _unaligned_by_4byte // branch if a15 not equals to zero

// Check dest_stride alignment
and a15, a7, a6 // 16-byte alignment mask AND dest_stride
bnez a15, _unaligned_by_4byte // branch if a15 not equals to zero

// Check dest_w_bytes alignment
and a15, a7, a11 // 16-byte alignment mask AND dest_w_bytes
bnez a15, _unaligned_by_4byte // branch if a15 not equals to zero

//**********************************************************************************************************************

// all aligned, the most ideal case

// dest_buff (a3) - 16-byte aligned
// dest_stride (a6) - 16-byte multiple
// dest_w (a4) - 16-byte multiple

srli a9, a4, 3 // a9 - loop_len = dest_w / 8
sub a6, a6, a11 // dest_stride = dest_stride - dest_w_bytes

.outer_loop_aligned:

loopnez a9, ._main_loop_aligned // 16 bytes (8 rgb565) in one loop
ee.vst.128.ip q0, a3, 16 // store 16 bytes from q0 to dest_buff a3
._main_loop_aligned:

add a3, a3, a6 // dest_buff + dest_stride
addi.n a5, a5, -1 // decrease the outer loop
bnez a5, .outer_loop_aligned

movi.n a2, 1 // return LV_RESULT_OK = 1
retw.n // return

_unaligned_by_4byte:

// Check dest_buff alignment
movi.n a7, 0x3 // 0x3 alignment mask (4-byte alignment)
and a15, a7, a3 // 4-byte alignment mask AND dest_buff pointer
bnez a15, _unaligned_by_1byte // branch if a15 not equals to zero

// Check dest_stride alignment
and a15, a7, a6 // 4-byte alignment mask AND dest_stride pointer
bnez a15, _unaligned_by_1byte // branch if a15 not equals to zero

//**********************************************************************************************************************

// either dest_buff or dest_stride is not 16-byte aligned
// dest_w is always 4-byte multiple
// all of the following are 4-byte aligned

// dest_buff (a3) - 16-byte, or 4-byte aligned
// dest_stride (a6) - 16-byte, or 4-byte multiple
// dest_w (a4) - 4-byte multiple

sub a6, a6, a11 // dest_stride = dest_stride - dest_w_bytes
movi.n a7, 0xf // 0xf alignment mask

.outer_loop_aligned_by_4byte:

// alignment check
and a15, a7, a3 // 0xf (alignment mask) AND dest_buff pointer
mov a12, a11 // a12 - local_dest_w_bytes = dest_w_bytes
beqz a15, _dest_buff_aligned_by_4byte // branch if a15 equals to zero


movi.n a14, 16 // a14 - 16
sub a15, a14, a15 // a15 = 16 - unalignment (lower 4 bits of dest_buff address)
sub a12, a12, a15 // local_dest_w_bytes = len - (16 - unalignment)

// keep setting until dest_buff is aligned
// Check modulo 8 of the unalignment, if - then set 8 bytes
bbci a15, 3, _aligning_mod_8_check_4byte // branch if 3-rd bit of unalignment a15 is clear
s32i.n a10, a3, 0 // save 32 bits from a10 to dest_buff a3, offset 0 bytes
s32i.n a10, a3, 4 // save 32 bits from a10 to dest_buff a3, offset 4 bytes
addi.n a3, a3, 8 // increment dest_buff pointer by 8 bytes
_aligning_mod_8_check_4byte:

// Check modulo 4 of the unalignment, if - then set 4 bytes
bbci a15, 2, _aligning_mod_4_check_4byte // branch if 2-nd bit unalignment a15 is clear
s32i.n a10, a3, 0 // save 32 bits from a10 to dest_buff a3, offset 0 bytes
addi.n a3, a3, 4 // increment dest_buff pointer by 4 bytes
_aligning_mod_4_check_4byte:

_dest_buff_aligned_by_4byte:
// Calculate main loop_len
srli a9, a12, 4 // a9 - loop_len = local_dest_w_bytes / 16

// Main loop
loopnez a9, ._main_loop_unaligned_by_4byte // 16 bytes (8 rgb565) in one loop
ee.vst.128.ip q0, a3, 16 // store 16 bytes from q0 to dest_buff a3
._main_loop_unaligned_by_4byte:

// Check modulo 8 of the dest_w, if - then set 8 bytes
bbci a12, 3, _aligned_mod_8_check_4byte // branch if 3-rd bit of local_dest_w_bytes a12 is clear
ee.vst.l.64.ip q0, a3, 8 // save lower 64 bits from q0 to dest_buff a3, increase dest_buff pointer by 8 bytes
_aligned_mod_8_check_4byte:

// Check modulo 4 of the dest_w, if - then set 4 bytes
bbci a12, 2, _aligned_mod_4_check_4byte // branch if 2-nd bit of local_dest_w_bytes a12 is clear
s32i.n a10, a3, 0 // save 32 bits from a10 to dest_buff a3, offset 0 bytes
addi.n a3, a3, 4 // increment dest_buff pointer by 4 bytes
_aligned_mod_4_check_4byte:

// Check modulo 2 of the dest_w, if - then set 2 bytes
bbci a12, 1, _aligned_mod_2_check_4byte // branch if 1-st bit of local_dest_w_bytes a12 is clear
s16i a10, a3, 0 // save 16 bits from a10 to dest_buff a3, offset 0 bytes
addi.n a3, a3, 2 // increment dest_buff pointer by 2 bytes
_aligned_mod_2_check_4byte:


add a3, a3, a6 // dest_buff + dest_stride
addi.n a5, a5, -1 // decrease the outer loop
bnez a5, .outer_loop_aligned_by_4byte

movi.n a2, 1 // return LV_RESULT_OK = 1
retw.n // return

_unaligned_by_1byte:

//**********************************************************************************************************************

// either dest_buff or dest_stride is not 4-byte aligned
// dest_w is always 4-byte multiple

// dest_buff (a3) - 4-byte, or 1-byte aligned
// dest_stride (a6) - 4-byte, or 1-byte multiple
// dest_w (a4) - 4-byte multiple


ee.zero.q q1 // clear q1
ee.orq q1, q1, q0 // copy q0 to q1
sub a6, a6, a11 // dest_stride = dest_stride - dest_w_bytes
movi.n a7, 0xf // 0xf alignment mask

.outer_loop_aligned_by_1byte:

// alignment check
and a15, a7, a3 // 0xf (alignment mask) AND dest_buff pointer
mov a12, a11 // a12 - local_dest_w_bytes = dest_w_bytes
beqz a15, _dest_buff_aligned_by_1byte // branch if a15 equals to zero


movi.n a14, 16 // a14 - 16
sub a15, a14, a15 // a15 = 16 - unalignment (lower 4 bits of dest_buff address)
sub a12, a12, a15 // local_dest_w_bytes = len - (16 - unalignment)

// keep setting until dest_buff is aligned
// Check modulo 8 of the unalignment, if - then set 8 bytes
bbci a15, 3, _aligning_mod_8_check_1byte// branch if 3-rd bit of unalignment a15 is clear
s32i.n a10, a3, 0 // save 32 bits from a10 to dest_buff a3, offset 0 bytes
s32i.n a10, a3, 4 // save 32 bits from a10 to dest_buff a3, offset 4 bytes
addi.n a3, a3, 8 // increment dest_buff pointer by 8 bytes
_aligning_mod_8_check_1byte:

// Check modulo 4 of the unalignment, if - then set 4 bytes
bbci a15, 2, _aligning_mod_4_check_1byte // branch if 2-nd bit unalignment a15 is clear
s32i.n a10, a3, 0 // save 32 bits from a10 to dest_buff a3, offset 0 bytes
addi.n a3, a3, 4 // increment dest_buff pointer by 4 bytes
_aligning_mod_4_check_1byte:

// Check modulo 2 and 1
// modulo 2 and modulo 1 requires the same action
bbci a15, 1, _aligning_mod_2_check_1byte // branch if 1-st bit unalignment a15 is clear
s16i a10, a3, 0 // save 16 bits from a10 to dest_buff a3, offset 0 bytes
addi.n a3, a3, 2 // increment dest_buff pointer by 2 bytes
_aligning_mod_2_check_1byte:

bbci a15, 0, _dest_buff_aligned_by_1byte // branch if 0-th bit of unalignment a15 is clear
s16i a10, a3, 0 // save 16 bits from a10 to dest_buff a3, offset 0 bytes
addi.n a3, a3, 2 // increment dest_buff pointer by 2 bytes
_dest_buff_aligned_by_1byte:

// Shift q reg, allowing to set 16-byte unaligned adata
wur.sar_byte a15 // apply unalignment to the SAR_BYTE
ee.src.q q2, q0, q1 // shift concat. of q0 and q1 to q2 by SAR_BYTE amount

// Calculate main loop_len
srli a9, a12, 4 // a9 - loop_len = local_dest_w_bytes / 16

// Main loop
loopnez a9, ._main_loop_unaligned_by_1byte // 16 bytes (8 rgb565) in one loop
ee.vst.128.ip q2, a3, 16 // store 16 bytes from q2 to dest_buff a3
._main_loop_unaligned_by_1byte:

// Firstly check mod 1 and mod 2 - correcting the aligned memory access
// Go back in one Byte, allow to correct after ee.vst.128.ip aligned access
addi a3, a3, -4

// Check modulo 2 of the dest_w, if - then set 2 bytes
// set SSSS in 0xSSSS0000
bbci a12, 1, _aligned_mod_2_check_1byte_corr // branch if 1-st bit of dest_w a12 is clear
srli a14, a10, 16 // shift a10 in 16, allowing s16i (saving of lower 16 bits)
s16i a14, a3, 2 // save 16 bits from a10 to dest_buff a3, offset 2 bytes

// Check modulo 1 of the dest_w, if - then set 1 byte
// additionally set SS in 0x0000SS00
bbci a12, 0, _aligned_end // branch if 0-th bit of dest_w a12 is clear
srli a14, a10, 8 // shift a10 in 8, allowing s8i
s8i a14, a3, 1 // save 8 bits from a10 to dest_buff a3, offset 1 byte
j _aligned_end
_aligned_mod_2_check_1byte_corr:

// Check modulo 1 of the dest_w, if - then set 1 byte
// set SS in 0xSS000000
bbci a12, 0, _aligned_end // branch if 0-th bit of dest_w a12 is clear
srli a14, a10, 24 // shift a10 in 24, allowing s8i (saving of lower 8 bits)
s8i a14, a3, 3 // save 8 bits from a10 to dest_buff a3, offset 3 bytes
_aligned_end:

addi a3, a3, 4 // Increase the pointer back, correction for addi a3, a3, -4

// Check modulo 8 of the dest_w, if - then set 8 bytes
bbci a12, 3, _aligned_mod_8_check_1byte // branch if 3-rd bit of local_dest_w_bytes a12 is clear
s32i.n a10, a3, 0 // save 32 bits from a10 to dest_buff a3, offset 0 bytes
s32i.n a10, a3, 4 // save 32 bits from a10 to dest_buff a3, offset 4 bytes
addi.n a3, a3, 8 // increment dest_buff pointer by 8 bytes
_aligned_mod_8_check_1byte:

// Check modulo 4 of the dest_w, if - then set 4 bytes
bbci a12, 2, _aligned_mod_4_check_1byte // branch if 2-nd bit of local_dest_w_bytes a12 is clear
s32i.n a10, a3, 0 // save 32 bits from a10 to dest_buff a3, offset 0 bytes
addi.n a3, a3, 4 // increment dest_buff pointer by 4 bytes
_aligned_mod_4_check_1byte:

// Check modulo 2 of the dest_w, if - then set 2 bytes
bbci a12, 1, _aligned_mod_2_check_1byte // branch if 1-st bit of local_dest_w_bytes a12 is clear
s16i a10, a3, 0 // save 16 bits from a10 to dest_buff a3, offset 0 bytes
addi.n a3, a3, 2 // increment dest_buff pointer by 2 bytes
_aligned_mod_2_check_1byte:

add a3, a3, a6 // dest_buff + dest_stride
addi.n a5, a5, -1 // decrease the outer loop
bnez a5, .outer_loop_aligned_by_1byte

movi.n a2, 1 // return LV_RESULT_OK = 1
retw.n // return

.lv_color_blend_to_rgb565_esp32_body:

movi.n a8, 0x3 // a8 = 0x3, dest_buff align mask
sub a6, a6, a11 // dest_stride = dest_stride - dest_w_bytes

Expand Down
11 changes: 11 additions & 0 deletions components/esp_lvgl_port/test_apps/simd/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,17 @@ Test app accommodates two types of tests: [`functionality test`](#Functionality-

Assembly source files could be found in the [`lvgl_port`](../../src/lvgl9/simd/) component. Header file with the assembly function prototypes is provided into the LVGL using Kconfig option `LV_DRAW_SW_ASM_CUSTOM_INCLUDE` and can be found in the [`lvgl_port/include`](../../include/esp_lvgl_port_lv_blend.h)

## Benchmark results
pborcin marked this conversation as resolved.
Show resolved Hide resolved

| Color format | Matrix size | Memory alignment | ASM version | ANSI C version |
| :----------- | :---------- | :--------------- | :------------- | :------------- |
| ARGB8888 | 128x128 | 16 byte | 0.327 | 1.600 |
| | 127x127 | 1 byte | 0.488 | 1.597 |
| RGB565 | 128x128 | 16 byte | 0.196 | 1.146 |
| | 127x127 | 1 byte | 0.497 | 1.124 |
* this data was obtained by running [benchmark tests](#benchmark-test) on 128x128 16 byte aligned matrix (ideal case) and 127x127 1 byte aligned matrix (worst case)
* the values represent cycles per sample to perform simple fill of the matrix on esp32s3

## Functionality test
* Tests, whether the HW accelerated assembly version of an LVGL function provides the same results as the ANSI version
* A top-level flow of the functionality test:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -126,9 +126,9 @@ TEST_CASE("Test fill functionality ARGB8888", "[fill][functionality][ARGB8888]")
TEST_CASE("Test fill functionality RGB565", "[fill][functionality][RGB565]")
{
test_matrix_params_t test_matrix = {
.min_w = 8, // 8 is the lower limit for the esp32s3 asm implementation, otherwise esp32 is executed
.min_w = 16, // 16 is the lower limit for the esp32s3 asm implementation, otherwise esp32 is executed
.min_h = 1,
.max_w = 16,
.max_w = 32,
.max_h = 16,
.min_unalign_byte = 0,
.max_unalign_byte = 16,
Expand Down