Commit

add rgb565 acceleration
pborcin committed Nov 27, 2024
1 parent 1795f91 commit 55281ea
Showing 4 changed files with 271 additions and 7 deletions.
@@ -184,8 +184,6 @@ lv_color_blend_to_argb8888_esp:
// dest_w (a4) - 4-byte multiple


- mov a13, a3

ee.zero.q q1 // clear q1
ee.orq q1, q1, q0 // copy q0 to q1
sub a6, a6, a11 // dest_stride = dest_stride - dest_w_bytes
@@ -242,7 +240,7 @@ lv_color_blend_to_argb8888_esp:
ee.vst.128.ip q2, a3, 16 // store 16 bytes from q0 to dest_buff a3
._main_loop_unaligned_by_1byte:

- // Firstly check mod 0 and mod 1 - correcting the aligned memory access
+ // Firstly check mod 1 and mod 2 - correcting the aligned memory access
// Go back 4 bytes, allowing the tail to be corrected after the aligned ee.vst.128.ip access
addi a3, a3, -4

@@ -4,7 +4,7 @@
* SPDX-License-Identifier: Apache-2.0
*/

- // This is LVGL RGB565 simple fill for ESP32 processor
+ // This is LVGL RGB565 simple fill for ESP32S3 processor

.section .text
.align 4
@@ -31,7 +31,8 @@

lv_color_blend_to_rgb565_esp:

- entry a1, 32
+ entry a1, 32
+ ee.zero.q q0 // dummy TIE instruction to enable the TIE

l32i.n a3, a2, 4 // a3 - dest_buff
l32i.n a4, a2, 8 // a4 - dest_w in uint16_t
@@ -63,6 +64,261 @@ lv_color_blend_to_rgb565_esp:
and a10, a10, a13
or a10, a10, a12 // a10 = 32-bit color (16bit + (16bit << 16))
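
// Worked example (values are illustrative, not from the source): for an RGB565
// color 0x1234, a12 = 0x12340000 and a10 ends up as 0x12341234, i.e. the same
// 16-bit pixel replicated into both halves of the 32-bit word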

// Check for short lengths
// dest_w should be at least 16, otherwise it's not worth using the esp32s3 TIE
bgei a4, 16, _esp32s3_implementation // branch if dest_w is greater than or equal to 16
j .lv_color_blend_to_rgb565_esp32_body // Jump to esp32 implementation
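
// Equivalent C-like dispatch (illustrative):
//   if (dest_w >= 16) { /* esp32s3 TIE path */ } else { /* generic esp32 path */ }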

_esp32s3_implementation:

ee.movi.32.q q0, a10, 0 // copy 32 bits from a10 into q0, word 0
ee.movi.32.q q0, a10, 1 // word 1
ee.movi.32.q q0, a10, 2 // word 2
ee.movi.32.q q0, a10, 3 // word 3
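
// q0 now holds the pixel replicated eight times (16 bytes), so each
// 128-bit store below writes 8 RGB565 pixels at once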

// Check dest_buff alignment
movi.n a7, 0xf // 0xf alignment mask (16-byte alignment)
and a15, a7, a3 // 16-byte alignment mask AND dest_buff pointer
bnez a15, _unaligned_by_4byte // branch if a15 is not zero

// Check dest_stride alignment
and a15, a7, a6 // 16-byte alignment mask AND dest_stride
bnez a15, _unaligned_by_4byte // branch if a15 is not zero

// Check dest_w_bytes alignment
and a15, a7, a11 // 16-byte alignment mask AND dest_w_bytes
bnez a15, _unaligned_by_4byte // branch if a15 is not zero

//**********************************************************************************************************************

// all aligned, the most ideal case

// dest_buff (a3) - 16-byte aligned
// dest_stride (a6) - 16-byte multiple
// dest_w (a4) - 16-byte multiple

srli a9, a4, 3 // a9 - loop_len = dest_w / 8
sub a6, a6, a11 // dest_stride = dest_stride - dest_w_bytes
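
// Loop-length example (illustrative): dest_w = 128 px -> loop_len = 16
// iterations of the zero-overhead loop below, each storing 16 bytes (8 px)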

.outer_loop_aligned:

loopnez a9, ._main_loop_aligned // 16 bytes (8 rgb565) in one loop
ee.vst.128.ip q0, a3, 16 // store 16 bytes from q0 to dest_buff a3
._main_loop_aligned:

add a3, a3, a6 // dest_buff + dest_stride
addi.n a5, a5, -1 // decrement the outer loop counter
bnez a5, .outer_loop_aligned

movi.n a2, 1 // return LV_RESULT_OK = 1
retw.n // return

_unaligned_by_4byte:

// Check dest_buff alignment
movi.n a7, 0x3 // 0x3 alignment mask (4-byte alignment)
and a15, a7, a3 // 4-byte alignment mask AND dest_buff pointer
bnez a15, _unaligned_by_1byte // branch if a15 is not zero

// Check dest_stride alignment
and a15, a7, a6 // 4-byte alignment mask AND dest_stride
bnez a15, _unaligned_by_1byte // branch if a15 is not zero

//**********************************************************************************************************************

// either dest_buff or dest_stride is not 16-byte aligned
// dest_w is always 4-byte multiple
// all of the following are 4-byte aligned

// dest_buff (a3) - 16-byte, or 4-byte aligned
// dest_stride (a6) - 16-byte, or 4-byte multiple
// dest_w (a4) - 4-byte multiple

sub a6, a6, a11 // dest_stride = dest_stride - dest_w_bytes
movi.n a7, 0xf // 0xf alignment mask

.outer_loop_aligned_by_4byte:

// alignment check
and a15, a7, a3 // 0xf (alignment mask) AND dest_buff pointer
mov a12, a11 // a12 - local_dest_w_bytes = dest_w_bytes
beqz a15, _dest_buff_aligned_by_4byte // branch if a15 is zero


movi.n a14, 16 // a14 = 16
sub a15, a14, a15 // a15 = 16 - unalignment (lower 4 bits of dest_buff address)
sub a12, a12, a15 // local_dest_w_bytes = dest_w_bytes - (16 - unalignment)
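
// Illustrative example: dest_buff & 0xf == 12 -> a15 = 4, so bit 2 of a15
// is set and 4 bytes are stored below to reach 16-byte alignment, while
// local_dest_w_bytes shrinks by those same 4 bytes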

// keep storing until dest_buff is aligned
// Check bit 3 of the unalignment; if set, store 8 bytes
bbci a15, 3, _aligning_mod_8_check_4byte // branch if bit 3 of unalignment a15 is clear
s32i.n a10, a3, 0 // save 32 bits from a10 to dest_buff a3, offset 0 bytes
s32i.n a10, a3, 4 // save 32 bits from a10 to dest_buff a3, offset 4 bytes
addi.n a3, a3, 8 // increment dest_buff pointer by 8 bytes
_aligning_mod_8_check_4byte:

// Check bit 2 of the unalignment; if set, store 4 bytes
bbci a15, 2, _aligning_mod_4_check_4byte // branch if bit 2 of unalignment a15 is clear
s32i.n a10, a3, 0 // save 32 bits from a10 to dest_buff a3, offset 0 bytes
addi.n a3, a3, 4 // increment dest_buff pointer by 4 bytes
_aligning_mod_4_check_4byte:

_dest_buff_aligned_by_4byte:
// Calculate main loop_len
srli a9, a12, 4 // a9 - loop_len = local_dest_w_bytes / 16

// Main loop
loopnez a9, ._main_loop_unaligned_by_4byte // 16 bytes (8 rgb565) in one loop
ee.vst.128.ip q0, a3, 16 // store 16 bytes from q0 to dest_buff a3
._main_loop_unaligned_by_4byte:

// Check bit 3 of local_dest_w_bytes; if set, store 8 bytes
bbci a12, 3, _aligned_mod_8_check_4byte // branch if bit 3 of local_dest_w_bytes a12 is clear
ee.vst.l.64.ip q0, a3, 8 // save lower 64 bits from q0 to dest_buff a3, increase dest_buff pointer by 8 bytes
_aligned_mod_8_check_4byte:

// Check bit 2 of local_dest_w_bytes; if set, store 4 bytes
bbci a12, 2, _aligned_mod_4_check_4byte // branch if bit 2 of local_dest_w_bytes a12 is clear
s32i.n a10, a3, 0 // save 32 bits from a10 to dest_buff a3, offset 0 bytes
addi.n a3, a3, 4 // increment dest_buff pointer by 4 bytes
_aligned_mod_4_check_4byte:

// Check bit 1 of local_dest_w_bytes; if set, store 2 bytes
bbci a12, 1, _aligned_mod_2_check_4byte // branch if bit 1 of local_dest_w_bytes a12 is clear
s16i a10, a3, 0 // save 16 bits from a10 to dest_buff a3, offset 0 bytes
addi.n a3, a3, 2 // increment dest_buff pointer by 2 bytes
_aligned_mod_2_check_4byte:


add a3, a3, a6 // dest_buff + dest_stride
addi.n a5, a5, -1 // decrement the outer loop counter
bnez a5, .outer_loop_aligned_by_4byte

movi.n a2, 1 // return LV_RESULT_OK = 1
retw.n // return

_unaligned_by_1byte:

//**********************************************************************************************************************

// either dest_buff or dest_stride is not 4-byte aligned
// dest_w is always 4-byte multiple

// dest_buff (a3) - 4-byte, or 1-byte aligned
// dest_stride (a6) - 4-byte, or 1-byte multiple
// dest_w (a4) - 4-byte multiple


ee.zero.q q1 // clear q1
ee.orq q1, q1, q0 // copy q0 to q1
sub a6, a6, a11 // dest_stride = dest_stride - dest_w_bytes
movi.n a7, 0xf // 0xf alignment mask

.outer_loop_aligned_by_1byte:

// alignment check
and a15, a7, a3 // 0xf (alignment mask) AND dest_buff pointer
mov a12, a11 // a12 - local_dest_w_bytes = dest_w_bytes
beqz a15, _dest_buff_aligned_by_1byte // branch if a15 is zero


movi.n a14, 16 // a14 = 16
sub a15, a14, a15 // a15 = 16 - unalignment (lower 4 bits of dest_buff address)
sub a12, a12, a15 // local_dest_w_bytes = dest_w_bytes - (16 - unalignment)

// keep storing until dest_buff is aligned
// Check bit 3 of the unalignment; if set, store 8 bytes
bbci a15, 3, _aligning_mod_8_check_1byte // branch if bit 3 of unalignment a15 is clear
s32i.n a10, a3, 0 // save 32 bits from a10 to dest_buff a3, offset 0 bytes
s32i.n a10, a3, 4 // save 32 bits from a10 to dest_buff a3, offset 4 bytes
addi.n a3, a3, 8 // increment dest_buff pointer by 8 bytes
_aligning_mod_8_check_1byte:

// Check bit 2 of the unalignment; if set, store 4 bytes
bbci a15, 2, _aligning_mod_4_check_1byte // branch if bit 2 of unalignment a15 is clear
s32i.n a10, a3, 0 // save 32 bits from a10 to dest_buff a3, offset 0 bytes
addi.n a3, a3, 4 // increment dest_buff pointer by 4 bytes
_aligning_mod_4_check_1byte:

// Check bits 1 and 0 of the unalignment
// mod 2 and mod 1 require the same action here
bbci a15, 1, _aligning_mod_2_check_1byte // branch if bit 1 of unalignment a15 is clear
s16i a10, a3, 0 // save 16 bits from a10 to dest_buff a3, offset 0 bytes
addi.n a3, a3, 2 // increment dest_buff pointer by 2 bytes
_aligning_mod_2_check_1byte:

bbci a15, 0, _dest_buff_aligned_by_1byte // branch if bit 0 of unalignment a15 is clear
s16i a10, a3, 0 // save 16 bits from a10 to dest_buff a3, offset 0 bytes
addi.n a3, a3, 2 // increment dest_buff pointer by 2 bytes
_dest_buff_aligned_by_1byte:

// Shift the q reg to allow storing to a 16-byte-unaligned destination
wur.sar_byte a15 // write the shift amount (16 - unalignment) to SAR_BYTE
ee.src.q q2, q0, q1 // shift concat. of q0 and q1 into q2 by SAR_BYTE bytes
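
// Since q1 is a copy of q0, the SAR_BYTE shift in effect byte-rotates the
// replicated pixel pattern, so the aligned 16-byte stores below land the
// pattern at the correct pixel phase of the unaligned destination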

// Calculate main loop_len
srli a9, a12, 4 // a9 - loop_len = local_dest_w_bytes / 16

// Main loop
loopnez a9, ._main_loop_unaligned_by_1byte // 16 bytes (8 rgb565) in one loop
ee.vst.128.ip q2, a3, 16 // store 16 bytes from q2 to dest_buff a3
._main_loop_unaligned_by_1byte:

// First handle mod 1 and mod 2, correcting for the aligned memory access
// Go back 4 bytes, allowing the tail to be corrected after the aligned ee.vst.128.ip access
addi a3, a3, -4

// Check bit 1 of local_dest_w_bytes; if set, store 2 bytes
// set SSSS in 0xSSSS0000
bbci a12, 1, _aligned_mod_2_check_1byte_corr // branch if bit 1 of local_dest_w_bytes a12 is clear
srli a14, a10, 16 // shift a10 right by 16, so s16i stores the upper 16 bits
s16i a14, a3, 2 // save 16 bits from a14 to dest_buff a3, offset 2 bytes

// Check bit 0 of local_dest_w_bytes; if set, store 1 byte
// additionally set SS in 0x0000SS00
bbci a12, 0, _aligned_end // branch if bit 0 of local_dest_w_bytes a12 is clear
srli a14, a10, 8 // shift a10 right by 8, so s8i stores the correct byte
s8i a14, a3, 1 // save 8 bits from a14 to dest_buff a3, offset 1 byte
j _aligned_end
_aligned_mod_2_check_1byte_corr:

// Check bit 0 of local_dest_w_bytes; if set, store 1 byte
// set SS in 0xSS000000
bbci a12, 0, _aligned_end // branch if bit 0 of local_dest_w_bytes a12 is clear
srli a14, a10, 24 // shift a10 right by 24, so s8i stores the top byte
s8i a14, a3, 3 // save 8 bits from a14 to dest_buff a3, offset 3 bytes
_aligned_end:

addi a3, a3, 4 // move the pointer forward again, undoing the addi a3, a3, -4

// Check bit 3 of local_dest_w_bytes; if set, store 8 bytes
bbci a12, 3, _aligned_mod_8_check_1byte // branch if bit 3 of local_dest_w_bytes a12 is clear
s32i.n a10, a3, 0 // save 32 bits from a10 to dest_buff a3, offset 0 bytes
s32i.n a10, a3, 4 // save 32 bits from a10 to dest_buff a3, offset 4 bytes
addi.n a3, a3, 8 // increment dest_buff pointer by 8 bytes
_aligned_mod_8_check_1byte:

// Check bit 2 of local_dest_w_bytes; if set, store 4 bytes
bbci a12, 2, _aligned_mod_4_check_1byte // branch if bit 2 of local_dest_w_bytes a12 is clear
s32i.n a10, a3, 0 // save 32 bits from a10 to dest_buff a3, offset 0 bytes
addi.n a3, a3, 4 // increment dest_buff pointer by 4 bytes
_aligned_mod_4_check_1byte:

// Check bit 1 of local_dest_w_bytes; if set, store 2 bytes
bbci a12, 1, _aligned_mod_2_check_1byte // branch if bit 1 of local_dest_w_bytes a12 is clear
s16i a10, a3, 0 // save 16 bits from a10 to dest_buff a3, offset 0 bytes
addi.n a3, a3, 2 // increment dest_buff pointer by 2 bytes
_aligned_mod_2_check_1byte:

add a3, a3, a6 // dest_buff + dest_stride
addi.n a5, a5, -1 // decrement the outer loop counter
bnez a5, .outer_loop_aligned_by_1byte

movi.n a2, 1 // return LV_RESULT_OK = 1
retw.n // return

.lv_color_blend_to_rgb565_esp32_body:

movi.n a8, 0x3 // a8 = 0x3, dest_buff align mask
sub a6, a6, a11 // dest_stride = dest_stride - dest_w_bytes

10 changes: 10 additions & 0 deletions components/esp_lvgl_port/test_apps/simd/README.md
@@ -4,6 +4,16 @@ Test app accommodates two types of tests: [`functionality test`](#Functionality-

Assembly source files can be found in the [`lvgl_port`](../../src/lvgl9/simd/) component. The header file with the assembly function prototypes is provided to LVGL via the Kconfig option `LV_DRAW_SW_ASM_CUSTOM_INCLUDE` and can be found in [`lvgl_port/include`](../../include/esp_lvgl_port_lv_blend.h).
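
A minimal sketch of how this hook is typically wired (macro and function names below are illustrative rather than copied from the header — see [`esp_lvgl_port_lv_blend.h`](../../include/esp_lvgl_port_lv_blend.h) for the exact definitions):

```c
/* lv_conf.h / Kconfig: point LVGL at the custom blend header */
#define LV_DRAW_SW_ASM_CUSTOM_INCLUDE "esp_lvgl_port_lv_blend.h"

/* Inside that header, LVGL's software-blend hooks are redirected
 * to the hand-written assembly implementations, e.g.: */
#define LV_DRAW_SW_COLOR_BLEND_TO_RGB565(dsc) \
    lv_color_blend_to_rgb565_esp(dsc)
```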

## Benchmark results

| Color format | Matrix size | ANSI C version | ASM version |
| :----------- | :---------- | :------------- | :------------- |
| ARGB8888 | 128x128 | 1.600 | 0.327 |
| | 127x127 | 1.597 | 0.488 |
| RGB565 | 128x128 | 1.146 | 0.196 |
| | 127x127 | 1.124 | 0.497 |
* The values represent processor cycles per sample needed to perform a simple fill of the matrix; for RGB565 at 128x128, for example, the ASM version is roughly 5.8x faster than the ANSI C version.

## Functionality test
* Tests whether the HW-accelerated assembly version of an LVGL function provides the same results as the ANSI version
* A top-level flow of the functionality test:
@@ -126,9 +126,9 @@ TEST_CASE("Test fill functionality ARGB8888", "[fill][functionality][ARGB8888]")
TEST_CASE("Test fill functionality RGB565", "[fill][functionality][RGB565]")
{
test_matrix_params_t test_matrix = {
- .min_w = 8,  // 8 is the lower limit for the esp32s3 asm implementation, otherwise esp32 is executed
+ .min_w = 16, // 16 is the lower limit for the esp32s3 asm implementation, otherwise esp32 is executed
.min_h = 1,
- .max_w = 16,
+ .max_w = 32,
.max_h = 16,
.min_unalign_byte = 0,
.max_unalign_byte = 16,
