From 83f5264ee2b8b7b452b953ab6dc75e35f901571c Mon Sep 17 00:00:00 2001
From: weetrain <1026809857@qq.com>
Date: Mon, 27 Dec 2021 12:51:13 +0800
Subject: [PATCH] pixel diff 8bit and 10bit

---
 src/armv8/arm64.c       |  21 +-
 src/armv8/pixel_arm64.S | 844 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 858 insertions(+), 7 deletions(-)

diff --git a/src/armv8/arm64.c b/src/armv8/arm64.c
index f28e77e..fb87447 100644
--- a/src/armv8/arm64.c
+++ b/src/armv8/arm64.c
@@ -42,6 +42,13 @@ void uavs3e_funs_init_arm64()
 
     uavs3e_funs_handle.alf = uavs3e_alf_filter_block_arm64;
 
+    uavs3e_funs_handle.pel_diff[0] = uavs3e_pel_diff_4_arm64;
+    uavs3e_funs_handle.pel_diff[1] = uavs3e_pel_diff_8_arm64;
+    uavs3e_funs_handle.pel_diff[2] = uavs3e_pel_diff_16_arm64;
+    uavs3e_funs_handle.pel_diff[3] = uavs3e_pel_diff_32_arm64;
+    uavs3e_funs_handle.pel_diff[4] = uavs3e_pel_diff_64_arm64;
+    uavs3e_funs_handle.pel_diff[5] = uavs3e_pel_diff_128_arm64;
+
 #else
     uavs3e_funs_handle.itrans_dct2[1][1] = uavs3e_itrans_dct2_h4_w4_arm64;
     uavs3e_funs_handle.itrans_dct2[1][2] = uavs3e_itrans_dct2_h4_w8_arm64;
@@ -175,6 +182,13 @@ void uavs3e_funs_init_arm64()
     uavs3e_funs_handle.cost_satd[2][1] = uavs3e_had_16x8_arm64;
     uavs3e_funs_handle.cost_satd[1][2] = uavs3e_had_8x16_arm64;
 
+    uavs3e_funs_handle.pel_diff[0] = uavs3e_pel_diff_4_arm64;
+    uavs3e_funs_handle.pel_diff[1] = uavs3e_pel_diff_8_arm64;
+    uavs3e_funs_handle.pel_diff[2] = uavs3e_pel_diff_16_arm64;
+    uavs3e_funs_handle.pel_diff[3] = uavs3e_pel_diff_32_arm64;
+    uavs3e_funs_handle.pel_diff[4] = uavs3e_pel_diff_64_arm64;
+    uavs3e_funs_handle.pel_diff[5] = uavs3e_pel_diff_128_arm64;
+
     /*
     uavs3e_funs_handle.cost_var[0] = uavs3e_get_var_4_arm64;
     uavs3e_funs_handle.cost_var[1] = uavs3e_get_var_8_arm64;
@@ -188,13 +202,6 @@ void uavs3e_funs_init_arm64()
 
     uavs3e_funs_handle.sobel_cost = sobel_cost;
 
-    uavs3e_funs_handle.pel_diff[0] = uavs3e_pel_diff_4_arm64;
-    uavs3e_funs_handle.pel_diff[1] = uavs3e_pel_diff_8_arm64;
-    uavs3e_funs_handle.pel_diff[2] = uavs3e_pel_diff_16_arm64;
-    uavs3e_funs_handle.pel_diff[3] = uavs3e_pel_diff_32_arm64;
-    uavs3e_funs_handle.pel_diff[4] = uavs3e_pel_diff_64_arm64;
-    uavs3e_funs_handle.pel_diff[5] = uavs3e_pel_diff_128_arm64;
-
     uavs3e_funs_handle.pel_avrg[0] = uavs3e_pel_avrg_4_arm64;
     uavs3e_funs_handle.pel_avrg[1] = uavs3e_pel_avrg_8_arm64;
     uavs3e_funs_handle.pel_avrg[2] = uavs3e_pel_avrg_16_arm64;
diff --git a/src/armv8/pixel_arm64.S b/src/armv8/pixel_arm64.S
index dae8ef5..a9ce46d 100644
--- a/src/armv8/pixel_arm64.S
+++ b/src/armv8/pixel_arm64.S
@@ -174,6 +174,490 @@ recon_w8_end:
 
     ret
 
+/******************************************************************************************************
+* void uavs3e_pel_diff_4_arm64(pel *p_org, int i_org, pel *p_pred, int i_pred, s16 *p_resi, int i_resi, int height)
+* p_org->x0, i_org->x1, p_pred->x2, i_pred->x3, p_resi->x4, i_resi->x5, height->x6
+******************************************************************************************************/
+
+function uavs3e_pel_diff_4_arm64
+
+    lsl x5, x5, #1                  // p_resi holds 2-byte s16 values: turn its stride into bytes
+
+diff_w4_loopx:
+
+    //load four 4-byte rows of p_org into the four .s lanes of v0 (byte-pel variant)
+    //ld4 {v0.4h,v1.4h,v2.4h,v3.4h}, [x0], x1
+    ld1 {v0.s}[0], [x0], x1
+    ld1 {v0.s}[1], [x0], x1
+    ld1 {v0.s}[2], [x0], x1
+    ld1 {v0.s}[3], [x0], x1
+
+    //load the matching four rows of p_pred into v1
+    //ld1 {v4.4h,v5.4h,v6.4h,v7.4h,}, [x2], x3
+    ld1 {v1.s}[0], [x2], x3
+    ld1 {v1.s}[1], [x2], x3
+    ld1 {v1.s}[2], [x2], x3
+    ld1 {v1.s}[3], [x2], x3
+
+    uxtl v2.8h, v0.8b               // org rows 0-1, bytes zero-extended to 16 bit
+    uxtl2 v3.8h, v0.16b             // org rows 2-3
+    uxtl v4.8h, v1.8b               // pred rows 0-1
+    uxtl2 v5.8h, v1.16b             // pred rows 2-3
+
+    //subtract: org - pred in 16-bit lanes
+    sub v0.8h, v2.8h, v4.8h
+    sub v1.8h, v3.8h, v5.8h
+
+// ssubl v2.8h,v0.8b,v1.8b
+// ssubl2 v3.8h,v0.16b,v1.16b
+
+
+    //p_resi[i] = p_org[i] - p_pred[i]  (one 8-byte .d lane = one row of four s16)
+    st1 {v0.d}[0], [x4], x5
+    st1 {v0.d}[1], [x4], x5
+    st1 {v1.d}[0], [x4], x5
+    st1 {v1.d}[1], [x4], x5
+
+
+// //p_resi[i] = p_org[i] - p_pred[i]
+// st1 {v2.d}[0], [x4], x5
+// st1 {v2.d}[1], [x4], x5
+// st1 {v3.d}[0], [x4], x5
+// st1 {v3.d}[1], [x4], x5
+
+    sub x6,x6,#4                    // four rows are consumed per pass
+    cmp x6,#0                       // NOTE(review): int args (strides, height) are used as full x-registers with no sxtw - confirm the upper 32 bits are clean
+    bgt diff_w4_loopx
+
+    ret
+
+
+/******************************************************************************************************
+* void uavs3e_pel_diff_8_arm64(pel *p_org, int i_org, pel *p_pred, int i_pred, s16 *p_resi, int i_resi, int height)
+* p_org->x0, i_org->x1, p_pred->x2, i_pred->x3, p_resi->x4, i_resi->x5, height->x6
+******************************************************************************************************/
+
+function uavs3e_pel_diff_8_arm64
+
+    lsl x5, x5, #1                  // p_resi holds 2-byte s16 values: turn its stride into bytes
+
+diff_w8_loopx:
+
+    //load four 8-byte rows of p_org, one row per register (byte-pel variant)
+    //ld4 {v0.4h,v1.4h,v2.4h,v3.4h}, [x0], x1
+    ld1 {v0.d}[0], [x0], x1
+    ld1 {v1.d}[0], [x0], x1
+    ld1 {v2.d}[0], [x0], x1
+    ld1 {v3.d}[0], [x0], x1
+
+    //load the matching four rows of p_pred
+    //ld1 {v4.4h,v5.4h,v6.4h,v7.4h,}, [x2], x3
+    ld1 {v4.d}[0], [x2], x3
+    ld1 {v5.d}[0], [x2], x3
+    ld1 {v6.d}[0], [x2], x3
+    ld1 {v7.d}[0], [x2], x3
+
+    uxtl v0.8h, v0.8b               // zero-extend every byte row to 16-bit lanes
+    uxtl v1.8h, v1.8b
+    uxtl v2.8h, v2.8b
+    uxtl v3.8h, v3.8b
+    uxtl v4.8h, v4.8b
+    uxtl v5.8h, v5.8b
+    uxtl v6.8h, v6.8b
+    uxtl v7.8h, v7.8b
+
+
+    //subtract: org - pred in 16-bit lanes
+    sub v0.8h, v0.8h, v4.8h
+    sub v1.8h, v1.8h, v5.8h
+    sub v2.8h, v2.8h, v6.8h
+    sub v3.8h, v3.8h, v7.8h
+
+    //p_resi[i] = p_org[i] - p_pred[i]
+    st1 {v0.8h}, [x4], x5
+    st1 {v1.8h}, [x4], x5
+    st1 {v2.8h}, [x4], x5
+    st1 {v3.8h}, [x4], x5
+
+    sub x6,x6,#4                    // four rows are consumed per pass
+    cmp x6,#0                       // NOTE(review): int args (strides, height) are used as full x-registers with no sxtw - confirm the upper 32 bits are clean
+    bgt diff_w8_loopx
+
+    ret
+
+/******************************************************************************************************
+* void uavs3e_pel_diff_16_arm64(pel *p_org, int i_org, pel *p_pred, int i_pred, s16 *p_resi, int i_resi, int height)
+* p_org->x0, i_org->x1, p_pred->x2, i_pred->x3, p_resi->x4, i_resi->x5, height->x6
+******************************************************************************************************/
+
+function uavs3e_pel_diff_16_arm64
+
+    lsl x5, x5, #1                  // residual stride in bytes (s16 output)
+
+diff_w16_loopx:
+
+    //load four 16-byte rows of p_org, two 8-byte registers per row
+    //ld4 {v0.4h,v1.4h,v2.4h,v3.4h}, [x0], x1
+    ld1 {v0.8b,v1.8b}, [x0], x1
+    ld1 {v2.8b,v3.8b}, [x0], x1
+    ld1 {v4.8b,v5.8b}, [x0], x1
+    ld1 {v6.8b,v7.8b}, [x0], x1
+
+    //load the matching four rows of p_pred into v16-v23
+    //ld1 {v4.4h,v5.4h,v6.4h,v7.4h,}, [x2], x3
+    ld1 {v16.8b,v17.8b}, [x2], x3
+    ld1 {v18.8b,v19.8b}, [x2], x3
+    ld1 {v20.8b,v21.8b}, [x2], x3
+    ld1 {v22.8b,v23.8b}, [x2], x3
+
+    uxtl v0.8h, v0.8b               // zero-extend every byte vector to 16-bit lanes
+    uxtl v1.8h, v1.8b
+    uxtl v2.8h, v2.8b
+    uxtl v3.8h, v3.8b
+    uxtl v4.8h, v4.8b
+    uxtl v5.8h, v5.8b
+    uxtl v6.8h, v6.8b
+    uxtl v7.8h, v7.8b
+    uxtl v16.8h, v16.8b
+    uxtl v17.8h, v17.8b
+    uxtl v18.8h, v18.8b
+    uxtl v19.8h, v19.8b
+    uxtl v20.8h, v20.8b
+    uxtl v21.8h, v21.8b
+    uxtl v22.8h, v22.8b
+    uxtl v23.8h, v23.8b
+
+    //subtract: org - pred in 16-bit lanes
+    sub v0.8h, v0.8h, v16.8h
+    sub v1.8h, v1.8h, v17.8h
+    sub v2.8h, v2.8h, v18.8h
+    sub v3.8h, v3.8h, v19.8h
+    sub v4.8h, v4.8h, v20.8h
+    sub v5.8h, v5.8h, v21.8h
+    sub v6.8h, v6.8h, v22.8h
+    sub v7.8h, v7.8h, v23.8h
+
+
+    //p_resi[i] = p_org[i] - p_pred[i]
+    st1 {v0.8h,v1.8h}, [x4], x5
+    st1 {v2.8h,v3.8h}, [x4], x5
+    st1 {v4.8h,v5.8h}, [x4], x5
+    st1 {v6.8h,v7.8h}, [x4], x5
+
+    sub x6,x6,#4                    // four rows are consumed per pass
+    cmp x6,#0
+    bgt diff_w16_loopx
+    b recon_w16_end                 // branches to the label that immediately follows
+
+recon_w16_end:                      // NOTE(review): sibling kernels name this label diff_wN_end; "recon_" looks copied - confirm it cannot clash with a recon label
+
+    ret
+
+/******************************************************************************************************
+* void uavs3e_pel_diff_32_arm64(pel *p_org, int i_org, pel *p_pred, int i_pred, s16 *p_resi, int i_resi, int height)
+* p_org->x0, i_org->x1, p_pred->x2, i_pred->x3, p_resi->x4, i_resi->x5, height->x6
+******************************************************************************************************/
+
+function uavs3e_pel_diff_32_arm64
+
+    sub sp, sp, #64                 // v8-v15 are overwritten by the loop, so spill them (2 x 64 bytes) first
+    st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp]
+    sub sp, sp, #64
+    st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp]
+
+    lsl x5, x5, #1                  // residual stride in bytes (s16 output)
+
+diff_w32_loopx:
+
+    //load four 32-byte rows of p_org into v0-v15 (8 bytes per register)
+    //ld4 {v0.8h-v15.8h}, [x0], x1
+    ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [x0], x1
+    ld1 {v4.8b,v5.8b,v6.8b,v7.8b}, [x0], x1
+    ld1 {v8.8b,v9.8b,v10.8b,v11.8b}, [x0], x1
+    ld1 {v12.8b,v13.8b,v14.8b,v15.8b}, [x0], x1
+
+    //load the matching four rows of p_pred into v16-v31
+    //ld1 {v16.8h-v31.8h}, [x2], x3
+    ld1 {v16.8b,v17.8b,v18.8b,v19.8b}, [x2], x3
+    ld1 {v20.8b,v21.8b,v22.8b,v23.8b}, [x2], x3
+    ld1 {v24.8b,v25.8b,v26.8b,v27.8b}, [x2], x3
+    ld1 {v28.8b,v29.8b,v30.8b,v31.8b}, [x2], x3
+
+    uxtl v0.8h, v0.8b               // zero-extend every byte vector to 16-bit lanes
+    uxtl v1.8h, v1.8b
+    uxtl v2.8h, v2.8b
+    uxtl v3.8h, v3.8b
+    uxtl v4.8h, v4.8b
+    uxtl v5.8h, v5.8b
+    uxtl v6.8h, v6.8b
+    uxtl v7.8h, v7.8b
+    uxtl v8.8h, v8.8b
+    uxtl v9.8h, v9.8b
+    uxtl v10.8h, v10.8b
+    uxtl v11.8h, v11.8b
+    uxtl v12.8h, v12.8b
+    uxtl v13.8h, v13.8b
+    uxtl v14.8h, v14.8b
+    uxtl v15.8h, v15.8b
+    uxtl v16.8h, v16.8b
+    uxtl v17.8h, v17.8b
+    uxtl v18.8h, v18.8b
+    uxtl v19.8h, v19.8b
+    uxtl v20.8h, v20.8b
+    uxtl v21.8h, v21.8b
+    uxtl v22.8h, v22.8b
+    uxtl v23.8h, v23.8b
+    uxtl v24.8h, v24.8b
+    uxtl v25.8h, v25.8b
+    uxtl v26.8h, v26.8b
+    uxtl v27.8h, v27.8b
+    uxtl v28.8h, v28.8b
+    uxtl v29.8h, v29.8b
+    uxtl v30.8h, v30.8b
+    uxtl v31.8h, v31.8b
+
+    //subtract: org - pred in 16-bit lanes
+    sub v0.8h, v0.8h, v16.8h
+    sub v1.8h, v1.8h, v17.8h
+    sub v2.8h, v2.8h, v18.8h
+    sub v3.8h, v3.8h, v19.8h
+    sub v4.8h, v4.8h, v20.8h
+    sub v5.8h, v5.8h, v21.8h
+    sub v6.8h, v6.8h, v22.8h
+    sub v7.8h, v7.8h, v23.8h
+    sub v8.8h, v8.8h, v24.8h
+    sub v9.8h, v9.8h, v25.8h
+    sub v10.8h, v10.8h, v26.8h
+    sub v11.8h, v11.8h, v27.8h
+    sub v12.8h, v12.8h, v28.8h
+    sub v13.8h, v13.8h, v29.8h
+    sub v14.8h, v14.8h, v30.8h
+    sub v15.8h, v15.8h, v31.8h
+
+    //p_resi[i] = p_org[i] - p_pred[i]
+    st1 {v0.8h,v1.8h,v2.8h,v3.8h}, [x4], x5
+    st1 {v4.8h,v5.8h,v6.8h,v7.8h}, [x4], x5
+    st1 {v8.8h,v9.8h,v10.8h,v11.8h}, [x4], x5
+    st1 {v12.8h,v13.8h,v14.8h,v15.8h}, [x4], x5
+
+    sub x6,x6,#4                    // four rows are consumed per pass
+    cmp x6,#0
+    bgt diff_w32_loopx
+    b diff_w32_end                  // branches to the label that immediately follows
+
+diff_w32_end:
+
+    ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64    // reload v8-v15 in reverse order of the spill
+    ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
+
+    ret
+
+/******************************************************************************************************
+* void uavs3e_pel_diff_64_arm64(pel *p_org, int i_org, pel *p_pred, int i_pred, s16 *p_resi, int i_resi, int height)
+* p_org->x0, i_org->x1, p_pred->x2, i_pred->x3, p_resi->x4, i_resi->x5, height->x6
+******************************************************************************************************/
+
+function uavs3e_pel_diff_64_arm64
+
+    sub sp, sp, #64                 // v8-v15 are overwritten by the loop, so spill them first
+    st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp]
+    sub sp, sp, #64
+    st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp]
+
+    lsl x5, x5, #1                  // residual stride in bytes (s16 output)
+
+    sub x1, x1, #32                 // the first half-row load already advances x0 by 32
+    sub x3, x3, #32                 // same for x2
+    sub x5, x5, #64                 // the first half-row store already advances x4 by 64
+
+diff_w64_loopx:
+
+    //load two 64-byte rows of p_org, each as two 32-byte halves
+    //ld4 {v0.8h-v15.8h}, [x0], x1
+    ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [x0], #32
+    ld1 {v4.8b,v5.8b,v6.8b,v7.8b}, [x0], x1
+    ld1 {v8.8b,v9.8b,v10.8b,v11.8b}, [x0], #32
+    ld1 {v12.8b,v13.8b,v14.8b,v15.8b}, [x0], x1
+
+    //load the matching two rows of p_pred into v16-v31
+    //ld1 {v16.8h-v31.8h}, [x2], x3
+    ld1 {v16.8b,v17.8b,v18.8b,v19.8b}, [x2], #32
+    ld1 {v20.8b,v21.8b,v22.8b,v23.8b}, [x2], x3
+    ld1 {v24.8b,v25.8b,v26.8b,v27.8b}, [x2], #32
+    ld1 {v28.8b,v29.8b,v30.8b,v31.8b}, [x2], x3
+
+    uxtl v0.8h, v0.8b               // zero-extend every byte vector to 16-bit lanes
+    uxtl v1.8h, v1.8b
+    uxtl v2.8h, v2.8b
+    uxtl v3.8h, v3.8b
+    uxtl v4.8h, v4.8b
+    uxtl v5.8h, v5.8b
+    uxtl v6.8h, v6.8b
+    uxtl v7.8h, v7.8b
+    uxtl v8.8h, v8.8b
+    uxtl v9.8h, v9.8b
+    uxtl v10.8h, v10.8b
+    uxtl v11.8h, v11.8b
+    uxtl v12.8h, v12.8b
+    uxtl v13.8h, v13.8b
+    uxtl v14.8h, v14.8b
+    uxtl v15.8h, v15.8b
+    uxtl v16.8h, v16.8b
+    uxtl v17.8h, v17.8b
+    uxtl v18.8h, v18.8b
+    uxtl v19.8h, v19.8b
+    uxtl v20.8h, v20.8b
+    uxtl v21.8h, v21.8b
+    uxtl v22.8h, v22.8b
+    uxtl v23.8h, v23.8b
+    uxtl v24.8h, v24.8b
+    uxtl v25.8h, v25.8b
+    uxtl v26.8h, v26.8b
+    uxtl v27.8h, v27.8b
+    uxtl v28.8h, v28.8b
+    uxtl v29.8h, v29.8b
+    uxtl v30.8h, v30.8b
+    uxtl v31.8h, v31.8b
+
+    //subtract: org - pred in 16-bit lanes
+    sub v0.8h, v0.8h, v16.8h
+    sub v1.8h, v1.8h, v17.8h
+    sub v2.8h, v2.8h, v18.8h
+    sub v3.8h, v3.8h, v19.8h
+    sub v4.8h, v4.8h, v20.8h
+    sub v5.8h, v5.8h, v21.8h
+    sub v6.8h, v6.8h, v22.8h
+    sub v7.8h, v7.8h, v23.8h
+    sub v8.8h, v8.8h, v24.8h
+    sub v9.8h, v9.8h, v25.8h
+    sub v10.8h, v10.8h, v26.8h
+    sub v11.8h, v11.8h, v27.8h
+    sub v12.8h, v12.8h, v28.8h
+    sub v13.8h, v13.8h, v29.8h
+    sub v14.8h, v14.8h, v30.8h
+    sub v15.8h, v15.8h, v31.8h
+
+    //p_resi[i] = p_org[i] - p_pred[i]
+    st1 {v0.8h,v1.8h,v2.8h,v3.8h}, [x4], #64
+    st1 {v4.8h,v5.8h,v6.8h,v7.8h}, [x4], x5
+    st1 {v8.8h,v9.8h,v10.8h,v11.8h}, [x4], #64
+    st1 {v12.8h,v13.8h,v14.8h,v15.8h}, [x4], x5
+
+    sub x6,x6,#2                    // two rows are consumed per pass
+    cmp x6,#0
+    bgt diff_w64_loopx
+    b diff_w64_end                  // branches to the label that immediately follows
+
+diff_w64_end:
+
+    ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64    // reload v8-v15 in reverse order of the spill
+    ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
+
+    ret
+
+/******************************************************************************************************
+* void uavs3e_pel_diff_128_arm64(pel *p_org, int i_org, pel *p_pred, int i_pred, s16 *p_resi, int i_resi, int height)
+* p_org->x0, i_org->x1, p_pred->x2, i_pred->x3, p_resi->x4, i_resi->x5, height->x6
+******************************************************************************************************/
+
+function uavs3e_pel_diff_128_arm64
+
+    sub sp, sp, #64                 // v8-v15 are overwritten by the loop, so spill them first
+    st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp]
+    sub sp, sp, #64
+    st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp]
+
+    lsl x5, x5, #1                  // residual stride in bytes (s16 output)
+
+    sub x1, x1, #96                 // three #32 post-increments per row already advance x0 by 96
+    sub x3, x3, #96                 // same for x2
+    sub x5, x5, #192                // three #64 post-increments per row already advance x4 by 192
+
+diff_w128_loopx:
+
+    //load one 128-byte row of p_org as four 32-byte pieces
+    //ld4 {v0.8h-v15.8h}, [x0], x1
+    ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [x0], #32
+    ld1 {v4.8b,v5.8b,v6.8b,v7.8b}, [x0], #32
+    ld1 {v8.8b,v9.8b,v10.8b,v11.8b}, [x0], #32
+    ld1 {v12.8b,v13.8b,v14.8b,v15.8b}, [x0], x1
+
+    //load the matching row of p_pred into v16-v31
+    //ld1 {v16.8h-v31.8h}, [x2], x3
+    ld1 {v16.8b,v17.8b,v18.8b,v19.8b}, [x2], #32
+    ld1 {v20.8b,v21.8b,v22.8b,v23.8b}, [x2], #32
+    ld1 {v24.8b,v25.8b,v26.8b,v27.8b}, [x2], #32
+    ld1 {v28.8b,v29.8b,v30.8b,v31.8b}, [x2], x3
+
+    uxtl v0.8h, v0.8b               // zero-extend every byte vector to 16-bit lanes
+    uxtl v1.8h, v1.8b
+    uxtl v2.8h, v2.8b
+    uxtl v3.8h, v3.8b
+    uxtl v4.8h, v4.8b
+    uxtl v5.8h, v5.8b
+    uxtl v6.8h, v6.8b
+    uxtl v7.8h, v7.8b
+    uxtl v8.8h, v8.8b
+    uxtl v9.8h, v9.8b
+    uxtl v10.8h, v10.8b
+    uxtl v11.8h, v11.8b
+    uxtl v12.8h, v12.8b
+    uxtl v13.8h, v13.8b
+    uxtl v14.8h, v14.8b
+    uxtl v15.8h, v15.8b
+    uxtl v16.8h, v16.8b
+    uxtl v17.8h, v17.8b
+    uxtl v18.8h, v18.8b
+    uxtl v19.8h, v19.8b
+    uxtl v20.8h, v20.8b
+    uxtl v21.8h, v21.8b
+    uxtl v22.8h, v22.8b
+    uxtl v23.8h, v23.8b
+    uxtl v24.8h, v24.8b
+    uxtl v25.8h, v25.8b
+    uxtl v26.8h, v26.8b
+    uxtl v27.8h, v27.8b
+    uxtl v28.8h, v28.8b
+    uxtl v29.8h, v29.8b
+    uxtl v30.8h, v30.8b
+    uxtl v31.8h, v31.8b
+
+    //subtract: org - pred in 16-bit lanes
+    sub v0.8h, v0.8h, v16.8h
+    sub v1.8h, v1.8h, v17.8h
+    sub v2.8h, v2.8h, v18.8h
+    sub v3.8h, v3.8h, v19.8h
+    sub v4.8h, v4.8h, v20.8h
+    sub v5.8h, v5.8h, v21.8h
+    sub v6.8h, v6.8h, v22.8h
+    sub v7.8h, v7.8h, v23.8h
+    sub v8.8h, v8.8h, v24.8h
+    sub v9.8h, v9.8h, v25.8h
+    sub v10.8h, v10.8h, v26.8h
+    sub v11.8h, v11.8h, v27.8h
+    sub v12.8h, v12.8h, v28.8h
+    sub v13.8h, v13.8h, v29.8h
+    sub v14.8h, v14.8h, v30.8h
+    sub v15.8h, v15.8h, v31.8h
+
+    //p_resi[i] = p_org[i] - p_pred[i]
+    st1 {v0.8h,v1.8h,v2.8h,v3.8h}, [x4], #64
+    st1 {v4.8h,v5.8h,v6.8h,v7.8h}, [x4], #64
+    st1 {v8.8h,v9.8h,v10.8h,v11.8h}, [x4], #64
+    st1 {v12.8h,v13.8h,v14.8h,v15.8h}, [x4], x5
+
+    sub x6,x6,#1                    // one row is consumed per pass
+    cmp x6,#0
+    bgt diff_w128_loopx
+    b diff_w128_end                 // branches to the label that immediately follows
+
+diff_w128_end:
+
+    ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64    // reload v8-v15 in reverse order of the spill
+    ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
+
+    ret
+
 #else
 /******************************************************************************************************
 * void uavs3e_recon_w4_arm64(s16 *resi, pel *pred, int i_pred, int width, int height, pel *rec, int i_rec, int cbf, int bit_depth)
@@ -552,6 +1036,366 @@ recon_w32_end:
 
     ret
 
+/******************************************************************************************************
+* void uavs3e_pel_diff_4_arm64(pel *p_org, int i_org, pel *p_pred, int i_pred, s16 *p_resi, int i_resi, int height)
+* p_org->x0, i_org->x1, p_pred->x2, i_pred->x3, p_resi->x4, i_resi->x5, height->x6
+******************************************************************************************************/
+
+function uavs3e_pel_diff_4_arm64
+
+    lsl x1, x1, #1                  // halfword-pel variant: all three strides are doubled to get bytes
+    lsl x3, x3, #1
+    lsl x5, x5, #1
+
+diff_w4_loopx:
+
+    //load four rows of four halfwords from p_org, one row per register
+    //ld4 {v0.4h,v1.4h,v2.4h,v3.4h}, [x0], x1
+    ld1 {v0.4h}, [x0], x1
+    ld1 {v1.4h}, [x0], x1
+    ld1 {v2.4h}, [x0], x1
+    ld1 {v3.4h}, [x0], x1
+
+    //load the matching four rows of p_pred
+    //ld1 {v4.4h,v5.4h,v6.4h,v7.4h,}, [x2], x3
+    ld1 {v4.4h}, [x2], x3
+    ld1 {v5.4h}, [x2], x3
+    ld1 {v6.4h}, [x2], x3
+    ld1 {v7.4h}, [x2], x3
+
+    //subtract: org - pred, no widening step in this variant
+    sub v0.4h, v0.4h, v4.4h
+    sub v1.4h, v1.4h, v5.4h
+    sub v2.4h, v2.4h, v6.4h
+    sub v3.4h, v3.4h, v7.4h
+
+    //p_resi[i] = p_org[i] - p_pred[i]
+    st1 {v0.4h}, [x4], x5
+    st1 {v1.4h}, [x4], x5
+    st1 {v2.4h}, [x4], x5
+    st1 {v3.4h}, [x4], x5
+
+    sub x6,x6,#4                    // four rows are consumed per pass
+    cmp x6,#0                       // NOTE(review): int args (strides, height) are used as full x-registers with no sxtw - confirm the upper 32 bits are clean
+    bgt diff_w4_loopx
+    b diff_w4_end                   // branches to the label that immediately follows
+
+diff_w4_end:
+    ret
+
+/******************************************************************************************************
+* void uavs3e_pel_diff_8_arm64(pel *p_org, int i_org, pel *p_pred, int i_pred, s16 *p_resi, int i_resi, int height)
+* p_org->x0, i_org->x1, p_pred->x2, i_pred->x3, p_resi->x4, i_resi->x5, height->x6
+******************************************************************************************************/
+
+function uavs3e_pel_diff_8_arm64
+
+    lsl x1, x1, #1                  // halfword-pel variant: all three strides are doubled to get bytes
+    lsl x3, x3, #1
+    lsl x5, x5, #1
+
+diff_w8_loopx:
+
+    //load four rows of eight halfwords from p_org, one row per register
+    //ld4 {v0.8h,v1.8h,v2.8h,v3.8h}, [x0], x1
+    ld1 {v0.8h}, [x0], x1
+    ld1 {v1.8h}, [x0], x1
+    ld1 {v2.8h}, [x0], x1
+    ld1 {v3.8h}, [x0], x1
+
+    //load the matching four rows of p_pred
+    //ld1 {v4.8h,v5.8h,v6.8h,v7.8h,}, [x2], x3
+    ld1 {v4.8h}, [x2], x3
+    ld1 {v5.8h}, [x2], x3
+    ld1 {v6.8h}, [x2], x3
+    ld1 {v7.8h}, [x2], x3
+
+    //subtract: org - pred
+    sub v0.8h, v0.8h, v4.8h
+    sub v1.8h, v1.8h, v5.8h
+    sub v2.8h, v2.8h, v6.8h
+    sub v3.8h, v3.8h, v7.8h
+
+    //p_resi[i] = p_org[i] - p_pred[i]
+    st1 {v0.8h}, [x4], x5
+    st1 {v1.8h}, [x4], x5
+    st1 {v2.8h}, [x4], x5
+    st1 {v3.8h}, [x4], x5
+
+    sub x6,x6,#4                    // four rows are consumed per pass
+    cmp x6,#0
+    bgt diff_w8_loopx
+    b diff_w8_end                   // branches to the label that immediately follows
+
+diff_w8_end:
+    ret
+
+/******************************************************************************************************
+* void uavs3e_pel_diff_16_arm64(pel *p_org, int i_org, pel *p_pred, int i_pred, s16 *p_resi, int i_resi, int height)
+* p_org->x0, i_org->x1, p_pred->x2, i_pred->x3, p_resi->x4, i_resi->x5, height->x6
+******************************************************************************************************/
+
+function uavs3e_pel_diff_16_arm64
+
+    lsl x1, x1, #1                  // halfword-pel variant: all three strides are doubled to get bytes
+    lsl x3, x3, #1
+    lsl x5, x5, #1
+
+diff_w16_loopx:
+
+    //load four rows of sixteen halfwords from p_org, two registers per row
+    //ld4 {v0.8h,v1.8h,v2.8h,v3.8h,v4.8h,v5.8h,v6.8h,v7.8h}, [x0], x1
+    ld1 {v0.8h,v1.8h}, [x0], x1
+    ld1 {v2.8h,v3.8h}, [x0], x1
+    ld1 {v4.8h,v5.8h}, [x0], x1
+    ld1 {v6.8h,v7.8h}, [x0], x1
+
+    //load the matching four rows of p_pred into v16-v23
+    //ld1 {v16.8h,v17.8h,v18.8h,v19.8h,v20.8h,v21.8h,v22.8h,v23.8h}, [x2], x3
+    ld1 {v16.8h,v17.8h}, [x2], x3
+    ld1 {v18.8h,v19.8h}, [x2], x3
+    ld1 {v20.8h,v21.8h}, [x2], x3
+    ld1 {v22.8h,v23.8h}, [x2], x3
+
+    //subtract: org - pred
+    sub v0.8h, v0.8h, v16.8h
+    sub v1.8h, v1.8h, v17.8h
+    sub v2.8h, v2.8h, v18.8h
+    sub v3.8h, v3.8h, v19.8h
+    sub v4.8h, v4.8h, v20.8h
+    sub v5.8h, v5.8h, v21.8h
+    sub v6.8h, v6.8h, v22.8h
+    sub v7.8h, v7.8h, v23.8h
+
+    //p_resi[i] = p_org[i] - p_pred[i]
+    st1 {v0.8h,v1.8h}, [x4], x5
+    st1 {v2.8h,v3.8h}, [x4], x5
+    st1 {v4.8h,v5.8h}, [x4], x5
+    st1 {v6.8h,v7.8h}, [x4], x5
+
+    sub x6,x6,#4                    // four rows are consumed per pass
+    cmp x6,#0
+    bgt diff_w16_loopx
+    b diff_w16_end                  // branches to the label that immediately follows
+
+diff_w16_end:
+    ret
+
+/******************************************************************************************************
+* void uavs3e_pel_diff_32_arm64(pel *p_org, int i_org, pel *p_pred, int i_pred, s16 *p_resi, int i_resi, int height)
+* p_org->x0, i_org->x1, p_pred->x2, i_pred->x3, p_resi->x4, i_resi->x5, height->x6
+******************************************************************************************************/
+
+function uavs3e_pel_diff_32_arm64
+
+    sub sp, sp, #64                 // v8-v15 are overwritten by the loop, so spill them first
+    st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp]
+    sub sp, sp, #64
+    st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp]
+
+    lsl x1, x1, #1                  // halfword-pel variant: all three strides are doubled to get bytes
+    lsl x3, x3, #1
+    lsl x5, x5, #1
+
+diff_w32_loopx:
+
+    //load four 64-byte rows of p_org into v0-v15
+    //ld4 {v0.8h-v15.8h}, [x0], x1
+    ld1 {v0.8h,v1.8h,v2.8h,v3.8h}, [x0], x1
+    ld1 {v4.8h,v5.8h,v6.8h,v7.8h}, [x0], x1
+    ld1 {v8.8h,v9.8h,v10.8h,v11.8h}, [x0], x1
+    ld1 {v12.8h,v13.8h,v14.8h,v15.8h}, [x0], x1
+
+    //load the matching four rows of p_pred into v16-v31
+    //ld1 {v16.8h-v31.8h}, [x2], x3
+    ld1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x2], x3
+    ld1 {v20.8h,v21.8h,v22.8h,v23.8h}, [x2], x3
+    ld1 {v24.8h,v25.8h,v26.8h,v27.8h}, [x2], x3
+    ld1 {v28.8h,v29.8h,v30.8h,v31.8h}, [x2], x3
+
+    //subtract: org - pred
+    sub v0.8h, v0.8h, v16.8h
+    sub v1.8h, v1.8h, v17.8h
+    sub v2.8h, v2.8h, v18.8h
+    sub v3.8h, v3.8h, v19.8h
+    sub v4.8h, v4.8h, v20.8h
+    sub v5.8h, v5.8h, v21.8h
+    sub v6.8h, v6.8h, v22.8h
+    sub v7.8h, v7.8h, v23.8h
+    sub v8.8h, v8.8h, v24.8h
+    sub v9.8h, v9.8h, v25.8h
+    sub v10.8h, v10.8h, v26.8h
+    sub v11.8h, v11.8h, v27.8h
+    sub v12.8h, v12.8h, v28.8h
+    sub v13.8h, v13.8h, v29.8h
+    sub v14.8h, v14.8h, v30.8h
+    sub v15.8h, v15.8h, v31.8h
+
+    //p_resi[i] = p_org[i] - p_pred[i]
+    st1 {v0.8h,v1.8h,v2.8h,v3.8h}, [x4], x5
+    st1 {v4.8h,v5.8h,v6.8h,v7.8h}, [x4], x5
+    st1 {v8.8h,v9.8h,v10.8h,v11.8h}, [x4], x5
+    st1 {v12.8h,v13.8h,v14.8h,v15.8h}, [x4], x5
+
+    sub x6,x6,#4                    // four rows are consumed per pass
+    cmp x6,#0
+    bgt diff_w32_loopx
+    b diff_w32_end                  // branches to the label that immediately follows
+
+diff_w32_end:
+
+    ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64    // reload v8-v15 in reverse order of the spill
+    ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
+
+    ret
+
+/******************************************************************************************************
+* void uavs3e_pel_diff_64_arm64(pel *p_org, int i_org, pel *p_pred, int i_pred, s16 *p_resi, int i_resi, int height)
+* p_org->x0, i_org->x1, p_pred->x2, i_pred->x3, p_resi->x4, i_resi->x5, height->x6
+******************************************************************************************************/
+
+function uavs3e_pel_diff_64_arm64
+
+    sub sp, sp, #64                 // v8-v15 are overwritten by the loop, so spill them first
+    st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp]
+    sub sp, sp, #64
+    st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp]
+
+    lsl x1, x1, #1                  // halfword-pel variant: all three strides are doubled to get bytes
+    lsl x3, x3, #1
+    lsl x5, x5, #1
+
+    sub x1, x1, #64                 // the first half-row access already advances each pointer by 64
+    sub x3, x3, #64
+    sub x5, x5, #64
+
+diff_w64_loopx:
+
+    //load two 128-byte rows of p_org, each as two 64-byte halves
+    //ld4 {v0.8h-v15.8h}, [x0], x1
+    ld1 {v0.8h,v1.8h,v2.8h,v3.8h}, [x0], #64
+    ld1 {v4.8h,v5.8h,v6.8h,v7.8h}, [x0], x1
+    ld1 {v8.8h,v9.8h,v10.8h,v11.8h}, [x0], #64
+    ld1 {v12.8h,v13.8h,v14.8h,v15.8h}, [x0], x1
+
+    //load the matching two rows of p_pred into v16-v31
+    //ld1 {v16.8h-v31.8h}, [x2], x3
+    ld1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x2], #64
+    ld1 {v20.8h,v21.8h,v22.8h,v23.8h}, [x2], x3
+    ld1 {v24.8h,v25.8h,v26.8h,v27.8h}, [x2], #64
+    ld1 {v28.8h,v29.8h,v30.8h,v31.8h}, [x2], x3
+
+    //subtract: org - pred
+    sub v0.8h, v0.8h, v16.8h
+    sub v1.8h, v1.8h, v17.8h
+    sub v2.8h, v2.8h, v18.8h
+    sub v3.8h, v3.8h, v19.8h
+    sub v4.8h, v4.8h, v20.8h
+    sub v5.8h, v5.8h, v21.8h
+    sub v6.8h, v6.8h, v22.8h
+    sub v7.8h, v7.8h, v23.8h
+    sub v8.8h, v8.8h, v24.8h
+    sub v9.8h, v9.8h, v25.8h
+    sub v10.8h, v10.8h, v26.8h
+    sub v11.8h, v11.8h, v27.8h
+    sub v12.8h, v12.8h, v28.8h
+    sub v13.8h, v13.8h, v29.8h
+    sub v14.8h, v14.8h, v30.8h
+    sub v15.8h, v15.8h, v31.8h
+
+    //p_resi[i] = p_org[i] - p_pred[i]
+    st1 {v0.8h,v1.8h,v2.8h,v3.8h}, [x4], #64
+    st1 {v4.8h,v5.8h,v6.8h,v7.8h}, [x4], x5
+    st1 {v8.8h,v9.8h,v10.8h,v11.8h}, [x4], #64
+    st1 {v12.8h,v13.8h,v14.8h,v15.8h}, [x4], x5
+
+    sub x6,x6,#2                    // two rows are consumed per pass
+    cmp x6,#0
+    bgt diff_w64_loopx
+    b diff_w64_end                  // branches to the label that immediately follows
+
+diff_w64_end:
+
+    ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64    // reload v8-v15 in reverse order of the spill
+    ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
+
+    ret
+
+/******************************************************************************************************
+* void uavs3e_pel_diff_128_arm64(pel *p_org, int i_org, pel *p_pred, int i_pred, s16 *p_resi, int i_resi, int height)
+* p_org->x0, i_org->x1, p_pred->x2, i_pred->x3, p_resi->x4, i_resi->x5, height->x6
+******************************************************************************************************/
+
+function uavs3e_pel_diff_128_arm64
+
+    sub sp, sp, #64                 // v8-v15 are overwritten by the loop, so spill them first
+    st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp]
+    sub sp, sp, #64
+    st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp]
+
+    lsl x1, x1, #1                  // halfword-pel variant: all three strides are doubled to get bytes
+    lsl x3, x3, #1
+    lsl x5, x5, #1
+
+    sub x1, x1, #192                // three #64 post-increments per row already advance each pointer by 192
+    sub x3, x3, #192
+    sub x5, x5, #192
+
+diff_w128_loopx:
+
+    //load one 256-byte row of p_org as four 64-byte pieces
+    //ld4 {v0.8h-v15.8h}, [x0], x1
+    ld1 {v0.8h,v1.8h,v2.8h,v3.8h}, [x0], #64
+    ld1 {v4.8h,v5.8h,v6.8h,v7.8h}, [x0], #64
+    ld1 {v8.8h,v9.8h,v10.8h,v11.8h}, [x0], #64
+    ld1 {v12.8h,v13.8h,v14.8h,v15.8h}, [x0], x1
+
+
+    //load the matching row of p_pred into v16-v31
+    //ld1 {v16.8h-v31.8h}, [x2], x3
+    ld1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x2], #64
+    ld1 {v20.8h,v21.8h,v22.8h,v23.8h}, [x2], #64
+    ld1 {v24.8h,v25.8h,v26.8h,v27.8h}, [x2], #64
+    ld1 {v28.8h,v29.8h,v30.8h,v31.8h}, [x2], x3
+
+
+    //subtract: org - pred
+    sub v0.8h, v0.8h, v16.8h
+    sub v1.8h, v1.8h, v17.8h
+    sub v2.8h, v2.8h, v18.8h
+    sub v3.8h, v3.8h, v19.8h
+    sub v4.8h, v4.8h, v20.8h
+    sub v5.8h, v5.8h, v21.8h
+    sub v6.8h, v6.8h, v22.8h
+    sub v7.8h, v7.8h, v23.8h
+    sub v8.8h, v8.8h, v24.8h
+    sub v9.8h, v9.8h, v25.8h
+    sub v10.8h, v10.8h, v26.8h
+    sub v11.8h, v11.8h, v27.8h
+    sub v12.8h, v12.8h, v28.8h
+    sub v13.8h, v13.8h, v29.8h
+    sub v14.8h, v14.8h, v30.8h
+    sub v15.8h, v15.8h, v31.8h
+
+    //p_resi[i] = p_org[i] - p_pred[i]
+    st1 {v0.8h,v1.8h,v2.8h,v3.8h}, [x4], #64
+    st1 {v4.8h,v5.8h,v6.8h,v7.8h}, [x4], #64
+    st1 {v8.8h,v9.8h,v10.8h,v11.8h}, [x4], #64
+    st1 {v12.8h,v13.8h,v14.8h,v15.8h}, [x4], x5
+
+
+    sub x6,x6,#1                    // one row is consumed per pass
+    cmp x6,#0
+    bgt diff_w128_loopx
+    b diff_w128_end                 // branches to the label that immediately follows
+
+diff_w128_end:
+
+    ld1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64    // reload v8-v15 in reverse order of the spill
+    ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64
+
+    ret
+
 #endif
 
 #endif
\ No newline at end of file