From 91acfa7bcc0ee76768fe17347724534296a18858 Mon Sep 17 00:00:00 2001 From: dujiangpku <85744777+dujiangpku@users.noreply.github.com> Date: Thu, 30 Dec 2021 14:45:44 +0800 Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E5=8F=98=E6=8D=A2dct2=2010bi?= =?UTF-8?q?t=E6=B1=87=E7=BC=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../app/src/main/jni/src/armv8/arm64.c | 54 +- .../src/main/jni/src/armv8/trans_dct2_arm64.S | 887 ++++++++++++++++++ .../src/main/jni/src/armv8/transform_arm64.c | 4 - 3 files changed, 914 insertions(+), 31 deletions(-) diff --git a/build/android/app/src/main/jni/src/armv8/arm64.c b/build/android/app/src/main/jni/src/armv8/arm64.c index eb24439..d26f5a9 100644 --- a/build/android/app/src/main/jni/src/armv8/arm64.c +++ b/build/android/app/src/main/jni/src/armv8/arm64.c @@ -51,33 +51,33 @@ void uavs3e_funs_init_arm64() uavs3e_funs_handle.alf = uavs3e_alf_filter_block_arm64; #else - //uavs3e_funs_handle.trans_dct2[1][1] = uavs3e_trans_dct2_w4_h4_arm64; - //uavs3e_funs_handle.trans_dct2[1][2] = uavs3e_trans_dct2_w4_h8_arm64; - //uavs3e_funs_handle.trans_dct2[1][3] = uavs3e_trans_dct2_w4_h16_arm64; - //uavs3e_funs_handle.trans_dct2[1][4] = uavs3e_trans_dct2_w4_h32_arm64; - - //uavs3e_funs_handle.trans_dct2[2][1] = uavs3e_trans_dct2_w8_h4_arm64; - //uavs3e_funs_handle.trans_dct2[2][2] = uavs3e_trans_dct2_w8_h8_arm64; - //uavs3e_funs_handle.trans_dct2[2][3] = uavs3e_trans_dct2_w8_h16_arm64; - //uavs3e_funs_handle.trans_dct2[2][4] = uavs3e_trans_dct2_w8_h32_arm64; - //uavs3e_funs_handle.trans_dct2[2][5] = uavs3e_trans_dct2_w8_h64_arm64; - - //uavs3e_funs_handle.trans_dct2[3][1] = uavs3e_trans_dct2_w16_h16_arm64; - //uavs3e_funs_handle.trans_dct2[3][2] = uavs3e_trans_dct2_w16_h8_arm64; - //uavs3e_funs_handle.trans_dct2[3][3] = uavs3e_trans_dct2_w16_h16_arm64; - //uavs3e_funs_handle.trans_dct2[3][4] = uavs3e_trans_dct2_w16_h32_arm64; - //uavs3e_funs_handle.trans_dct2[3][5] = 
uavs3e_trans_dct2_w16_h64_arm64; - - //uavs3e_funs_handle.trans_dct2[4][1] = uavs3e_trans_dct2_w32_h4_arm64; - //uavs3e_funs_handle.trans_dct2[4][2] = uavs3e_trans_dct2_w32_h8_arm64; - //uavs3e_funs_handle.trans_dct2[4][3] = uavs3e_trans_dct2_w32_h16_arm64; - //uavs3e_funs_handle.trans_dct2[4][4] = uavs3e_trans_dct2_w32_h32_arm64; - //uavs3e_funs_handle.trans_dct2[4][5] = uavs3e_trans_dct2_w32_h64_arm64; - - //uavs3e_funs_handle.trans_dct2[5][2] = uavs3e_trans_dct2_w64_h8_arm64; - //uavs3e_funs_handle.trans_dct2[5][3] = uavs3e_trans_dct2_w64_h16_arm64; - //uavs3e_funs_handle.trans_dct2[5][4] = uavs3e_trans_dct2_w64_h32_arm64; - //uavs3e_funs_handle.trans_dct2[5][5] = uavs3e_trans_dct2_w64_h64_arm64; + uavs3e_funs_handle.trans_dct2[1][1] = uavs3e_trans_dct2_w4_h4_arm64; + uavs3e_funs_handle.trans_dct2[1][2] = uavs3e_trans_dct2_w4_h8_arm64; + uavs3e_funs_handle.trans_dct2[1][3] = uavs3e_trans_dct2_w4_h16_arm64; + uavs3e_funs_handle.trans_dct2[1][4] = uavs3e_trans_dct2_w4_h32_arm64; + + uavs3e_funs_handle.trans_dct2[2][1] = uavs3e_trans_dct2_w8_h4_arm64; + uavs3e_funs_handle.trans_dct2[2][2] = uavs3e_trans_dct2_w8_h8_arm64; + uavs3e_funs_handle.trans_dct2[2][3] = uavs3e_trans_dct2_w8_h16_arm64; + uavs3e_funs_handle.trans_dct2[2][4] = uavs3e_trans_dct2_w8_h32_arm64; + uavs3e_funs_handle.trans_dct2[2][5] = uavs3e_trans_dct2_w8_h64_arm64; + + uavs3e_funs_handle.trans_dct2[3][1] = uavs3e_trans_dct2_w16_h4_arm64; + uavs3e_funs_handle.trans_dct2[3][2] = uavs3e_trans_dct2_w16_h8_arm64; + uavs3e_funs_handle.trans_dct2[3][3] = uavs3e_trans_dct2_w16_h16_arm64; + uavs3e_funs_handle.trans_dct2[3][4] = uavs3e_trans_dct2_w16_h32_arm64; + uavs3e_funs_handle.trans_dct2[3][5] = uavs3e_trans_dct2_w16_h64_arm64; + + uavs3e_funs_handle.trans_dct2[4][1] = uavs3e_trans_dct2_w32_h4_arm64; + uavs3e_funs_handle.trans_dct2[4][2] = uavs3e_trans_dct2_w32_h8_arm64; + uavs3e_funs_handle.trans_dct2[4][3] = uavs3e_trans_dct2_w32_h16_arm64; + uavs3e_funs_handle.trans_dct2[4][4] = 
uavs3e_trans_dct2_w32_h32_arm64; + uavs3e_funs_handle.trans_dct2[4][5] = uavs3e_trans_dct2_w32_h64_arm64; + + uavs3e_funs_handle.trans_dct2[5][2] = uavs3e_trans_dct2_w64_h8_arm64; + uavs3e_funs_handle.trans_dct2[5][3] = uavs3e_trans_dct2_w64_h16_arm64; + uavs3e_funs_handle.trans_dct2[5][4] = uavs3e_trans_dct2_w64_h32_arm64; + uavs3e_funs_handle.trans_dct2[5][5] = uavs3e_trans_dct2_w64_h64_arm64; uavs3e_funs_handle.itrans_dct2[1][1] = uavs3e_itrans_dct2_h4_w4_arm64; uavs3e_funs_handle.itrans_dct2[1][2] = uavs3e_itrans_dct2_h4_w8_arm64; diff --git a/build/android/app/src/main/jni/src/armv8/trans_dct2_arm64.S b/build/android/app/src/main/jni/src/armv8/trans_dct2_arm64.S index de24834..2239282 100644 --- a/build/android/app/src/main/jni/src/armv8/trans_dct2_arm64.S +++ b/build/android/app/src/main/jni/src/armv8/trans_dct2_arm64.S @@ -4,10 +4,897 @@ #if !COMPILE_10BIT #else + +//************************************************************************************************* +//void tx_dct2_pb4_arm64(s16 *src, s16 *dst, int line, int limit_line, int shift); +//x0: coeff blk, 16 bit +//x1: resi blk, 16 bit +//x2: blk width +//x3: limit_line +//x4: shift +//************************************************************************************************* function tx_dct2_pb4_arm64 + lsl x2, x2, #1 + + //transcode coeffs + mov w5, #32 + neg w6, w5 + mov w7, #42 + neg w8, w7 + mov w9, #17 + neg w10, w9 + mov v4.h[0], w5 + mov v4.h[1], w7 + mov v4.h[2], w5 + mov v4.h[3], w9 //32 42 32 17 + mov v5.h[0], w5 + mov v5.h[1], w9 + mov v5.h[2], w6 + mov v5.h[3], w8 //32 17 -32 -42 + mov v6.h[0], w5 + mov v6.h[1], w10 + mov v6.h[2], w6 + mov v6.h[3], w7 //32 -17 -32 42 + mov v7.h[0], w5 + mov v7.h[1], w8 + mov v7.h[2], w5 + mov v7.h[3], w10 //32 -42 32 -17 + + mov x6, #0 + +tx_dct2_pb4_loop: + //load src + ld1 {v0.4h - v3.4h}, [x0], #32 +// ld1 {v0.d}[0], [x0], x2 +// ld1 {v1.d}[0], [x0], x2 +// ld1 {v2.d}[0], [x0], x2 +// ld1 {v3.d}[0], [x0], x2 + + smull v16.4s, v4.4h, v0.h[0] + 
smlal v16.4s, v5.4h, v0.h[1] + smlal v16.4s, v6.4h, v0.h[2] + smlal v16.4s, v7.4h, v0.h[3] + smull v17.4s, v4.4h, v1.h[0] + smlal v17.4s, v5.4h, v1.h[1] + smlal v17.4s, v6.4h, v1.h[2] + smlal v17.4s, v7.4h, v1.h[3] + smull v18.4s, v4.4h, v2.h[0] + smlal v18.4s, v5.4h, v2.h[1] + smlal v18.4s, v6.4h, v2.h[2] + smlal v18.4s, v7.4h, v2.h[3] + smull v19.4s, v4.4h, v3.h[0] + smlal v19.4s, v5.4h, v3.h[1] + smlal v19.4s, v6.4h, v3.h[2] + smlal v19.4s, v7.4h, v3.h[3] /* v16..v19 = 32-bit sums, one register per src row */ + + cmp w4, #2 /* shift is a 32-bit int, so test w4 as the pb8/pb16/pb32 routines do; the upper half of x4 is unspecified */ + bne tx_dct2_pb4_shift7 /* any shift other than 2 takes the #7 path */ + sqrshrn v16.4h, v16.4s, #2 + sqrshrn v17.4h, v17.4s, #2 + sqrshrn v18.4h, v18.4s, #2 + sqrshrn v19.4h, v19.4s, #2 + b tx_dct2_pb4_end + +tx_dct2_pb4_shift7: + sqrshrn v16.4h, v16.4s, #7 + sqrshrn v17.4h, v17.4s, #7 + sqrshrn v18.4h, v18.4s, #7 + sqrshrn v19.4h, v19.4s, #7 + +tx_dct2_pb4_end: + add x5, x1, x6 /* x5 = dst plus the byte offset of this group of 4 columns */ + st4 {v16.h - v19.h}[0], [x5], x2 /* lane k of v16..v19 = output k of the 4 src rows, written as 4 consecutive halfwords, then x5 steps one dst row */ + st4 {v16.h - v19.h}[1], [x5], x2 + st4 {v16.h - v19.h}[2], [x5], x2 + st4 {v16.h - v19.h}[3], [x5], x2 + + add x6, x6, #8 /* next 4 dst columns (4 halfwords) */ + + subs w3, w3, #4 /* limit_line is a 32-bit int; 4 src rows are consumed per pass */ + bgt tx_dct2_pb4_loop ret +tx_dct2_pb8_coef: /* 8 rows of 8 weights; row j is multiplied by sample j of each src row */ +.hword 32, 44, 42, 38, 32, 25, 17, 9, \ + 32, 38, 17, -9, -32, -44, -42, -25, \ + 32, 25, -17, -44, -32, 9, 42, 38, \ + 32, 9, -42, -25, 32, 38, -17, -44, \ + 32, -9, -42, 25, 32, -38, -17, 44, \ + 32, -25, -17, 44, -32, -9, 42, -38, \ + 32, -38, 17, 9, -32, 44, -42, 25, \ + 32, -44, 42, -38, 32, -25, 17, -9 + +//************************************************************************************************* +//void tx_dct2_pb8_arm64(s16 *src, s16 *dst, int line, int limit_line, int shift); +//x0: src, 16 bit, read as consecutive rows of 8 samples +//x1: dst, 16 bit, written transposed: the 8 outputs of a row go down one dst column +//x2: line, dst row stride in elements (doubled to bytes on entry) +//x3: limit_line, number of src rows to process, 4 per loop pass +//x4: shift, 3 selects the #3 narrowing shift, any other value selects #8 +//************************************************************************************************* +function tx_dct2_pb8_arm64 + lsl w2, w2, #1 /* line is a 32-bit int; the w-form write zero-extends, so x2 = dst row stride in bytes */ + mov x6, #0 /* byte offset of the current group of 4 dst columns */ + + adr x7, tx_dct2_pb8_coef + ld1 {v16.8h - v19.8h}, [x7], #64 + ld1 {v20.8h - v23.8h}, [x7] /* v16..v23 = the 8 coefficient rows, loaded once for all passes */ + +tx_dct2_pb8_loop: + //load src + ld1 {v0.8h - v3.8h}, [x0], #64 /* 4 src rows of 8 samples */ + + smull v24.4s, v16.4h, v0.h[0] + smull2 
v25.4s, v16.8h, v0.h[0] + smlal v24.4s, v17.4h, v0.h[1] + smlal2 v25.4s, v17.8h, v0.h[1] + smlal v24.4s, v18.4h, v0.h[2] + smlal2 v25.4s, v18.8h, v0.h[2] + smlal v24.4s, v19.4h, v0.h[3] + smlal2 v25.4s, v19.8h, v0.h[3] + smlal v24.4s, v20.4h, v0.h[4] + smlal2 v25.4s, v20.8h, v0.h[4] + smlal v24.4s, v21.4h, v0.h[5] + smlal2 v25.4s, v21.8h, v0.h[5] + smlal v24.4s, v22.4h, v0.h[6] + smlal2 v25.4s, v22.8h, v0.h[6] + smlal v24.4s, v23.4h, v0.h[7] + smlal2 v25.4s, v23.8h, v0.h[7] + + smull v26.4s, v16.4h, v1.h[0] + smull2 v27.4s, v16.8h, v1.h[0] + smlal v26.4s, v17.4h, v1.h[1] + smlal2 v27.4s, v17.8h, v1.h[1] + smlal v26.4s, v18.4h, v1.h[2] + smlal2 v27.4s, v18.8h, v1.h[2] + smlal v26.4s, v19.4h, v1.h[3] + smlal2 v27.4s, v19.8h, v1.h[3] + smlal v26.4s, v20.4h, v1.h[4] + smlal2 v27.4s, v20.8h, v1.h[4] + smlal v26.4s, v21.4h, v1.h[5] + smlal2 v27.4s, v21.8h, v1.h[5] + smlal v26.4s, v22.4h, v1.h[6] + smlal2 v27.4s, v22.8h, v1.h[6] + smlal v26.4s, v23.4h, v1.h[7] + smlal2 v27.4s, v23.8h, v1.h[7] + + smull v28.4s, v16.4h, v2.h[0] + smull2 v29.4s, v16.8h, v2.h[0] + smlal v28.4s, v17.4h, v2.h[1] + smlal2 v29.4s, v17.8h, v2.h[1] + smlal v28.4s, v18.4h, v2.h[2] + smlal2 v29.4s, v18.8h, v2.h[2] + smlal v28.4s, v19.4h, v2.h[3] + smlal2 v29.4s, v19.8h, v2.h[3] + smlal v28.4s, v20.4h, v2.h[4] + smlal2 v29.4s, v20.8h, v2.h[4] + smlal v28.4s, v21.4h, v2.h[5] + smlal2 v29.4s, v21.8h, v2.h[5] + smlal v28.4s, v22.4h, v2.h[6] + smlal2 v29.4s, v22.8h, v2.h[6] + smlal v28.4s, v23.4h, v2.h[7] + smlal2 v29.4s, v23.8h, v2.h[7] + + smull v30.4s, v16.4h, v3.h[0] + smull2 v31.4s, v16.8h, v3.h[0] + smlal v30.4s, v17.4h, v3.h[1] + smlal2 v31.4s, v17.8h, v3.h[1] + smlal v30.4s, v18.4h, v3.h[2] + smlal2 v31.4s, v18.8h, v3.h[2] + smlal v30.4s, v19.4h, v3.h[3] + smlal2 v31.4s, v19.8h, v3.h[3] + smlal v30.4s, v20.4h, v3.h[4] + smlal2 v31.4s, v20.8h, v3.h[4] + smlal v30.4s, v21.4h, v3.h[5] + smlal2 v31.4s, v21.8h, v3.h[5] + smlal v30.4s, v22.4h, v3.h[6] + smlal2 v31.4s, v22.8h, v3.h[6] + smlal v30.4s, 
v23.4h, v3.h[7] + smlal2 v31.4s, v23.8h, v3.h[7] + + cmp w4, #3 + bne tx_dct2_pb8_shift8 + sqrshrn v0.4h, v24.4s, #3 + sqrshrn v1.4h, v26.4s, #3 + sqrshrn v2.4h, v28.4s, #3 + sqrshrn v3.4h, v30.4s, #3 + sqrshrn v4.4h, v25.4s, #3 + sqrshrn v5.4h, v27.4s, #3 + sqrshrn v6.4h, v29.4s, #3 + sqrshrn v7.4h, v31.4s, #3 + b tx_dct2_pb8_end + +tx_dct2_pb8_shift8: + sqrshrn v0.4h, v24.4s, #8 + sqrshrn v1.4h, v26.4s, #8 + sqrshrn v2.4h, v28.4s, #8 + sqrshrn v3.4h, v30.4s, #8 + sqrshrn v4.4h, v25.4s, #8 + sqrshrn v5.4h, v27.4s, #8 + sqrshrn v6.4h, v29.4s, #8 + sqrshrn v7.4h, v31.4s, #8 + +tx_dct2_pb8_end: + add x5, x1, x6 + st4 {v0.h - v3.h}[0], [x5], x2 + st4 {v0.h - v3.h}[1], [x5], x2 + st4 {v0.h - v3.h}[2], [x5], x2 + st4 {v0.h - v3.h}[3], [x5], x2 + st4 {v4.h - v7.h}[0], [x5], x2 + st4 {v4.h - v7.h}[1], [x5], x2 + st4 {v4.h - v7.h}[2], [x5], x2 + st4 {v4.h - v7.h}[3], [x5], x2 + add x6, x6, #8 + + subs x3, x3, #4 + bgt tx_dct2_pb8_loop + + ret + + +tx_dct2_pb16_coef: +.hword 32, 45, 44, 43, 42, 40, 38, 35, 32, 29, 25, 21, 17, 13, 9, 4, \ + 32, 43, 38, 29, 17, 4, -9, -21, -32, -40, -44, -45, -42, -35, -25, -13, \ + 32, 40, 25, 4, -17, -35, -44, -43, -32, -13, 9, 29, 42, 45, 38, 21, \ + 32, 35, 9, -21, -42, -43, -25, 4, 32, 45, 38, 13, -17, -40, -44, -29, \ + 32, 29, -9, -40, -42, -13, 25, 45, 32, -4, -38, -43, -17, 21, 44, 35, \ + 32, 21, -25, -45, -17, 29, 44, 13, -32, -43, -9, 35, 42, 4, -38, -40, \ + 32, 13, -38, -35, 17, 45, 9, -40, -32, 21, 44, 4, -42, -29, 25, 43, \ + 32, 4, -44, -13, 42, 21, -38, -29, 32, 35, -25, -40, 17, 43, -9, -45, \ + 32, -4, -44, 13, 42, -21, -38, 29, 32, -35, -25, 40, 17, -43, -9, 45, \ + 32, -13, -38, 35, 17, -45, 9, 40, -32, -21, 44, -4, -42, 29, 25, -43, \ + 32, -21, -25, 45, -17, -29, 44, -13, -32, 43, -9, -35, 42, -4, -38, 40, \ + 32, -29, -9, 40, -42, 13, 25, -45, 32, 4, -38, 43, -17, -21, 44, -35, \ + 32, -35, 9, 21, -42, 43, -25, -4, 32, -45, 38, -13, -17, 40, -44, 29, \ + 32, -40, 25, -4, -17, 35, -44, 43, -32, 13, 9, -29, 42, -45, 38, 
-21, \ + 32, -43, 38, -29, 17, -4, -9, 21, -32, 40, -44, 45, -42, 35, -25, 13, \ + 32, -45, 44, -43, 42, -40, 38, -35, 32, -29, 25, -21, 17, -13, 9, -4 + +//************************************************************************************************* +//void tx_dct2_pb16_arm64(s16 *src, s16 *dst, int line, int limit_line, int shift); +//x0: coeff blk, 16 bit +//x1: resi blk, 16 bit +//x2: blk width +//x3: limit_line +//x4: shift +//************************************************************************************************* +function tx_dct2_pb16_arm64 + lsl x2, x2, #1 + adr x7, tx_dct2_pb16_coef + mov x12, #32 //i_src + +tx_dct2_pb16_loopk: + mov x8, #0 + mov x11, x1 +tx_dct2_pb16_loopj: + mov x5, #0 + movi v24.16b, #0 + movi v25.16b, #0 + movi v26.16b, #0 + movi v27.16b, #0 + movi v28.16b, #0 + movi v29.16b, #0 + movi v30.16b, #0 + movi v31.16b, #0 +tx_dct2_pb16_loopi: + //load src + add x6, x0, x5 + ld1 {v0.8h}, [x6], x12 + ld1 {v1.8h}, [x6], x12 + ld1 {v2.8h}, [x6], x12 + ld1 {v3.8h}, [x6], x12 + + add x9, x7, x8 + ld1 {v16.8h}, [x9], x12 + ld1 {v17.8h}, [x9], x12 + ld1 {v18.8h}, [x9], x12 + ld1 {v19.8h}, [x9], x12 + ld1 {v20.8h}, [x9], x12 + ld1 {v21.8h}, [x9], x12 + ld1 {v22.8h}, [x9], x12 + ld1 {v23.8h}, [x9], x12 + + smlal v24.4s, v16.4h, v0.h[0] + smlal2 v25.4s, v16.8h, v0.h[0] + smlal v24.4s, v17.4h, v0.h[1] + smlal2 v25.4s, v17.8h, v0.h[1] + smlal v24.4s, v18.4h, v0.h[2] + smlal2 v25.4s, v18.8h, v0.h[2] + smlal v24.4s, v19.4h, v0.h[3] + smlal2 v25.4s, v19.8h, v0.h[3] + smlal v24.4s, v20.4h, v0.h[4] + smlal2 v25.4s, v20.8h, v0.h[4] + smlal v24.4s, v21.4h, v0.h[5] + smlal2 v25.4s, v21.8h, v0.h[5] + smlal v24.4s, v22.4h, v0.h[6] + smlal2 v25.4s, v22.8h, v0.h[6] + smlal v24.4s, v23.4h, v0.h[7] + smlal2 v25.4s, v23.8h, v0.h[7] + + smlal v26.4s, v16.4h, v1.h[0] + smlal2 v27.4s, v16.8h, v1.h[0] + smlal v26.4s, v17.4h, v1.h[1] + smlal2 v27.4s, v17.8h, v1.h[1] + smlal v26.4s, v18.4h, v1.h[2] + smlal2 v27.4s, v18.8h, v1.h[2] + smlal v26.4s, v19.4h, v1.h[3] 
+ smlal2 v27.4s, v19.8h, v1.h[3] + smlal v26.4s, v20.4h, v1.h[4] + smlal2 v27.4s, v20.8h, v1.h[4] + smlal v26.4s, v21.4h, v1.h[5] + smlal2 v27.4s, v21.8h, v1.h[5] + smlal v26.4s, v22.4h, v1.h[6] + smlal2 v27.4s, v22.8h, v1.h[6] + smlal v26.4s, v23.4h, v1.h[7] + smlal2 v27.4s, v23.8h, v1.h[7] /* v26/v27 = running sums for the second loaded src row (v1): first 4 and next 4 outputs */ + + smlal v28.4s, v16.4h, v2.h[0] + smlal2 v29.4s, v16.8h, v2.h[0] + smlal v28.4s, v17.4h, v2.h[1] + smlal2 v29.4s, v17.8h, v2.h[1] + smlal v28.4s, v18.4h, v2.h[2] + smlal2 v29.4s, v18.8h, v2.h[2] + smlal v28.4s, v19.4h, v2.h[3] + smlal2 v29.4s, v19.8h, v2.h[3] + smlal v28.4s, v20.4h, v2.h[4] + smlal2 v29.4s, v20.8h, v2.h[4] + smlal v28.4s, v21.4h, v2.h[5] + smlal2 v29.4s, v21.8h, v2.h[5] + smlal v28.4s, v22.4h, v2.h[6] + smlal2 v29.4s, v22.8h, v2.h[6] + smlal v28.4s, v23.4h, v2.h[7] + smlal2 v29.4s, v23.8h, v2.h[7] /* v28/v29 = the same pair of sums for the third src row (v2) */ + + smlal v30.4s, v16.4h, v3.h[0] + smlal2 v31.4s, v16.8h, v3.h[0] + smlal v30.4s, v17.4h, v3.h[1] + smlal2 v31.4s, v17.8h, v3.h[1] + smlal v30.4s, v18.4h, v3.h[2] + smlal2 v31.4s, v18.8h, v3.h[2] + smlal v30.4s, v19.4h, v3.h[3] + smlal2 v31.4s, v19.8h, v3.h[3] + smlal v30.4s, v20.4h, v3.h[4] + smlal2 v31.4s, v20.8h, v3.h[4] + smlal v30.4s, v21.4h, v3.h[5] + smlal2 v31.4s, v21.8h, v3.h[5] + smlal v30.4s, v22.4h, v3.h[6] + smlal2 v31.4s, v22.8h, v3.h[6] + smlal v30.4s, v23.4h, v3.h[7] + smlal2 v31.4s, v23.8h, v3.h[7] /* v30/v31 = the same pair of sums for the fourth src row (v3) */ + + add x5, x5, #16 /* advance the src column offset by 8 halfwords */ + add x8, x8, #256 /* advance the coefficient offset past the 8 rows just used (8 x 32 bytes) */ + cmp x5, #16 + beq tx_dct2_pb16_loopi /* x5 equals 16 only after the first pass, so loopi runs exactly twice (x5 = 0, then 16) */ + + cmp w4, #4 + bne tx_dct2_pb16_shift9 /* shift 4 narrows by #4; every other value takes the #9 path */ + sqrshrn v0.4h, v24.4s, #4 /* saturating, rounding narrow of the 32-bit sums to 16 bit; v0..v3 take the first-4-output sums of the 4 rows */ + sqrshrn v1.4h, v26.4s, #4 + sqrshrn v2.4h, v28.4s, #4 + sqrshrn v3.4h, v30.4s, #4 + sqrshrn v4.4h, v25.4s, #4 /* v4..v7 take the next-4-output sums of the 4 rows */ + sqrshrn v5.4h, v27.4s, #4 + sqrshrn v6.4h, v29.4s, #4 + sqrshrn v7.4h, v31.4s, #4 + b tx_dct2_pb16_end + +tx_dct2_pb16_shift9: + sqrshrn v0.4h, v24.4s, #9 + sqrshrn v1.4h, v26.4s, #9 + sqrshrn v2.4h, v28.4s, #9 + sqrshrn v3.4h, v30.4s, #9 + sqrshrn v4.4h, v25.4s, #9 + sqrshrn v5.4h, v27.4s, #9 + sqrshrn v6.4h, v29.4s, #9 + sqrshrn v7.4h, v31.4s, #9 + +tx_dct2_pb16_end: /* each by-lane st4 writes one output index for the 4 rows as 4 consecutive halfwords */ + st4 {v0.h - v3.h}[0], 
[x11], x2 + st4 {v0.h - v3.h}[1], [x11], x2 + st4 {v0.h - v3.h}[2], [x11], x2 + st4 {v0.h - v3.h}[3], [x11], x2 + st4 {v4.h - v7.h}[0], [x11], x2 + st4 {v4.h - v7.h}[1], [x11], x2 + st4 {v4.h - v7.h}[2], [x11], x2 + st4 {v4.h - v7.h}[3], [x11], x2 + + sub x8, x8, #496 + cmp x8, #16 + beq tx_dct2_pb16_loopj + + add x0, x0, #128 + add x1, x1, #8 + subs x3, x3, #4 + bgt tx_dct2_pb16_loopk + + ret + +tx_dct2_pb32_coef: +.hword 32, 45, 45, 45, 44, 44, 43, 43, 42, 41, 40, 39, 38, 36, 35, 34, 32, 30, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 4, 2, \ + 32, 45, 43, 41, 38, 34, 29, 23, 17, 11, 4, -2, -9, -15, -21, -27, -32, -36, -40, -43, -44, -45, -45, -44, -42, -39, -35, -30, -25, -19, -13, -7, \ + 32, 44, 40, 34, 25, 15, 4, -7, -17, -27, -35, -41, -44, -45, -43, -39, -32, -23, -13, -2, 9, 19, 29, 36, 42, 45, 45, 43, 38, 30, 21, 11, \ + 32, 43, 35, 23, 9, -7, -21, -34, -42, -45, -43, -36, -25, -11, 4, 19, 32, 41, 45, 44, 38, 27, 13, -2, -17, -30, -40, -45, -44, -39, -29, -15, \ + 32, 41, 29, 11, -9, -27, -40, -45, -42, -30, -13, 7, 25, 39, 45, 43, 32, 15, -4, -23, -38, -45, -43, -34, -17, 2, 21, 36, 44, 44, 35, 19, \ + 32, 39, 21, -2, -25, -41, -45, -36, -17, 7, 29, 43, 44, 34, 13, -11, -32, -44, -43, -30, -9, 15, 35, 45, 42, 27, 4, -19, -38, -45, -40, -23, \ + 32, 36, 13, -15, -38, -45, -35, -11, 17, 39, 45, 34, 9, -19, -40, -45, -32, -7, 21, 41, 44, 30, 4, -23, -42, -44, -29, -2, 25, 43, 43, 27, \ + 32, 34, 4, -27, -44, -39, -13, 19, 42, 43, 21, -11, -38, -45, -29, 2, 32, 45, 35, 7, -25, -44, -40, -15, 17, 41, 43, 23, -9, -36, -45, -30, \ + 32, 30, -4, -36, -44, -23, 13, 41, 42, 15, -21, -44, -38, -7, 29, 45, 32, -2, -35, -45, -25, 11, 40, 43, 17, -19, -43, -39, -9, 27, 45, 34, \ + 32, 27, -13, -43, -38, -2, 35, 44, 17, -23, -45, -30, 9, 41, 40, 7, -32, -45, -21, 19, 44, 34, -4, -39, -42, -11, 29, 45, 25, -15, -43, -36, \ + 32, 23, -21, -45, -25, 19, 45, 27, -17, -45, -29, 15, 44, 30, -13, -44, -32, 11, 43, 34, -9, -43, -35, 7, 42, 36, -4, -41, -38, 2, 40, 39, \ + 
32, 19, -29, -44, -9, 36, 40, -2, -42, -34, 13, 45, 25, -23, -45, -15, 32, 43, 4, -39, -38, 7, 43, 30, -17, -45, -21, 27, 44, 11, -35, -41, \ + 32, 15, -35, -39, 9, 45, 21, -30, -42, 2, 43, 27, -25, -44, -4, 41, 32, -19, -45, -11, 38, 36, -13, -45, -17, 34, 40, -7, -44, -23, 29, 43, \ + 32, 11, -40, -30, 25, 43, -4, -45, -17, 36, 35, -19, -44, -2, 43, 23, -32, -39, 13, 45, 9, -41, -29, 27, 42, -7, -45, -15, 38, 34, -21, -44, \ + 32, 7, -43, -19, 38, 30, -29, -39, 17, 44, -4, -45, -9, 43, 21, -36, -32, 27, 40, -15, -44, 2, 45, 11, -42, -23, 35, 34, -25, -41, 13, 45, \ + 32, 2, -45, -7, 44, 11, -43, -15, 42, 19, -40, -23, 38, 27, -35, -30, 32, 34, -29, -36, 25, 39, -21, -41, 17, 43, -13, -44, 9, 45, -4, -45, \ + 32, -2, -45, 7, 44, -11, -43, 15, 42, -19, -40, 23, 38, -27, -35, 30, 32, -34, -29, 36, 25, -39, -21, 41, 17, -43, -13, 44, 9, -45, -4, 45, \ + 32, -7, -43, 19, 38, -30, -29, 39, 17, -44, -4, 45, -9, -43, 21, 36, -32, -27, 40, 15, -44, -2, 45, -11, -42, 23, 35, -34, -25, 41, 13, -45, \ + 32, -11, -40, 30, 25, -43, -4, 45, -17, -36, 35, 19, -44, 2, 43, -23, -32, 39, 13, -45, 9, 41, -29, -27, 42, 7, -45, 15, 38, -34, -21, 44, \ + 32, -15, -35, 39, 9, -45, 21, 30, -42, -2, 43, -27, -25, 44, -4, -41, 32, 19, -45, 11, 38, -36, -13, 45, -17, -34, 40, 7, -44, 23, 29, -43, \ + 32, -19, -29, 44, -9, -36, 40, 2, -42, 34, 13, -45, 25, 23, -45, 15, 32, -43, 4, 39, -38, -7, 43, -30, -17, 45, -21, -27, 44, -11, -35, 41, \ + 32, -23, -21, 45, -25, -19, 45, -27, -17, 45, -29, -15, 44, -30, -13, 44, -32, -11, 43, -34, -9, 43, -35, -7, 42, -36, -4, 41, -38, -2, 40, -39, \ + 32, -27, -13, 43, -38, 2, 35, -44, 17, 23, -45, 30, 9, -41, 40, -7, -32, 45, -21, -19, 44, -34, -4, 39, -42, 11, 29, -45, 25, 15, -43, 36, \ + 32, -30, -4, 36, -44, 23, 13, -41, 42, -15, -21, 44, -38, 7, 29, -45, 32, 2, -35, 45, -25, -11, 40, -43, 17, 19, -43, 39, -9, -27, 45, -34, \ + 32, -34, 4, 27, -44, 39, -13, -19, 42, -43, 21, 11, -38, 45, -29, -2, 32, -45, 35, -7, -25, 44, -40, 15, 17, -41, 43, -23, 
-9, 36, -45, 30, \ + 32, -36, 13, 15, -38, 45, -35, 11, 17, -39, 45, -34, 9, 19, -40, 45, -32, 7, 21, -41, 44, -30, 4, 23, -42, 44, -29, 2, 25, -43, 43, -27, \ + 32, -39, 21, 2, -25, 41, -45, 36, -17, -7, 29, -43, 44, -34, 13, 11, -32, 44, -43, 30, -9, -15, 35, -45, 42, -27, 4, 19, -38, 45, -40, 23, \ + 32, -41, 29, -11, -9, 27, -40, 45, -42, 30, -13, -7, 25, -39, 45, -43, 32, -15, -4, 23, -38, 45, -43, 34, -17, -2, 21, -36, 44, -44, 35, -19, \ + 32, -43, 35, -23, 9, 7, -21, 34, -42, 45, -43, 36, -25, 11, 4, -19, 32, -41, 45, -44, 38, -27, 13, 2, -17, 30, -40, 45, -44, 39, -29, 15, \ + 32, -44, 40, -34, 25, -15, 4, 7, -17, 27, -35, 41, -44, 45, -43, 39, -32, 23, -13, 2, 9, -19, 29, -36, 42, -45, 45, -43, 38, -30, 21, -11, \ + 32, -45, 43, -41, 38, -34, 29, -23, 17, -11, 4, 2, -9, 15, -21, 27, -32, 36, -40, 43, -44, 45, -45, 44, -42, 39, -35, 30, -25, 19, -13, 7, \ + 32, -45, 45, -45, 44, -44, 43, -43, 42, -41, 40, -39, 38, -36, 35, -34, 32, -30, 29, -27, 25, -23, 21, -19, 17, -15, 13, -11, 9, -7, 4, -2 +//************************************************************************************************* +//void tx_dct2_pb32_arm64(s16 *src, s16 *dst, int line, int limit_line, int shift); +//x0: coeff blk, 16 bit +//x1: resi blk, 16 bit +//x2: blk width +//x3: limit_line +//x4: shift +//************************************************************************************************* +function tx_dct2_pb32_arm64 + lsl x2, x2, #1 + adr x7, tx_dct2_pb32_coef + mov x12, #64 //i_src + +tx_dct2_pb32_loopk: + mov x8, #0 + mov x11, x1 +tx_dct2_pb32_loopj: + mov x5, #0 + movi v24.16b, #0 + movi v25.16b, #0 + movi v26.16b, #0 + movi v27.16b, #0 + movi v28.16b, #0 + movi v29.16b, #0 + movi v30.16b, #0 + movi v31.16b, #0 +tx_dct2_pb32_loopi: + //load src + add x6, x0, x5 + ld1 {v0.8h}, [x6], x12 + ld1 {v1.8h}, [x6], x12 + ld1 {v2.8h}, [x6], x12 + ld1 {v3.8h}, [x6], x12 + + add x9, x7, x8 + ld1 {v16.8h}, [x9], x12 + ld1 {v17.8h}, [x9], x12 + ld1 {v18.8h}, [x9], x12 + ld1 {v19.8h}, 
[x9], x12 + ld1 {v20.8h}, [x9], x12 + ld1 {v21.8h}, [x9], x12 + ld1 {v22.8h}, [x9], x12 + ld1 {v23.8h}, [x9], x12 + + smlal v24.4s, v16.4h, v0.h[0] + smlal2 v25.4s, v16.8h, v0.h[0] + smlal v24.4s, v17.4h, v0.h[1] + smlal2 v25.4s, v17.8h, v0.h[1] + smlal v24.4s, v18.4h, v0.h[2] + smlal2 v25.4s, v18.8h, v0.h[2] + smlal v24.4s, v19.4h, v0.h[3] + smlal2 v25.4s, v19.8h, v0.h[3] + smlal v24.4s, v20.4h, v0.h[4] + smlal2 v25.4s, v20.8h, v0.h[4] + smlal v24.4s, v21.4h, v0.h[5] + smlal2 v25.4s, v21.8h, v0.h[5] + smlal v24.4s, v22.4h, v0.h[6] + smlal2 v25.4s, v22.8h, v0.h[6] + smlal v24.4s, v23.4h, v0.h[7] + smlal2 v25.4s, v23.8h, v0.h[7] + + smlal v26.4s, v16.4h, v1.h[0] + smlal2 v27.4s, v16.8h, v1.h[0] + smlal v26.4s, v17.4h, v1.h[1] + smlal2 v27.4s, v17.8h, v1.h[1] + smlal v26.4s, v18.4h, v1.h[2] + smlal2 v27.4s, v18.8h, v1.h[2] + smlal v26.4s, v19.4h, v1.h[3] + smlal2 v27.4s, v19.8h, v1.h[3] + smlal v26.4s, v20.4h, v1.h[4] + smlal2 v27.4s, v20.8h, v1.h[4] + smlal v26.4s, v21.4h, v1.h[5] + smlal2 v27.4s, v21.8h, v1.h[5] + smlal v26.4s, v22.4h, v1.h[6] + smlal2 v27.4s, v22.8h, v1.h[6] + smlal v26.4s, v23.4h, v1.h[7] + smlal2 v27.4s, v23.8h, v1.h[7] + + smlal v28.4s, v16.4h, v2.h[0] + smlal2 v29.4s, v16.8h, v2.h[0] + smlal v28.4s, v17.4h, v2.h[1] + smlal2 v29.4s, v17.8h, v2.h[1] + smlal v28.4s, v18.4h, v2.h[2] + smlal2 v29.4s, v18.8h, v2.h[2] + smlal v28.4s, v19.4h, v2.h[3] + smlal2 v29.4s, v19.8h, v2.h[3] + smlal v28.4s, v20.4h, v2.h[4] + smlal2 v29.4s, v20.8h, v2.h[4] + smlal v28.4s, v21.4h, v2.h[5] + smlal2 v29.4s, v21.8h, v2.h[5] + smlal v28.4s, v22.4h, v2.h[6] + smlal2 v29.4s, v22.8h, v2.h[6] + smlal v28.4s, v23.4h, v2.h[7] + smlal2 v29.4s, v23.8h, v2.h[7] + + smlal v30.4s, v16.4h, v3.h[0] + smlal2 v31.4s, v16.8h, v3.h[0] + smlal v30.4s, v17.4h, v3.h[1] + smlal2 v31.4s, v17.8h, v3.h[1] + smlal v30.4s, v18.4h, v3.h[2] + smlal2 v31.4s, v18.8h, v3.h[2] + smlal v30.4s, v19.4h, v3.h[3] + smlal2 v31.4s, v19.8h, v3.h[3] + smlal v30.4s, v20.4h, v3.h[4] + smlal2 v31.4s, v20.8h, 
v3.h[4] + smlal v30.4s, v21.4h, v3.h[5] + smlal2 v31.4s, v21.8h, v3.h[5] + smlal v30.4s, v22.4h, v3.h[6] + smlal2 v31.4s, v22.8h, v3.h[6] + smlal v30.4s, v23.4h, v3.h[7] + smlal2 v31.4s, v23.8h, v3.h[7] + + add x5, x5, #16 + add x8, x8, #512 + cmp x5, #64 + bne tx_dct2_pb32_loopi + + cmp w4, #5 + bne tx_dct2_pb32_shift10 + sqrshrn v0.4h, v24.4s, #5 + sqrshrn v1.4h, v26.4s, #5 + sqrshrn v2.4h, v28.4s, #5 + sqrshrn v3.4h, v30.4s, #5 + sqrshrn v4.4h, v25.4s, #5 + sqrshrn v5.4h, v27.4s, #5 + sqrshrn v6.4h, v29.4s, #5 + sqrshrn v7.4h, v31.4s, #5 + b tx_dct2_pb32_end + +tx_dct2_pb32_shift10: + sqrshrn v0.4h, v24.4s, #10 + sqrshrn v1.4h, v26.4s, #10 + sqrshrn v2.4h, v28.4s, #10 + sqrshrn v3.4h, v30.4s, #10 + sqrshrn v4.4h, v25.4s, #10 + sqrshrn v5.4h, v27.4s, #10 + sqrshrn v6.4h, v29.4s, #10 + sqrshrn v7.4h, v31.4s, #10 + +tx_dct2_pb32_end: + st4 {v0.h - v3.h}[0], [x11], x2 + st4 {v0.h - v3.h}[1], [x11], x2 + st4 {v0.h - v3.h}[2], [x11], x2 + st4 {v0.h - v3.h}[3], [x11], x2 + st4 {v4.h - v7.h}[0], [x11], x2 + st4 {v4.h - v7.h}[1], [x11], x2 + st4 {v4.h - v7.h}[2], [x11], x2 + st4 {v4.h - v7.h}[3], [x11], x2 + + sub x8, x8, #2032 + cmp x8, #64 + bne tx_dct2_pb32_loopj + + add x0, x0, #256 + add x1, x1, #8 + subs x3, x3, #4 + bgt tx_dct2_pb32_loopk + + ret + +tx_dct2_pb64_coef: +.hword 32, 45, 45, 45, 45, 45, 45, 45, 44, 44, 44, 44, 43, 43, 43, 42, 42, 41, 41, 40, 40, 39, 39, 38, 38, 37, 36, 36, 35, 34, 34, 33, \ + 32, 31, 30, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 4, 3, 2, 1, \ + 32, 45, 45, 44, 43, 42, 41, 39, 38, 36, 34, 31, 29, 26, 23, 20, 17, 14, 11, 8, 4, 1, -2, -6, -9, -12, -15, -18, -21, -24, -27, -30, \ + -32, -34, -36, -38, -40, -41, -43, -44, -44, -45, -45, -45, -45, -45, -44, -43, -42, -40, -39, -37, -35, -33, -30, -28, -25, -22, -19, -16, -13, -10, -7, -3, \ + 32, 45, 44, 42, 40, 37, 34, 30, 25, 20, 15, 10, 4, -1, -7, -12, -17, -22, -27, -31, -35, -38, -41, -43, -44, -45, -45, -45, -43, -41, -39, -36, \ + 
-32, -28, -23, -18, -13, -8, -2, 3, 9, 14, 19, 24, 29, 33, 36, 39, 42, 44, 45, 45, 45, 44, 43, 40, 38, 34, 30, 26, 21, 16, 11, 6, \ + 32, 45, 43, 39, 35, 30, 23, 16, 9, 1, -7, -14, -21, -28, -34, -38, -42, -44, -45, -45, -43, -40, -36, -31, -25, -18, -11, -3, 4, 12, 19, 26, \ + 32, 37, 41, 44, 45, 45, 44, 41, 38, 33, 27, 20, 13, 6, -2, -10, -17, -24, -30, -36, -40, -43, -45, -45, -44, -42, -39, -34, -29, -22, -15, -8, \ + 32, 44, 41, 36, 29, 20, 11, 1, -9, -18, -27, -34, -40, -44, -45, -45, -42, -37, -30, -22, -13, -3, 7, 16, 25, 33, 39, 43, 45, 45, 43, 38, \ + 32, 24, 15, 6, -4, -14, -23, -31, -38, -42, -45, -45, -43, -39, -34, -26, -17, -8, 2, 12, 21, 30, 36, 41, 44, 45, 44, 40, 35, 28, 19, 10, \ + 32, 44, 39, 31, 21, 10, -2, -14, -25, -34, -41, -45, -45, -42, -36, -28, -17, -6, 7, 18, 29, 37, 43, 45, 44, 40, 34, 24, 13, 1, -11, -22, \ + -32, -39, -44, -45, -43, -38, -30, -20, -9, 3, 15, 26, 35, 41, 45, 45, 42, 36, 27, 16, 4, -8, -19, -30, -38, -43, -45, -44, -40, -33, -23, -12, \ + 32, 43, 36, 26, 13, -1, -15, -28, -38, -44, -45, -42, -35, -24, -11, 3, 17, 30, 39, 44, 45, 41, 34, 22, 9, -6, -19, -31, -40, -45, -45, -40, \ + -32, -20, -7, 8, 21, 33, 41, 45, 44, 39, 30, 18, 4, -10, -23, -34, -42, -45, -44, -38, -29, -16, -2, 12, 25, 36, 43, 45, 43, 37, 27, 14, \ + 32, 42, 34, 20, 4, -12, -27, -38, -44, -45, -39, -28, -13, 3, 19, 33, 42, 45, 43, 34, 21, 6, -11, -26, -38, -44, -45, -39, -29, -14, 2, 18, \ + 32, 41, 45, 43, 35, 22, 7, -10, -25, -37, -44, -45, -40, -30, -15, 1, 17, 31, 41, 45, 43, 36, 23, 8, -9, -24, -36, -44, -45, -40, -30, -16, \ + 32, 41, 30, 14, -4, -22, -36, -44, -44, -37, -23, -6, 13, 30, 41, 45, 42, 31, 15, -3, -21, -36, -44, -45, -38, -24, -7, 12, 29, 40, 45, 42, \ + 32, 16, -2, -20, -35, -44, -45, -38, -25, -8, 11, 28, 40, 45, 43, 33, 17, -1, -19, -34, -43, -45, -39, -26, -9, 10, 27, 39, 45, 43, 34, 18, \ + 32, 40, 27, 8, -13, -31, -43, -45, -38, -22, -2, 18, 35, 44, 44, 34, 17, -3, -23, -38, -45, -42, -30, -12, 9, 28, 41, 45, 40, 26, 7, -14, 
\ + -32, -43, -45, -37, -21, -1, 19, 36, 44, 44, 34, 16, -4, -24, -39, -45, -42, -30, -11, 10, 29, 41, 45, 39, 25, 6, -15, -33, -43, -45, -36, -20, \ + 32, 39, 23, 1, -21, -38, -45, -40, -25, -3, 19, 37, 45, 41, 27, 6, -17, -36, -45, -42, -29, -8, 15, 34, 44, 43, 30, 10, -13, -33, -44, -44, \ + -32, -12, 11, 31, 43, 44, 34, 14, -9, -30, -43, -45, -35, -16, 7, 28, 42, 45, 36, 18, -4, -26, -41, -45, -38, -20, 2, 24, 40, 45, 39, 22, \ + 32, 38, 19, -6, -29, -43, -44, -31, -9, 16, 36, 45, 40, 22, -2, -26, -42, -45, -34, -12, 13, 34, 45, 41, 25, 1, -23, -40, -45, -36, -15, 10, \ + 32, 44, 43, 28, 4, -20, -39, -45, -38, -18, 7, 30, 43, 44, 30, 8, -17, -37, -45, -39, -21, 3, 27, 42, 44, 33, 11, -14, -35, -45, -41, -24, \ + 32, 37, 15, -12, -35, -45, -39, -18, 9, 33, 45, 40, 21, -6, -30, -44, -42, -24, 2, 28, 43, 43, 27, 1, -25, -42, -44, -30, -4, 22, 41, 45, \ + 32, 8, -19, -39, -45, -34, -11, 16, 38, 45, 36, 14, -13, -36, -45, -38, -17, 10, 34, 45, 40, 20, -7, -31, -44, -41, -23, 3, 29, 44, 43, 26, \ + 32, 36, 11, -18, -40, -45, -30, -3, 25, 43, 43, 24, -4, -31, -45, -39, -17, 12, 36, 45, 35, 10, -19, -40, -44, -30, -2, 26, 43, 42, 23, -6, \ + -32, -45, -39, -16, 13, 37, 45, 34, 9, -20, -41, -44, -29, -1, 27, 44, 42, 22, -7, -33, -45, -38, -15, 14, 38, 45, 34, 8, -21, -41, -44, -28, \ + 32, 34, 7, -24, -43, -41, -19, 12, 38, 45, 30, 1, -29, -45, -39, -14, 17, 40, 44, 26, -4, -33, -45, -36, -9, 22, 43, 42, 21, -10, -36, -45, \ + -32, -3, 27, 44, 40, 16, -15, -39, -44, -28, 2, 31, 45, 37, 11, -20, -42, -43, -23, 8, 35, 45, 34, 6, -25, -44, -41, -18, 13, 38, 45, 30, \ + 32, 33, 2, -30, -45, -36, -7, 26, 44, 38, 11, -22, -43, -40, -15, 18, 42, 42, 19, -14, -40, -44, -23, 10, 38, 45, 27, -6, -35, -45, -30, 1, \ + 32, 45, 34, 3, -29, -45, -36, -8, 25, 44, 39, 12, -21, -43, -41, -16, 17, 41, 43, 20, -13, -39, -44, -24, 9, 37, 45, 28, -4, -34, -45, -31, \ + 32, 31, -2, -34, -45, -28, 7, 37, 44, 24, -11, -39, -43, -20, 15, 41, 42, 16, -19, -43, -40, -12, 23, 44, 38, 8, -27, -45, 
-35, -3, 30, 45, \ + 32, -1, -34, -45, -29, 6, 36, 45, 25, -10, -39, -44, -21, 14, 41, 42, 17, -18, -43, -40, -13, 22, 44, 38, 9, -26, -45, -36, -4, 30, 45, 33, \ + 32, 30, -7, -38, -43, -18, 19, 44, 38, 6, -30, -45, -29, 8, 39, 43, 17, -20, -44, -37, -4, 31, 45, 28, -9, -39, -43, -16, 21, 44, 36, 3, \ + -32, -45, -27, 10, 40, 42, 15, -22, -44, -36, -2, 33, 45, 26, -11, -40, -42, -14, 23, 45, 35, 1, -34, -45, -25, 12, 41, 41, 13, -24, -45, -34, \ + 32, 28, -11, -41, -40, -8, 30, 45, 25, -14, -43, -38, -4, 33, 45, 22, -17, -44, -36, -1, 35, 44, 19, -20, -44, -34, 2, 37, 43, 16, -23, -45, \ + -32, 6, 39, 42, 13, -26, -45, -30, 9, 40, 41, 10, -29, -45, -27, 12, 42, 39, 7, -31, -45, -24, 15, 43, 38, 3, -34, -45, -21, 18, 44, 36, \ + 32, 26, -15, -44, -35, 3, 39, 41, 9, -31, -45, -20, 21, 45, 30, -10, -42, -38, -2, 36, 43, 14, -27, -45, -25, 16, 44, 34, -4, -39, -41, -8, \ + 32, 45, 19, -22, -45, -30, 11, 42, 38, 1, -36, -43, -13, 28, 45, 24, -17, -44, -34, 6, 40, 40, 7, -33, -44, -18, 23, 45, 29, -12, -43, -37, \ + 32, 24, -19, -45, -29, 14, 44, 33, -9, -42, -36, 3, 40, 39, 2, -37, -42, -8, 34, 44, 13, -30, -45, -18, 25, 45, 23, -20, -45, -28, 15, 44, \ + 32, -10, -43, -36, 4, 40, 39, 1, -38, -41, -7, 34, 43, 12, -30, -45, -17, 26, 45, 22, -21, -45, -27, 16, 44, 31, -11, -43, -35, 6, 41, 38, \ + 32, 22, -23, -45, -21, 24, 45, 20, -25, -45, -19, 26, 45, 18, -27, -45, -17, 28, 45, 16, -29, -45, -15, 30, 44, 14, -30, -44, -13, 31, 44, 12, \ + -32, -44, -11, 33, 43, 10, -34, -43, -9, 34, 43, 8, -35, -42, -7, 36, 42, 6, -36, -41, -4, 37, 41, 3, -38, -40, -2, 38, 40, 1, -39, -39, \ + 32, 20, -27, -45, -13, 33, 43, 6, -38, -39, 2, 41, 35, -10, -44, -30, 17, 45, 23, -24, -45, -16, 30, 44, 9, -36, -41, -1, 40, 37, -7, -43, \ + -32, 14, 45, 26, -21, -45, -19, 28, 44, 12, -34, -42, -4, 38, 39, -3, -42, -34, 11, 44, 29, -18, -45, -22, 25, 45, 15, -31, -43, -8, 36, 40, \ + 32, 18, -30, -43, -4, 39, 36, -10, -44, -26, 23, 45, 13, -34, -41, 1, 42, 33, -15, -45, -21, 28, 44, 8, -38, 
-38, 7, 44, 29, -20, -45, -16, \ + 32, 42, 2, -40, -35, 12, 45, 24, -25, -45, -11, 36, 40, -3, -43, -31, 17, 45, 19, -30, -43, -6, 39, 37, -9, -44, -27, 22, 45, 14, -34, -41, \ + 32, 16, -34, -40, 4, 44, 27, -24, -44, -8, 39, 36, -13, -45, -19, 31, 42, -1, -43, -30, 21, 45, 11, -37, -38, 10, 45, 22, -29, -43, -2, 41, \ + 32, -18, -45, -14, 35, 39, -7, -44, -25, 26, 44, 6, -40, -34, 15, 45, 17, -33, -41, 3, 43, 28, -23, -45, -9, 38, 36, -12, -45, -20, 30, 42, \ + 32, 14, -36, -37, 13, 45, 15, -36, -38, 12, 45, 16, -35, -38, 11, 45, 17, -34, -39, 10, 45, 18, -34, -39, 9, 45, 19, -33, -40, 8, 45, 20, \ + -32, -40, 7, 45, 21, -31, -41, 6, 44, 22, -30, -41, 4, 44, 23, -30, -42, 3, 44, 24, -29, -42, 2, 44, 25, -28, -43, 1, 43, 26, -27, -43, \ + 32, 12, -39, -33, 21, 44, 2, -43, -25, 30, 41, -8, -45, -16, 36, 36, -17, -45, -7, 41, 29, -26, -43, 3, 44, 20, -34, -38, 13, 45, 11, -39, \ + -32, 22, 44, 1, -43, -24, 30, 40, -9, -45, -15, 37, 35, -18, -45, -6, 42, 28, -27, -42, 4, 45, 19, -34, -38, 14, 45, 10, -40, -31, 23, 44, \ + 32, 10, -41, -28, 29, 40, -11, -45, -9, 41, 27, -30, -40, 12, 45, 8, -42, -26, 30, 39, -13, -45, -7, 42, 25, -31, -39, 14, 45, 6, -43, -24, \ + 32, 38, -15, -45, -4, 43, 23, -33, -38, 16, 45, 3, -43, -22, 34, 37, -17, -45, -2, 44, 21, -34, -36, 18, 44, 1, -44, -20, 35, 36, -19, -44, \ + 32, 8, -43, -22, 35, 34, -23, -42, 9, 45, 7, -43, -21, 36, 34, -24, -42, 10, 45, 6, -43, -20, 36, 33, -25, -41, 11, 45, 4, -44, -19, 37, \ + 32, -26, -41, 12, 45, 3, -44, -18, 38, 31, -27, -40, 13, 45, 2, -44, -17, 38, 30, -28, -40, 14, 45, 1, -44, -16, 39, 30, -29, -39, 15, 45, \ + 32, 6, -44, -16, 40, 26, -34, -34, 25, 40, -15, -44, 4, 45, 7, -44, -17, 39, 27, -33, -35, 24, 41, -14, -44, 3, 45, 8, -43, -18, 39, 28, \ + -32, -36, 23, 41, -13, -45, 2, 45, 9, -43, -19, 38, 29, -31, -36, 22, 42, -12, -45, 1, 45, 10, -43, -20, 38, 30, -30, -37, 21, 42, -11, -45, \ + 32, 3, -45, -10, 43, 16, -41, -22, 38, 28, -34, -33, 29, 37, -23, -40, 17, 43, -11, -45, 4, 45, 2, -45, 
-9, 44, 15, -41, -21, 38, 27, -34, \ + -32, 30, 36, -24, -40, 18, 43, -12, -44, 6, 45, 1, -45, -8, 44, 14, -42, -20, 39, 26, -35, -31, 30, 36, -25, -39, 19, 42, -13, -44, 7, 45, \ + 32, 1, -45, -3, 45, 6, -45, -8, 44, 10, -44, -12, 43, 14, -43, -16, 42, 18, -41, -20, 40, 22, -39, -24, 38, 26, -36, -28, 35, 30, -34, -31, \ + 32, 33, -30, -34, 29, 36, -27, -37, 25, 38, -23, -39, 21, 40, -19, -41, 17, 42, -15, -43, 13, 44, -11, -44, 9, 45, -7, -45, 4, 45, -2, -45, \ + 32, -1, -45, 3, 45, -6, -45, 8, 44, -10, -44, 12, 43, -14, -43, 16, 42, -18, -41, 20, 40, -22, -39, 24, 38, -26, -36, 28, 35, -30, -34, 31, \ + 32, -33, -30, 34, 29, -36, -27, 37, 25, -38, -23, 39, 21, -40, -19, 41, 17, -42, -15, 43, 13, -44, -11, 44, 9, -45, -7, 45, 4, -45, -2, 45, \ + 32, -3, -45, 10, 43, -16, -41, 22, 38, -28, -34, 33, 29, -37, -23, 40, 17, -43, -11, 45, 4, -45, 2, 45, -9, -44, 15, 41, -21, -38, 27, 34, \ + -32, -30, 36, 24, -40, -18, 43, 12, -44, -6, 45, -1, -45, 8, 44, -14, -42, 20, 39, -26, -35, 31, 30, -36, -25, 39, 19, -42, -13, 44, 7, -45, \ + 32, -6, -44, 16, 40, -26, -34, 34, 25, -40, -15, 44, 4, -45, 7, 44, -17, -39, 27, 33, -35, -24, 41, 14, -44, -3, 45, -8, -43, 18, 39, -28, \ + -32, 36, 23, -41, -13, 45, 2, -45, 9, 43, -19, -38, 29, 31, -36, -22, 42, 12, -45, -1, 45, -10, -43, 20, 38, -30, -30, 37, 21, -42, -11, 45, \ + 32, -8, -43, 22, 35, -34, -23, 42, 9, -45, 7, 43, -21, -36, 34, 24, -42, -10, 45, -6, -43, 20, 36, -33, -25, 41, 11, -45, 4, 44, -19, -37, \ + 32, 26, -41, -12, 45, -3, -44, 18, 38, -31, -27, 40, 13, -45, 2, 44, -17, -38, 30, 28, -40, -14, 45, -1, -44, 16, 39, -30, -29, 39, 15, -45, \ + 32, -10, -41, 28, 29, -40, -11, 45, -9, -41, 27, 30, -40, -12, 45, -8, -42, 26, 30, -39, -13, 45, -7, -42, 25, 31, -39, -14, 45, -6, -43, 24, \ + 32, -38, -15, 45, -4, -43, 23, 33, -38, -16, 45, -3, -43, 22, 34, -37, -17, 45, -2, -44, 21, 34, -36, -18, 44, -1, -44, 20, 35, -36, -19, 44, \ + 32, -12, -39, 33, 21, -44, 2, 43, -25, -30, 41, 8, -45, 16, 36, -36, -17, 45, -7, 
-41, 29, 26, -43, -3, 44, -20, -34, 38, 13, -45, 11, 39, \ + -32, -22, 44, -1, -43, 24, 30, -40, -9, 45, -15, -37, 35, 18, -45, 6, 42, -28, -27, 42, 4, -45, 19, 34, -38, -14, 45, -10, -40, 31, 23, -44, \ + 32, -14, -36, 37, 13, -45, 15, 36, -38, -12, 45, -16, -35, 38, 11, -45, 17, 34, -39, -10, 45, -18, -34, 39, 9, -45, 19, 33, -40, -8, 45, -20, \ + -32, 40, 7, -45, 21, 31, -41, -6, 44, -22, -30, 41, 4, -44, 23, 30, -42, -3, 44, -24, -29, 42, 2, -44, 25, 28, -43, -1, 43, -26, -27, 43, \ + 32, -16, -34, 40, 4, -44, 27, 24, -44, 8, 39, -36, -13, 45, -19, -31, 42, 1, -43, 30, 21, -45, 11, 37, -38, -10, 45, -22, -29, 43, -2, -41, \ + 32, 18, -45, 14, 35, -39, -7, 44, -25, -26, 44, -6, -40, 34, 15, -45, 17, 33, -41, -3, 43, -28, -23, 45, -9, -38, 36, 12, -45, 20, 30, -42, \ + 32, -18, -30, 43, -4, -39, 36, 10, -44, 26, 23, -45, 13, 34, -41, -1, 42, -33, -15, 45, -21, -28, 44, -8, -38, 38, 7, -44, 29, 20, -45, 16, \ + 32, -42, 2, 40, -35, -12, 45, -24, -25, 45, -11, -36, 40, 3, -43, 31, 17, -45, 19, 30, -43, 6, 39, -37, -9, 44, -27, -22, 45, -14, -34, 41, \ + 32, -20, -27, 45, -13, -33, 43, -6, -38, 39, 2, -41, 35, 10, -44, 30, 17, -45, 23, 24, -45, 16, 30, -44, 9, 36, -41, 1, 40, -37, -7, 43, \ + -32, -14, 45, -26, -21, 45, -19, -28, 44, -12, -34, 42, -4, -38, 39, 3, -42, 34, 11, -44, 29, 18, -45, 22, 25, -45, 15, 31, -43, 8, 36, -40, \ + 32, -22, -23, 45, -21, -24, 45, -20, -25, 45, -19, -26, 45, -18, -27, 45, -17, -28, 45, -16, -29, 45, -15, -30, 44, -14, -30, 44, -13, -31, 44, -12, \ + -32, 44, -11, -33, 43, -10, -34, 43, -9, -34, 43, -8, -35, 42, -7, -36, 42, -6, -36, 41, -4, -37, 41, -3, -38, 40, -2, -38, 40, -1, -39, 39, \ + 32, -24, -19, 45, -29, -14, 44, -33, -9, 42, -36, -3, 40, -39, 2, 37, -42, 8, 34, -44, 13, 30, -45, 18, 25, -45, 23, 20, -45, 28, 15, -44, \ + 32, 10, -43, 36, 4, -40, 39, -1, -38, 41, -7, -34, 43, -12, -30, 45, -17, -26, 45, -22, -21, 45, -27, -16, 44, -31, -11, 43, -35, -6, 41, -38, \ + 32, -26, -15, 44, -35, -3, 39, -41, 9, 31, -45, 20, 21, 
-45, 30, 10, -42, 38, -2, -36, 43, -14, -27, 45, -25, -16, 44, -34, -4, 39, -41, 8, \ + 32, -45, 19, 22, -45, 30, 11, -42, 38, -1, -36, 43, -13, -28, 45, -24, -17, 44, -34, -6, 40, -40, 7, 33, -44, 18, 23, -45, 29, 12, -43, 37, \ + 32, -28, -11, 41, -40, 8, 30, -45, 25, 14, -43, 38, -4, -33, 45, -22, -17, 44, -36, 1, 35, -44, 19, 20, -44, 34, 2, -37, 43, -16, -23, 45, \ + -32, -6, 39, -42, 13, 26, -45, 30, 9, -40, 41, -10, -29, 45, -27, -12, 42, -39, 7, 31, -45, 24, 15, -43, 38, -3, -34, 45, -21, -18, 44, -36, \ + 32, -30, -7, 38, -43, 18, 19, -44, 38, -6, -30, 45, -29, -8, 39, -43, 17, 20, -44, 37, -4, -31, 45, -28, -9, 39, -43, 16, 21, -44, 36, -3, \ + -32, 45, -27, -10, 40, -42, 15, 22, -44, 36, -2, -33, 45, -26, -11, 40, -42, 14, 23, -45, 35, -1, -34, 45, -25, -12, 41, -41, 13, 24, -45, 34, \ + 32, -31, -2, 34, -45, 28, 7, -37, 44, -24, -11, 39, -43, 20, 15, -41, 42, -16, -19, 43, -40, 12, 23, -44, 38, -8, -27, 45, -35, 3, 30, -45, \ + 32, 1, -34, 45, -29, -6, 36, -45, 25, 10, -39, 44, -21, -14, 41, -42, 17, 18, -43, 40, -13, -22, 44, -38, 9, 26, -45, 36, -4, -30, 45, -33, \ + 32, -33, 2, 30, -45, 36, -7, -26, 44, -38, 11, 22, -43, 40, -15, -18, 42, -42, 19, 14, -40, 44, -23, -10, 38, -45, 27, 6, -35, 45, -30, -1, \ + 32, -45, 34, -3, -29, 45, -36, 8, 25, -44, 39, -12, -21, 43, -41, 16, 17, -41, 43, -20, -13, 39, -44, 24, 9, -37, 45, -28, -4, 34, -45, 31, \ + 32, -34, 7, 24, -43, 41, -19, -12, 38, -45, 30, -1, -29, 45, -39, 14, 17, -40, 44, -26, -4, 33, -45, 36, -9, -22, 43, -42, 21, 10, -36, 45, \ + -32, 3, 27, -44, 40, -16, -15, 39, -44, 28, 2, -31, 45, -37, 11, 20, -42, 43, -23, -8, 35, -45, 34, -6, -25, 44, -41, 18, 13, -38, 45, -30, \ + 32, -36, 11, 18, -40, 45, -30, 3, 25, -43, 43, -24, -4, 31, -45, 39, -17, -12, 36, -45, 35, -10, -19, 40, -44, 30, -2, -26, 43, -42, 23, 6, \ + -32, 45, -39, 16, 13, -37, 45, -34, 9, 20, -41, 44, -29, 1, 27, -44, 42, -22, -7, 33, -45, 38, -15, -14, 38, -45, 34, -8, -21, 41, -44, 28, \ + 32, -37, 15, 12, -35, 45, -39, 18, 9, 
-33, 45, -40, 21, 6, -30, 44, -42, 24, 2, -28, 43, -43, 27, -1, -25, 42, -44, 30, -4, -22, 41, -45, \ + 32, -8, -19, 39, -45, 34, -11, -16, 38, -45, 36, -14, -13, 36, -45, 38, -17, -10, 34, -45, 40, -20, -7, 31, -44, 41, -23, -3, 29, -44, 43, -26, \ + 32, -38, 19, 6, -29, 43, -44, 31, -9, -16, 36, -45, 40, -22, -2, 26, -42, 45, -34, 12, 13, -34, 45, -41, 25, -1, -23, 40, -45, 36, -15, -10, \ + 32, -44, 43, -28, 4, 20, -39, 45, -38, 18, 7, -30, 43, -44, 30, -8, -17, 37, -45, 39, -21, -3, 27, -42, 44, -33, 11, 14, -35, 45, -41, 24, \ + 32, -39, 23, -1, -21, 38, -45, 40, -25, 3, 19, -37, 45, -41, 27, -6, -17, 36, -45, 42, -29, 8, 15, -34, 44, -43, 30, -10, -13, 33, -44, 44, \ + -32, 12, 11, -31, 43, -44, 34, -14, -9, 30, -43, 45, -35, 16, 7, -28, 42, -45, 36, -18, -4, 26, -41, 45, -38, 20, 2, -24, 40, -45, 39, -22, \ + 32, -40, 27, -8, -13, 31, -43, 45, -38, 22, -2, -18, 35, -44, 44, -34, 17, 3, -23, 38, -45, 42, -30, 12, 9, -28, 41, -45, 40, -26, 7, 14, \ + -32, 43, -45, 37, -21, 1, 19, -36, 44, -44, 34, -16, -4, 24, -39, 45, -42, 30, -11, -10, 29, -41, 45, -39, 25, -6, -15, 33, -43, 45, -36, 20, \ + 32, -41, 30, -14, -4, 22, -36, 44, -44, 37, -23, 6, 13, -30, 41, -45, 42, -31, 15, 3, -21, 36, -44, 45, -38, 24, -7, -12, 29, -40, 45, -42, \ + 32, -16, -2, 20, -35, 44, -45, 38, -25, 8, 11, -28, 40, -45, 43, -33, 17, 1, -19, 34, -43, 45, -39, 26, -9, -10, 27, -39, 45, -43, 34, -18, \ + 32, -42, 34, -20, 4, 12, -27, 38, -44, 45, -39, 28, -13, -3, 19, -33, 42, -45, 43, -34, 21, -6, -11, 26, -38, 44, -45, 39, -29, 14, 2, -18, \ + 32, -41, 45, -43, 35, -22, 7, 10, -25, 37, -44, 45, -40, 30, -15, -1, 17, -31, 41, -45, 43, -36, 23, -8, -9, 24, -36, 44, -45, 40, -30, 16, \ + 32, -43, 36, -26, 13, 1, -15, 28, -38, 44, -45, 42, -35, 24, -11, -3, 17, -30, 39, -44, 45, -41, 34, -22, 9, 6, -19, 31, -40, 45, -45, 40, \ + -32, 20, -7, -8, 21, -33, 41, -45, 44, -39, 30, -18, 4, 10, -23, 34, -42, 45, -44, 38, -29, 16, -2, -12, 25, -36, 43, -45, 43, -37, 27, -14, \ + 32, -44, 39, -31, 
21, -10, -2, 14, -25, 34, -41, 45, -45, 42, -36, 28, -17, 6, 7, -18, 29, -37, 43, -45, 44, -40, 34, -24, 13, -1, -11, 22, \ + -32, 39, -44, 45, -43, 38, -30, 20, -9, -3, 15, -26, 35, -41, 45, -45, 42, -36, 27, -16, 4, 8, -19, 30, -38, 43, -45, 44, -40, 33, -23, 12, \ + 32, -44, 41, -36, 29, -20, 11, -1, -9, 18, -27, 34, -40, 44, -45, 45, -42, 37, -30, 22, -13, 3, 7, -16, 25, -33, 39, -43, 45, -45, 43, -38, \ + 32, -24, 15, -6, -4, 14, -23, 31, -38, 42, -45, 45, -43, 39, -34, 26, -17, 8, 2, -12, 21, -30, 36, -41, 44, -45, 44, -40, 35, -28, 19, -10, \ + 32, -45, 43, -39, 35, -30, 23, -16, 9, -1, -7, 14, -21, 28, -34, 38, -42, 44, -45, 45, -43, 40, -36, 31, -25, 18, -11, 3, 4, -12, 19, -26, \ + 32, -37, 41, -44, 45, -45, 44, -41, 38, -33, 27, -20, 13, -6, -2, 10, -17, 24, -30, 36, -40, 43, -45, 45, -44, 42, -39, 34, -29, 22, -15, 8, \ + 32, -45, 44, -42, 40, -37, 34, -30, 25, -20, 15, -10, 4, 1, -7, 12, -17, 22, -27, 31, -35, 38, -41, 43, -44, 45, -45, 45, -43, 41, -39, 36, \ + -32, 28, -23, 18, -13, 8, -2, -3, 9, -14, 19, -24, 29, -33, 36, -39, 42, -44, 45, -45, 45, -44, 43, -40, 38, -34, 30, -26, 21, -16, 11, -6, \ + 32, -45, 45, -44, 43, -42, 41, -39, 38, -36, 34, -31, 29, -26, 23, -20, 17, -14, 11, -8, 4, -1, -2, 6, -9, 12, -15, 18, -21, 24, -27, 30, \ + -32, 34, -36, 38, -40, 41, -43, 44, -44, 45, -45, 45, -45, 45, -44, 43, -42, 40, -39, 37, -35, 33, -30, 28, -25, 22, -19, 16, -13, 10, -7, 3, \ + 32, -45, 45, -45, 45, -45, 45, -45, 44, -44, 44, -44, 43, -43, 43, -42, 42, -41, 41, -40, 40, -39, 39, -38, 38, -37, 36, -36, 35, -34, 34, -33, \ + 32, -31, 30, -30, 29, -28, 27, -26, 25, -24, 23, -22, 21, -20, 19, -18, 17, -16, 15, -14, 13, -12, 11, -10, 9, -8, 7, -6, 4, -3, 2, -1 + +//************************************************************************************************* +//void tx_dct2_pb64_arm64(s16 *src, s16 *dst, int line, int limit_line, int shift); +//x0: coeff blk, 16 bit +//x1: resi blk, 16 bit +//x2: blk width +//x3: limit_line +//x4: shift 
+//*************************************************************************************************
+function tx_dct2_pb64_arm64 // one 1-D transform pass over 64-sample rows; x0=src (read), x1=dst (written), w2=line, w3=limit_line, w4=shift
+    sbfiz x2, x2, #1, #32 // x2 = sign-extended line * 2 = dst row stride in bytes; bits 63:32 of an int argument are unspecified under AAPCS64, so only w2 is trusted
+    adr x7, tx_dct2_pb64_coef // x7 = start of the coefficient table
+    mov x12, #128 //i_src: fixed 128-byte (64 x s16) row stride, applied to both src rows and table rows
+    // k loop: each pass consumes 4 src rows and repeats until limit_line rows are done
+tx_dct2_pb64_loopk:
+    mov x8, #0 // x8 = byte offset into the table: j*16 (column group) + i*1024 (row group)
+    mov x11, x1 // x11 = dst write cursor for this group of 4 rows
+tx_dct2_pb64_loopj:
+    mov x5, #0 // x5 = byte offset of the current 8-sample slice within a src row
+    movi v24.16b, #0 // v24/v25 = 8 x s32 sums for src row 0
+    movi v25.16b, #0
+    movi v26.16b, #0 // v26/v27 = sums for src row 1
+    movi v27.16b, #0
+    movi v28.16b, #0 // v28/v29 = sums for src row 2
+    movi v29.16b, #0
+    movi v30.16b, #0 // v30/v31 = sums for src row 3
+    movi v31.16b, #0
+tx_dct2_pb64_loopi:
+    //load src: 8 samples from each of 4 consecutive rows
+    add x6, x0, x5
+    ld1 {v0.8h}, [x6], x12
+    ld1 {v1.8h}, [x6], x12
+    ld1 {v2.8h}, [x6], x12
+    ld1 {v3.8h}, [x6], x12
+    // load 8 consecutive table rows, 8 coefficients (one column group) from each
+    add x9, x7, x8
+    ld1 {v16.8h}, [x9], x12
+    ld1 {v17.8h}, [x9], x12
+    ld1 {v18.8h}, [x9], x12
+    ld1 {v19.8h}, [x9], x12
+    ld1 {v20.8h}, [x9], x12
+    ld1 {v21.8h}, [x9], x12
+    ld1 {v22.8h}, [x9], x12
+    ld1 {v23.8h}, [x9], x12
+    // per src row: sum[0..7] += table_row[r][0..7] * sample[r] for r = 0..7 (lanes 0-3 in the even accumulator, 4-7 in the odd one)
+    smlal v24.4s, v16.4h, v0.h[0] // src row 0
+    smlal2 v25.4s, v16.8h, v0.h[0]
+    smlal v24.4s, v17.4h, v0.h[1]
+    smlal2 v25.4s, v17.8h, v0.h[1]
+    smlal v24.4s, v18.4h, v0.h[2]
+    smlal2 v25.4s, v18.8h, v0.h[2]
+    smlal v24.4s, v19.4h, v0.h[3]
+    smlal2 v25.4s, v19.8h, v0.h[3]
+    smlal v24.4s, v20.4h, v0.h[4]
+    smlal2 v25.4s, v20.8h, v0.h[4]
+    smlal v24.4s, v21.4h, v0.h[5]
+    smlal2 v25.4s, v21.8h, v0.h[5]
+    smlal v24.4s, v22.4h, v0.h[6]
+    smlal2 v25.4s, v22.8h, v0.h[6]
+    smlal v24.4s, v23.4h, v0.h[7]
+    smlal2 v25.4s, v23.8h, v0.h[7]
+
+    smlal v26.4s, v16.4h, v1.h[0] // src row 1
+    smlal2 v27.4s, v16.8h, v1.h[0]
+    smlal v26.4s, v17.4h, v1.h[1]
+    smlal2 v27.4s, v17.8h, v1.h[1]
+    smlal v26.4s, v18.4h, v1.h[2]
+    smlal2 v27.4s, v18.8h, v1.h[2]
+    smlal v26.4s, v19.4h, v1.h[3]
+    smlal2 v27.4s, v19.8h, v1.h[3]
+    smlal v26.4s, v20.4h, v1.h[4]
+    smlal2 v27.4s, v20.8h, v1.h[4]
+    smlal v26.4s, v21.4h, v1.h[5]
+    smlal2 v27.4s, v21.8h, v1.h[5]
+    smlal v26.4s, v22.4h, v1.h[6]
+    smlal2 v27.4s, v22.8h, v1.h[6]
+    smlal v26.4s, v23.4h, v1.h[7]
+    smlal2 v27.4s, v23.8h, v1.h[7]
+
+    smlal v28.4s, v16.4h, v2.h[0] // src row 2
+    smlal2 v29.4s, v16.8h, v2.h[0]
+    smlal v28.4s, v17.4h, v2.h[1]
+    smlal2 v29.4s, v17.8h, v2.h[1]
+    smlal v28.4s, v18.4h, v2.h[2]
+    smlal2 v29.4s, v18.8h, v2.h[2]
+    smlal v28.4s, v19.4h, v2.h[3]
+    smlal2 v29.4s, v19.8h, v2.h[3]
+    smlal v28.4s, v20.4h, v2.h[4]
+    smlal2 v29.4s, v20.8h, v2.h[4]
+    smlal v28.4s, v21.4h, v2.h[5]
+    smlal2 v29.4s, v21.8h, v2.h[5]
+    smlal v28.4s, v22.4h, v2.h[6]
+    smlal2 v29.4s, v22.8h, v2.h[6]
+    smlal v28.4s, v23.4h, v2.h[7]
+    smlal2 v29.4s, v23.8h, v2.h[7]
+
+    smlal v30.4s, v16.4h, v3.h[0] // src row 3
+    smlal2 v31.4s, v16.8h, v3.h[0]
+    smlal v30.4s, v17.4h, v3.h[1]
+    smlal2 v31.4s, v17.8h, v3.h[1]
+    smlal v30.4s, v18.4h, v3.h[2]
+    smlal2 v31.4s, v18.8h, v3.h[2]
+    smlal v30.4s, v19.4h, v3.h[3]
+    smlal2 v31.4s, v19.8h, v3.h[3]
+    smlal v30.4s, v20.4h, v3.h[4]
+    smlal2 v31.4s, v20.8h, v3.h[4]
+    smlal v30.4s, v21.4h, v3.h[5]
+    smlal2 v31.4s, v21.8h, v3.h[5]
+    smlal v30.4s, v22.4h, v3.h[6]
+    smlal2 v31.4s, v22.8h, v3.h[6]
+    smlal v30.4s, v23.4h, v3.h[7]
+    smlal2 v31.4s, v23.8h, v3.h[7]
+
+    add x5, x5, #16 // next 8 samples of each src row
+    add x8, x8, #1024 // next 8 table rows (8 * 128 bytes)
+    cmp x5, #128 // a full 64-sample row has been consumed after 8 passes
+    bne tx_dct2_pb64_loopi
+    // rounded, saturating narrow of the s32 sums to s16; v0-v3 = outputs 0-3 of rows 0-3, v4-v7 = outputs 4-7
+    cmp w4, #6 // only shift == 6 is special-cased: every other value takes the #11 path below
+    bne tx_dct2_pb64_shift11
+    sqrshrn v0.4h, v24.4s, #6
+    sqrshrn v1.4h, v26.4s, #6
+    sqrshrn v2.4h, v28.4s, #6
+    sqrshrn v3.4h, v30.4s, #6
+    sqrshrn v4.4h, v25.4s, #6
+    sqrshrn v5.4h, v27.4s, #6
+    sqrshrn v6.4h, v29.4s, #6
+    sqrshrn v7.4h, v31.4s, #6
+    b tx_dct2_pb64_end
+
+tx_dct2_pb64_shift11:
+    sqrshrn v0.4h, v24.4s, #11
+    sqrshrn v1.4h, v26.4s, #11
+    sqrshrn v2.4h, v28.4s, #11
+    sqrshrn v3.4h, v30.4s, #11
+    sqrshrn v4.4h, v25.4s, #11
+    sqrshrn v5.4h, v27.4s, #11
+    sqrshrn v6.4h, v29.4s, #11
+    sqrshrn v7.4h, v31.4s, #11
+    // transposed store: each st4 writes one output index for the 4 src rows as 4 adjacent s16, then steps one dst row (x2 bytes)
+tx_dct2_pb64_end:
+    st4 {v0.h - v3.h}[0], [x11], x2
+    st4 {v0.h - v3.h}[1], [x11], x2
+    st4 {v0.h - v3.h}[2], [x11], x2
+    st4 {v0.h - v3.h}[3], [x11], x2
+    st4 {v4.h - v7.h}[0], [x11], x2
+    st4 {v4.h - v7.h}[1], [x11], x2
+    st4 {v4.h - v7.h}[2], [x11], x2
+    st4 {v4.h - v7.h}[3], [x11], x2
+    // the i loop advanced x8 by 8 * 1024 = 8192; subtracting 8176 leaves it 16 bytes (8 coefficients) past the previous column group
+    sub x8, x8, #4095 // split in two because 8176 does not fit a single sub immediate
+    sub x8, x8, #4081 //8176=4095 + 4081
+    cmp x8, #128 // 8 column groups x 16 bytes = all 64 outputs of the 4 rows
+    bne tx_dct2_pb64_loopj
+
+    add x0, x0, #512 // src: skip the 4 rows just processed (4 * 128 bytes)
+    add x1, x1, #8 // dst: move 4 s16 to the right for the next 4 src rows
+    subs w3, w3, #4 // count down in 32 bits: limit_line is an int, so the upper half of x3 is not guaranteed
+    bgt tx_dct2_pb64_loopk
+
+    ret
+
+#endif
+#endif
\ No newline at end of file
diff --git a/build/android/app/src/main/jni/src/armv8/transform_arm64.c b/build/android/app/src/main/jni/src/armv8/transform_arm64.c
index dccc931..7954173 100644
--- a/build/android/app/src/main/jni/src/armv8/transform_arm64.c
+++ b/build/android/app/src/main/jni/src/armv8/transform_arm64.c
@@ -1,9 +1,6 @@
 #include "arm64.h"
 
 #if defined(__arm64__)
-#define trans_test 0
-
-#ifndef trans_test
 void uavs3e_trans_dct2_w4_h4_arm64(s16 *src, s16 *dst, int bit_depth)
 {
     ALIGNED_16(s16 tmp[4*4]);
@@ -164,6 +161,5 @@ void uavs3e_trans_dct2_w64_h64_arm64(s16 *src, s16 *dst, int bit_depth)
     tx_dct2_pb64_arm64(src, tmp, 64, 64, 4 + bit_depth - 8);
     tx_dct2_pb64_arm64(tmp, dst, 64, 32, 11);
 }
-#endif
 
 #endif
\ No newline at end of file