From 14690a66cbd0faadb8f7b3edd2b27388eae72ea5 Mon Sep 17 00:00:00 2001 From: Yang Hau Date: Fri, 28 Jun 2024 18:28:50 +0800 Subject: [PATCH] wip --- .github/workflows/main.yml | 4 +- sse2neon.h | 111 +++++++++++++++++++++++-------------- 2 files changed, 71 insertions(+), 44 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index a143629c..f2a7cb46 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -57,9 +57,9 @@ jobs: matrix: arch_with_features: [ # {arch: armv7, feature: none, arch_cflags: none}, - {arch: aarch64, feature: none, arch_cflags: -O3}, + # {arch: aarch64, feature: none, arch_cflags: none}, # {arch: aarch64, feature: crypto+crc, arch_cflags: none}, - # {arch: armv7, feature: none, arch_cflags: '-mcpu=cortex-a32 -mfpu=neon-fp-armv8'} + {arch: armv7, feature: none, arch_cflags: '-mcpu=cortex-a32 -mfpu=neon-fp-armv8 -O3'} ] cxx_compiler: [g++-10, clang++-11] steps: diff --git a/sse2neon.h b/sse2neon.h index 0791cfbd..060aad69 100644 --- a/sse2neon.h +++ b/sse2neon.h @@ -2992,11 +2992,14 @@ FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b) return vreinterpretq_m128d_f64( vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - double *da = (double *) &a; - double *db = (double *) &b; + bit64_union_t a0, a1, b0, b1; + a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); + a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); + b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1); double c[2]; - c[0] = da[0] + db[0]; - c[1] = da[1] + db[1]; + c[0] = a0.f64 + b0.f64; + c[1] = a1.f64 + b1.f64; return vld1q_f32((float32_t *) c); #endif } @@ -3010,11 +3013,13 @@ FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b) #if defined(__aarch64__) || defined(_M_ARM64) return _mm_move_sd(a, _mm_add_pd(a, b)); #else - double *da = (double *) &a; - double *db = (double *) &b; + bit64_union_t a0, a1, b0; + a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); + a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); double c[2]; - c[0] = da[0] + db[0]; - c[1] = da[1]; + c[0] = a0.f64 + b0.f64; + c[1] = a1.f64; return vld1q_f32((float32_t *) c); #endif } @@ -3868,9 +3873,10 @@ FORCE_INLINE_OPTNONE __m128i _mm_cvtpd_epi32(__m128d a) vcombine_s32(vmovn_s64(integers), vdup_n_s32(0))); #else __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); - double d0 = ((double *) &rnd)[0]; - double d1 = ((double *) &rnd)[1]; - return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0); + bit64_union_t d0, d1; + d0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0); + d1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 1); + return _mm_set_epi32(0, 0, (int32_t) d1.f64, (int32_t) d0.f64); #endif } @@ -3880,9 +3886,10 @@ FORCE_INLINE_OPTNONE __m128i _mm_cvtpd_epi32(__m128d a) FORCE_INLINE_OPTNONE __m64 _mm_cvtpd_pi32(__m128d a) { __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); - double d0 = ((double *) &rnd)[0]; - double d1 = ((double *) &rnd)[1]; - int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1}; + bit64_union_t d0, d1; + d0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0); + d1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 1); + int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0.f64, (int32_t) d1.f64}; return vreinterpret_m64_s32(vld1_s32(data)); } @@ -4502,9 +4509,11 @@ FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b) #if defined(__aarch64__) || defined(_M_ARM64) return _mm_move_sd(a, _mm_max_pd(a, b)); #else - double *da = (double *) &a; - double *db = (double *) &b; - double c[2] = {da[0] > db[0] ? da[0] : db[0], da[1]}; + bit64_union_t a0, a1, b0; + a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); + a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); + double c[2] = {a0.f64 > b0.f64 ? a0.f64 : b0.f64, a1.f64}; return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c)); #endif } @@ -4563,9 +4572,11 @@ FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b) #if defined(__aarch64__) || defined(_M_ARM64) return _mm_move_sd(a, _mm_min_pd(a, b)); #else - double *da = (double *) &a; - double *db = (double *) &b; - double c[2] = {da[0] < db[0] ? da[0] : db[0], da[1]}; + bit64_union_t a0, a1, b0; + a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); + a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); + double c[2] = {a0.f64 < b0.f64 ? a0.f64 : b0.f64, a1.f64}; return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c)); #endif } @@ -4720,11 +4731,14 @@ FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b) return vreinterpretq_m128d_f64( vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - double *da = (double *) &a; - double *db = (double *) &b; + bit64_union_t a0, a1, b0, b1; + a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); + a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); + b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1); double c[2]; - c[0] = da[0] * db[0]; - c[1] = da[1] * db[1]; + c[0] = a0.f64 * b0.f64; + c[1] = a1.f64 * b1.f64; return vld1q_f32((float32_t *) c); #endif } @@ -5683,11 +5697,14 @@ FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b) return vreinterpretq_m128d_f64( vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - double *da = (double *) &a; - double *db = (double *) &b; + bit64_union_t a0, a1, b0, b1; + a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); + a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); + b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1); double c[2]; - c[0] = da[0] - db[0]; - c[1] = da[1] - db[1]; + c[0] = a0.f64 - b0.f64; + c[1] = a1.f64 - b1.f64; return vld1q_f32((float32_t *) c); #endif } @@ -5991,9 +6008,12 @@ FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b) return vreinterpretq_m128d_f64( vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - double *da = (double *) &a; - double *db = (double *) &b; - double c[] = {da[0] + da[1], db[0] + db[1]}; + bit64_union_t a0, a1, b0, b1; + a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); + a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); + b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1); + double c[] = {a0.f64 + a1.f64, b0.f64 + b1.f64}; return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c)); #endif } @@ -6019,17 +6039,20 @@ FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b) // Horizontally subtract adjacent pairs of double-precision (64-bit) // floating-point elements in a and b, and pack the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pd -FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b) +FORCE_INLINE __m128d _mm_hsub_pd(__m128d a, __m128d b) { #if defined(__aarch64__) || defined(_M_ARM64) - float64x2_t a = vreinterpretq_f64_m128d(_a); - float64x2_t b = vreinterpretq_f64_m128d(_b); + float64x2_t _a = vreinterpretq_f64_m128d(a); + float64x2_t _b = vreinterpretq_f64_m128d(b); return vreinterpretq_m128d_f64( - vsubq_f64(vuzp1q_f64(a, b), vuzp2q_f64(a, b))); + vsubq_f64(vuzp1q_f64(_a, _b), vuzp2q_f64(_a, _b))); #else - double *da = (double *) &_a; - double *db = (double *) &_b; - double c[] = {da[0] - da[1], db[0] - db[1]}; + bit64_union_t a0, a1, b0, b1; + a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); + a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); + b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1); + double c[] = {a0.f64 - a1.f64, b0.f64 - b1.f64}; return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c)); #endif } @@ -6825,8 +6848,10 @@ FORCE_INLINE __m128d _mm_ceil_pd(__m128d a) #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a))); #else - double *f = (double *) &a; - return _mm_set_pd(ceil(f[1]), ceil(f[0])); + bit64_union_t a0, a1; + a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); + a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + return _mm_set_pd(ceil(a1.f64), ceil(a0.f64)); #endif } @@ -7141,8 +7166,10 @@ FORCE_INLINE __m128d _mm_floor_pd(__m128d a) #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a))); #else - double *f = (double *) &a; - return _mm_set_pd(floor(f[1]), floor(f[0])); + bit64_union_t a0, a1; + a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); + a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + return _mm_set_pd(floor(a1.f64), floor(a0.f64)); #endif }