From 72eba3a06a7ab691f55ca2ac2cedcb0c2bc90728 Mon Sep 17 00:00:00 2001 From: Yang Hau Date: Fri, 28 Jun 2024 15:24:26 +0800 Subject: [PATCH] wip --- sse2neon.h | 304 +++++++++++++++++++++++++++++------------------------ 1 file changed, 168 insertions(+), 136 deletions(-) diff --git a/sse2neon.h b/sse2neon.h index 3d0713f4..efd29365 100644 --- a/sse2neon.h +++ b/sse2neon.h @@ -3274,8 +3274,8 @@ FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b) b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1); uint64_t d[2]; - d[0] = (a0.f64) >= (b0.f64) ? ~UINT64_C(0) : UINT64_C(0); - d[1] = (a1.f64) >= (b1.f64) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = a0.f64 >= b0.f64 ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1.f64 >= b1.f64 ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3291,12 +3291,13 @@ FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b) return _mm_move_sd(a, _mm_cmpge_pd(a, b)); #else // expand "_mm_cmpge_pd()" to reduce unnecessary operations - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + bit64_union_t a0, a1, b0; + a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); + a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); uint64_t d[2]; - d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); - d[1] = a1; + d[0] = a0.f64 >= b0.f64 ? 
~UINT64_C(0) : UINT64_C(0); + d[1] = a1.u64; return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3338,13 +3339,14 @@ FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b) return vreinterpretq_m128d_u64( vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + bit64_union_t a0, a1, b0, b1; + a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); + a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); + b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1); uint64_t d[2]; - d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); - d[1] = (*(double *) &a1) > (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = a0.f64 > b0.f64 ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1.f64 > b1.f64 ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3360,12 +3362,13 @@ FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b) return _mm_move_sd(a, _mm_cmpgt_pd(a, b)); #else // expand "_mm_cmpge_pd()" to reduce unnecessary operations - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + bit64_union_t a0, a1, b0; + a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); + a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); uint64_t d[2]; - d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); - d[1] = a1; + d[0] = a0.f64 > b0.f64 ? 
~UINT64_C(0) : UINT64_C(0); + d[1] = a1.u64; return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3380,13 +3383,14 @@ FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b) return vreinterpretq_m128d_u64( vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + bit64_union_t a0, a1, b0, b1; + a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); + a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); + b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1); uint64_t d[2]; - d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); - d[1] = (*(double *) &a1) <= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = a0.f64 <= b0.f64 ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1.f64 <= b1.f64 ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3402,12 +3406,13 @@ FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b) return _mm_move_sd(a, _mm_cmple_pd(a, b)); #else // expand "_mm_cmpge_pd()" to reduce unnecessary operations - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + bit64_union_t a0, a1, b0; + a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); + a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); uint64_t d[2]; - d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); - d[1] = a1; + d[0] = a0.f64 <= b0.f64 ? 
~UINT64_C(0) : UINT64_C(0); + d[1] = a1.u64; return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3452,13 +3457,14 @@ FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b) return vreinterpretq_m128d_u64( vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + bit64_union_t a0, a1, b0, b1; + a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); + a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); + b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1); uint64_t d[2]; - d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); - d[1] = (*(double *) &a1) < (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = a0.f64 < b0.f64 ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1.f64 < b1.f64 ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3473,12 +3479,13 @@ FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b) #if defined(__aarch64__) || defined(_M_ARM64) return _mm_move_sd(a, _mm_cmplt_pd(a, b)); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + bit64_union_t a0, a1, b0; + a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); + a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); uint64_t d[2]; - d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); - d[1] = a1; + d[0] = a0.f64 < b0.f64 ? 
~UINT64_C(0) : UINT64_C(0); + d[1] = a1.u64; return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3520,15 +3527,16 @@ FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b) vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), vdupq_n_u64(UINT64_MAX))); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + bit64_union_t a0, a1, b0, b1; + a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); + a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); + b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1); uint64_t d[2]; d[0] = - !((*(double *) &a0) >= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0); + !(a0.f64 >= b0.f64) ? ~UINT64_C(0) : UINT64_C(0); d[1] = - !((*(double *) &a1) >= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0); + !(a1.f64 >= b1.f64) ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3553,15 +3561,16 @@ FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b) vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), vdupq_n_u64(UINT64_MAX))); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + bit64_union_t a0, a1, b0, b1; + a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); + a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); + b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1); uint64_t d[2]; d[0] = - !((*(double *) &a0) > (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0); + !(a0.f64 > b0.f64) ? 
~UINT64_C(0) : UINT64_C(0); d[1] = - !((*(double *) &a1) > (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0); + !(a1.f64 > b1.f64) ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3586,15 +3595,16 @@ FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b) vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), vdupq_n_u64(UINT64_MAX))); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + bit64_union_t a0, a1, b0, b1; + a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); + a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); + b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1); uint64_t d[2]; d[0] = - !((*(double *) &a0) <= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0); + !(a0.f64 <= b0.f64) ? ~UINT64_C(0) : UINT64_C(0); d[1] = - !((*(double *) &a1) <= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0); + !(a1.f64 <= b1.f64) ? 
~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3619,15 +3629,16 @@ FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b) vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), vdupq_n_u64(UINT64_MAX))); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + bit64_union_t a0, a1, b0, b1; + a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); + a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); + b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1); uint64_t d[2]; d[0] = - !((*(double *) &a0) < (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0); + !(a0.f64 < b0.f64) ? ~UINT64_C(0) : UINT64_C(0); d[1] = - !((*(double *) &a1) < (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0); + !(a1.f64 < b1.f64) ? 
~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3655,17 +3666,18 @@ FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b) vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b)); return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b)); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + bit64_union_t a0, a1, b0, b1; + a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); + a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); + b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1); uint64_t d[2]; - d[0] = ((*(double *) &a0) == (*(double *) &a0) && - (*(double *) &b0) == (*(double *) &b0)) + d[0] = (a0.f64 == a0.f64 && + b0.f64 == b0.f64) ? ~UINT64_C(0) : UINT64_C(0); - d[1] = ((*(double *) &a1) == (*(double *) &a1) && - (*(double *) &b1) == (*(double *) &b1)) + d[1] = (a1.f64 == a1.f64 && + b1.f64 == b1.f64) ? ~UINT64_C(0) : UINT64_C(0); @@ -3682,15 +3694,16 @@ FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b) #if defined(__aarch64__) || defined(_M_ARM64) return _mm_move_sd(a, _mm_cmpord_pd(a, b)); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + bit64_union_t a0, a1, b0; + a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); + a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); uint64_t d[2]; - d[0] = ((*(double *) &a0) == (*(double *) &a0) && - (*(double *) &b0) == (*(double *) &b0)) + d[0] = (a0.f64 == a0.f64 && + b0.f64 == b0.f64) ? 
~UINT64_C(0) : UINT64_C(0); - d[1] = a1; + d[1] = a1.u64; return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3710,17 +3723,18 @@ FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b) return vreinterpretq_m128d_s32( vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b)))); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + bit64_union_t a0, a1, b0, b1; + a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); + a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); + b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1); uint64_t d[2]; - d[0] = ((*(double *) &a0) == (*(double *) &a0) && - (*(double *) &b0) == (*(double *) &b0)) + d[0] = (a0.f64 == a0.f64 && + b0.f64 == b0.f64) ? UINT64_C(0) : ~UINT64_C(0); - d[1] = ((*(double *) &a1) == (*(double *) &a1) && - (*(double *) &b1) == (*(double *) &b1)) + d[1] = (a1.f64 == a1.f64 && + b1.f64 == b1.f64) ? UINT64_C(0) : ~UINT64_C(0); @@ -3737,15 +3751,16 @@ FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b) #if defined(__aarch64__) || defined(_M_ARM64) return _mm_move_sd(a, _mm_cmpunord_pd(a, b)); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + bit64_union_t a0, a1, b0; + a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); + a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); uint64_t d[2]; - d[0] = ((*(double *) &a0) == (*(double *) &a0) && - (*(double *) &b0) == (*(double *) &b0)) + d[0] = (a0.f64 == a0.f64 && + b0.f64 == b0.f64) ? 
UINT64_C(0) : ~UINT64_C(0); - d[1] = a1; + d[1] = a1.u64; return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3758,11 +3773,12 @@ FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b) { #if defined(__aarch64__) || defined(_M_ARM64) return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1; -#else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); +#else + bit64_union_t a0, b0; + a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); + b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); - return (*(double *) &a0 >= *(double *) &b0); + return (a0.f64 >= b0.f64); #endif } @@ -3774,10 +3790,11 @@ FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b) #if defined(__aarch64__) || defined(_M_ARM64) return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1; #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + bit64_union_t a0, b0; + a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); + b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); - return (*(double *) &a0 > *(double *) &b0); + return (a0.f64 > b0.f64); #endif } @@ -3789,10 +3806,11 @@ FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b) #if defined(__aarch64__) || defined(_M_ARM64) return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1; #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + bit64_union_t a0, b0; + a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); + b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); - return (*(double *) &a0 <= *(double *) &b0); + return (a0.f64 <= b0.f64); #endif } @@ -3804,10 +3822,11 @@ FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b) #if defined(__aarch64__) || defined(_M_ARM64) return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1; #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t 
b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + bit64_union_t a0, b0; + a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); + b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); - return (*(double *) &a0 < *(double *) &b0); + return (a0.f64 < b0.f64); #endif } @@ -3904,9 +3923,10 @@ FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a) float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a)); return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0))); #else - float a0 = (float) ((double *) &a)[0]; - float a1 = (float) ((double *) &a)[1]; - return _mm_set_ps(0, 0, a1, a0); + bit64_union_t a0, a1; + a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); + a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + return _mm_set_ps(0, 0, (float)a1.f64, (float)a0.f64); #endif } @@ -4018,8 +4038,9 @@ FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a) return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0); #else __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); - double ret = ((double *) &rnd)[0]; - return (int32_t) ret; + bit64_union_t ret; + ret.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0); + return (int32_t) ret.f64; #endif } @@ -4032,8 +4053,9 @@ FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a) return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0); #else __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); - double ret = ((double *) &rnd)[0]; - return (int64_t) ret; + bit64_union_t ret; + ret.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0); + return (int64_t) ret.f64; #endif } @@ -4054,7 +4076,9 @@ FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b) vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0), vreinterpretq_f32_m128(a), 0)); #else - return vreinterpretq_m128_f32(vsetq_lane_f32((float) ((double *) &b)[0], + bit64_union_t b0; + b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); + return vreinterpretq_m128_f32(vsetq_lane_f32((float) b0.f64, vreinterpretq_f32_m128(a), 0)); 
#endif } @@ -4087,9 +4111,10 @@ FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b) return vreinterpretq_m128d_f64( vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0)); #else - double bf = (double) b; + bit64_union_t bf; + bf.f64 = (double) b; return vreinterpretq_m128d_s64( - vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0)); + vsetq_lane_s64(bf.i64, vreinterpretq_s64_m128d(a), 0)); #endif } @@ -4115,9 +4140,10 @@ FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b) return vreinterpretq_m128d_f64( vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0)); #else - double bf = (double) b; + bit64_union_t bf; + bf.f64 = (double) b; return vreinterpretq_m128d_s64( - vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0)); + vsetq_lane_s64(bf.i64, vreinterpretq_s64_m128d(a), 0)); #endif } @@ -4147,13 +4173,14 @@ FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b) { - double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0); + bit64_union_t d; + d.f64 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0); #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_f64( - vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0)); + vsetq_lane_f64(d.f64, vreinterpretq_f64_m128d(a), 0)); #else return vreinterpretq_m128d_s64( - vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0)); + vsetq_lane_s64(d.i64, vreinterpretq_s64_m128d(a), 0)); #endif } @@ -4172,9 +4199,10 @@ FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_pi32 FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a) { - double a0 = ((double *) &a)[0]; - double a1 = ((double *) &a)[1]; - int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1}; + bit64_union_t a0, a1; + a0.u64 = 
vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); + a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0.f64, (int32_t) a1.f64}; return vreinterpret_m64_s32(vld1_s32(data)); } @@ -4471,13 +4499,14 @@ FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b) vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #endif #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + bit64_union_t a0, a1, b0, b1; + a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); + a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); + b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1); uint64_t d[2]; - d[0] = (*(double *) &a0) > (*(double *) &b0) ? a0 : b0; - d[1] = (*(double *) &a1) > (*(double *) &b1) ? a1 : b1; + d[0] = a0.f64 > b0.f64 ? a0.u64 : b0.u64; + d[1] = a1.f64 > b1.f64 ? a1.u64 : b1.u64; return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -4532,13 +4561,14 @@ FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b) vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #endif #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + bit64_union_t a0, a1, b0, b1; + a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); + a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); + b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1); uint64_t d[2]; - d[0] = (*(double *) &a0) < (*(double *) &b0) ? 
a0 : b0; - d[1] = (*(double *) &a1) < (*(double *) &b1) ? a1 : b1; + d[0] = a0.f64 < b0.f64 ? a0.u64 : b0.u64; + d[1] = a1.f64 < b1.f64 ? a1.u64 : b1.u64; return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } @@ -5003,7 +5033,9 @@ FORCE_INLINE __m128d _mm_set1_pd(double d) #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_f64(vdupq_n_f64(d)); #else - return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d)); + bit64_union_t _d; + _d.f64 = d; + return vreinterpretq_m128d_s64(vdupq_n_s64(_d.i64)); #endif }