From da4b834b5f3b38d9448c5c63b19eb986a55d54f1 Mon Sep 17 00:00:00 2001 From: Yang Hau Date: Tue, 9 Jul 2024 02:02:05 +0800 Subject: [PATCH] refactor: Use memcpy for type punning --- Makefile | 4 +- sse2neon.h | 537 +++++++++++++++++++++++-------------------------- tests/common.h | 37 ++-- tests/impl.cpp | 198 +++++++++--------- 4 files changed, 378 insertions(+), 398 deletions(-) diff --git a/Makefile b/Makefile index 999a3a7b..aaee7fe4 100644 --- a/Makefile +++ b/Makefile @@ -57,7 +57,7 @@ ARCH_CFLAGS := $(ARCH_CFLAGS)+$(subst $(COMMA),+,$(FEATURE)) endif endif -CXXFLAGS += -Wall -Wcast-qual -I. $(ARCH_CFLAGS) -std=gnu++14 +CXXFLAGS += -Wall -Wcast-qual -I. $(ARCH_CFLAGS) -O3 -std=gnu++14 LDFLAGS += -lm OBJS = \ tests/binding.o \ @@ -77,7 +77,7 @@ $(EXEC): $(OBJS) check: tests/main ifeq ($(processor),$(filter $(processor),aarch64 arm64 arm armv7l)) - $(CC) $(ARCH_CFLAGS) -c sse2neon.h + $(CC) $(ARCH_CFLAGS) -c -O3 sse2neon.h endif $(EXEC_WRAPPER) $^ diff --git a/sse2neon.h b/sse2neon.h index d64a36ec..6a8a3773 100644 --- a/sse2neon.h +++ b/sse2neon.h @@ -130,6 +130,20 @@ #include #include +#include + +FORCE_INLINE double recast_f64(uint64_t u64) +{ + double f64; + memcpy(&f64, &u64, sizeof(uint64_t)); + return f64; +} +FORCE_INLINE int64_t recast_i64(double f64) +{ + int64_t i64; + memcpy(&i64, &f64, sizeof(uint64_t)); + return i64; +} #if defined(_WIN32) /* Definitions for _mm_{malloc,free} are provided by @@ -566,17 +580,6 @@ typedef union ALIGN_STRUCT(16) SIMDVec { #define _MM_GET_DENORMALS_ZERO_MODE _sse2neon_mm_get_denormals_zero_mode #define _MM_SET_DENORMALS_ZERO_MODE _sse2neon_mm_set_denormals_zero_mode -typedef union bit64_union_t { - double f64; - int64_t i64; - uint64_t u64; -} bit64_union_t; -typedef union bit32_union_t { - float f32; - int32_t i32; - uint32_t u32; -} bit32_union_t; - // Function declaration // SSE FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void); @@ -2992,14 +2995,13 @@ FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b) return vreinterpretq_m128d_f64( vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - bit64_union_t a0, a1, b0, b1; - a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); - a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); - b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); - b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1); + double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); double c[2]; - c[0] = a0.f64 + b0.f64; - c[1] = a1.f64 + b1.f64; + c[0] = a0 + b0; + c[1] = a1 + b1; return vld1q_f32((float32_t *) c); #endif } @@ -3013,13 +3015,13 @@ FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b) #if defined(__aarch64__) || defined(_M_ARM64) return _mm_move_sd(a, _mm_add_pd(a, b)); #else - bit64_union_t a0, a1, b0; - a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); - a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); - b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); + double a0, a1, b0; + a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); double c[2]; - c[0] = a0.f64 + b0.f64; - c[1] = a1.f64; + c[0] = a0 + b0; + c[1] = a1; return vld1q_f32((float32_t *) c); #endif } @@ -3273,14 +3275,13 @@ FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b) return vreinterpretq_m128d_u64( vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - bit64_union_t a0, a1, b0, b1; - a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); - a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); - b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); - b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1); + double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); uint64_t d[2]; - d[0] = a0.f64 >= b0.f64 ? ~UINT64_C(0) : UINT64_C(0); - d[1] = a1.f64 >= b1.f64 ? ~UINT64_C(0) : UINT64_C(0); + d[0] = a0 >= b0 ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1 >= b1 ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3296,13 +3297,13 @@ FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b) return _mm_move_sd(a, _mm_cmpge_pd(a, b)); #else // expand "_mm_cmpge_pd()" to reduce unnecessary operations - bit64_union_t a0, a1, b0; - a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); - a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); - b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); + double a0, b0; + a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); uint64_t d[2]; - d[0] = a0.f64 >= b0.f64 ? ~UINT64_C(0) : UINT64_C(0); - d[1] = a1.u64; + d[0] = a0 >= b0 ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1; return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3344,14 +3345,13 @@ FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b) return vreinterpretq_m128d_u64( vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - bit64_union_t a0, a1, b0, b1; - a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); - a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); - b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); - b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1); + double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); uint64_t d[2]; - d[0] = a0.f64 > b0.f64 ? ~UINT64_C(0) : UINT64_C(0); - d[1] = a1.f64 > b1.f64 ? ~UINT64_C(0) : UINT64_C(0); + d[0] = a0 > b0 ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1 > b1 ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3367,13 +3367,13 @@ FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b) return _mm_move_sd(a, _mm_cmpgt_pd(a, b)); #else // expand "_mm_cmpge_pd()" to reduce unnecessary operations - bit64_union_t a0, a1, b0; - a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); - a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); - b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); + double a0, b0; + a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); uint64_t d[2]; - d[0] = a0.f64 > b0.f64 ? ~UINT64_C(0) : UINT64_C(0); - d[1] = a1.u64; + d[0] = a0 > b0 ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1; return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3388,14 +3388,13 @@ FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b) return vreinterpretq_m128d_u64( vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - bit64_union_t a0, a1, b0, b1; - a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); - a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); - b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); - b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1); + double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); uint64_t d[2]; - d[0] = a0.f64 <= b0.f64 ? ~UINT64_C(0) : UINT64_C(0); - d[1] = a1.f64 <= b1.f64 ? ~UINT64_C(0) : UINT64_C(0); + d[0] = a0 <= b0 ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1 <= b1 ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3411,13 +3410,13 @@ FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b) return _mm_move_sd(a, _mm_cmple_pd(a, b)); #else // expand "_mm_cmpge_pd()" to reduce unnecessary operations - bit64_union_t a0, a1, b0; - a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); - a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); - b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); + double a0, b0; + a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); uint64_t d[2]; - d[0] = a0.f64 <= b0.f64 ? ~UINT64_C(0) : UINT64_C(0); - d[1] = a1.u64; + d[0] = a0 <= b0 ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1; return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3462,14 +3461,13 @@ FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b) return vreinterpretq_m128d_u64( vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - bit64_union_t a0, a1, b0, b1; - a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); - a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); - b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); - b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1); + double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); uint64_t d[2]; - d[0] = a0.f64 < b0.f64 ? ~UINT64_C(0) : UINT64_C(0); - d[1] = a1.f64 < b1.f64 ? ~UINT64_C(0) : UINT64_C(0); + d[0] = a0 < b0 ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1 < b1 ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3484,13 +3482,13 @@ FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b) #if defined(__aarch64__) || defined(_M_ARM64) return _mm_move_sd(a, _mm_cmplt_pd(a, b)); #else - bit64_union_t a0, a1, b0; - a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); - a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); - b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); + double a0, b0; + a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); uint64_t d[2]; - d[0] = a0.f64 < b0.f64 ? ~UINT64_C(0) : UINT64_C(0); - d[1] = a1.u64; + d[0] = a0 < b0 ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1; return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3532,14 +3530,13 @@ FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b) vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), vdupq_n_u64(UINT64_MAX))); #else - bit64_union_t a0, a1, b0, b1; - a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); - a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); - b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); - b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1); + double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); uint64_t d[2]; - d[0] = !(a0.f64 >= b0.f64) ? ~UINT64_C(0) : UINT64_C(0); - d[1] = !(a1.f64 >= b1.f64) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = !(a0 >= b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = !(a1 >= b1) ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3564,14 +3561,13 @@ FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b) vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), vdupq_n_u64(UINT64_MAX))); #else - bit64_union_t a0, a1, b0, b1; - a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); - a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); - b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); - b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1); + double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); uint64_t d[2]; - d[0] = !(a0.f64 > b0.f64) ? ~UINT64_C(0) : UINT64_C(0); - d[1] = !(a1.f64 > b1.f64) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = !(a0 > b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = !(a1 > b1) ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3596,14 +3592,13 @@ FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b) vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), vdupq_n_u64(UINT64_MAX))); #else - bit64_union_t a0, a1, b0, b1; - a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); - a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); - b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); - b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1); + double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); uint64_t d[2]; - d[0] = !(a0.f64 <= b0.f64) ? ~UINT64_C(0) : UINT64_C(0); - d[1] = !(a1.f64 <= b1.f64) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = !(a0 <= b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = !(a1 <= b1) ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3628,14 +3623,13 @@ FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b) vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), vdupq_n_u64(UINT64_MAX))); #else - bit64_union_t a0, a1, b0, b1; - a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); - a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); - b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); - b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1); + double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); uint64_t d[2]; - d[0] = !(a0.f64 < b0.f64) ? ~UINT64_C(0) : UINT64_C(0); - d[1] = !(a1.f64 < b1.f64) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = !(a0 < b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = !(a1 < b1) ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3663,14 +3657,13 @@ FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b) vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b)); return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b)); #else - bit64_union_t a0, a1, b0, b1; - a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); - a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); - b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); - b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1); + double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); uint64_t d[2]; - d[0] = (a0.f64 == a0.f64 && b0.f64 == b0.f64) ? ~UINT64_C(0) : UINT64_C(0); - d[1] = (a1.f64 == a1.f64 && b1.f64 == b1.f64) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = (a0 == a0 && b0 == b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = (a1 == a1 && b1 == b1) ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3685,13 +3678,13 @@ FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b) #if defined(__aarch64__) || defined(_M_ARM64) return _mm_move_sd(a, _mm_cmpord_pd(a, b)); #else - bit64_union_t a0, a1, b0; - a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); - a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); - b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); + double a0, b0; + a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); uint64_t d[2]; - d[0] = (a0.f64 == a0.f64 && b0.f64 == b0.f64) ? ~UINT64_C(0) : UINT64_C(0); - d[1] = a1.u64; + d[0] = (a0 == a0 && b0 == b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1; return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3711,14 +3704,13 @@ FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b) return vreinterpretq_m128d_s32( vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b)))); #else - bit64_union_t a0, a1, b0, b1; - a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); - a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); - b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); - b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1); + double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); uint64_t d[2]; - d[0] = (a0.f64 == a0.f64 && b0.f64 == b0.f64) ? UINT64_C(0) : ~UINT64_C(0); - d[1] = (a1.f64 == a1.f64 && b1.f64 == b1.f64) ? UINT64_C(0) : ~UINT64_C(0); + d[0] = (a0 == a0 && b0 == b0) ? UINT64_C(0) : ~UINT64_C(0); + d[1] = (a1 == a1 && b1 == b1) ? UINT64_C(0) : ~UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3733,13 +3725,13 @@ FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b) #if defined(__aarch64__) || defined(_M_ARM64) return _mm_move_sd(a, _mm_cmpunord_pd(a, b)); #else - bit64_union_t a0, a1, b0; - a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); - a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); - b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); + double a0, b0; + a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); uint64_t d[2]; - d[0] = (a0.f64 == a0.f64 && b0.f64 == b0.f64) ? UINT64_C(0) : ~UINT64_C(0); - d[1] = a1.u64; + d[0] = (a0 == a0 && b0 == b0) ? UINT64_C(0) : ~UINT64_C(0); + d[1] = a1; return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3753,10 +3745,10 @@ FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b) #if defined(__aarch64__) || defined(_M_ARM64) return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1; #else - bit64_union_t a0, b0; - a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); - b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); - return a0.f64 >= b0.f64; + double a0, b0; + a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + return a0 >= b0; #endif } @@ -3768,11 +3760,11 @@ FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b) #if defined(__aarch64__) || defined(_M_ARM64) return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1; #else - bit64_union_t a0, b0; - a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); - b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); + double a0, b0; + a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); - return (a0.f64 > b0.f64); + return (a0 > b0); #endif } @@ -3784,11 +3776,11 @@ FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b) #if defined(__aarch64__) || defined(_M_ARM64) return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1; #else - bit64_union_t a0, b0; - a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); - b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); + double a0, b0; + a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); - return (a0.f64 <= b0.f64); + return (a0 <= b0); #endif } @@ -3800,11 +3792,11 @@ FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b) #if defined(__aarch64__) || defined(_M_ARM64) return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1; #else - bit64_union_t a0, b0; - a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); - b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); + double a0, b0; + a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); - return (a0.f64 < b0.f64); + return (a0 < b0); #endif } @@ -3873,10 +3865,10 @@ FORCE_INLINE_OPTNONE __m128i _mm_cvtpd_epi32(__m128d a) vcombine_s32(vmovn_s64(integers), vdup_n_s32(0))); #else __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); - bit64_union_t d0, d1; - d0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0); - d1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 1); - return _mm_set_epi32(0, 0, (int32_t) d1.f64, (int32_t) d0.f64); + double d0, d1; + d0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0)); + d1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 1)); + return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0); #endif } @@ -3886,10 +3878,10 @@ FORCE_INLINE_OPTNONE __m128i _mm_cvtpd_epi32(__m128d a) FORCE_INLINE_OPTNONE __m64 _mm_cvtpd_pi32(__m128d a) { __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); - bit64_union_t d0, d1; - d0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0); - d1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 1); - int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0.f64, (int32_t) d1.f64}; + double d0, d1; + d0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0)); + d1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 1)); + int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1}; return vreinterpret_m64_s32(vld1_s32(data)); } @@ -3903,10 +3895,10 @@ FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a) float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a)); return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0))); #else - bit64_union_t a0, a1; - a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); - a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); - return _mm_set_ps(0, 0, (float) a1.f64, (float) a0.f64); + double a0, a1; + a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + return _mm_set_ps(0, 0, (float) a1, (float) a0); #endif } @@ -4005,9 +3997,8 @@ FORCE_INLINE double _mm_cvtsd_f64(__m128d a) #if defined(__aarch64__) || defined(_M_ARM64) return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0); #else - bit64_union_t _a; - _a.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); - return _a.f64; + double _a = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + return _a; #endif } @@ -4020,9 +4011,8 @@ FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a) return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0); #else __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); - bit64_union_t ret; - ret.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0); - return (int32_t) ret.f64; + double ret = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0)); + return (int32_t) ret; #endif } @@ -4035,9 +4025,8 @@ FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a) return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0); #else __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); - bit64_union_t ret; - ret.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0); - return (int64_t) ret.f64; + double ret = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0)); + return (int64_t) ret; #endif } @@ -4058,10 +4047,9 @@ FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b) vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0), vreinterpretq_f32_m128(a), 0)); #else - bit64_union_t b0; - b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); + double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); return vreinterpretq_m128_f32( - vsetq_lane_f32((float) b0.f64, vreinterpretq_f32_m128(a), 0)); + vsetq_lane_f32((float) b0, vreinterpretq_f32_m128(a), 0)); #endif } @@ -4093,10 +4081,9 @@ FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b) return vreinterpretq_m128d_f64( vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0)); #else - bit64_union_t bf; - bf.f64 = (double) b; + int64_t _b = recast_i64((double) b); return vreinterpretq_m128d_s64( - vsetq_lane_s64(bf.i64, vreinterpretq_s64_m128d(a), 0)); + vsetq_lane_s64(_b, vreinterpretq_s64_m128d(a), 0)); #endif } @@ -4122,10 +4109,9 @@ FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b) return vreinterpretq_m128d_f64( vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0)); #else - bit64_union_t bf; - bf.f64 = (double) b; + int64_t bf = recast_i64((double) b); return vreinterpretq_m128d_s64( - vsetq_lane_s64(bf.i64, vreinterpretq_s64_m128d(a), 0)); + vsetq_lane_s64(bf, vreinterpretq_s64_m128d(a), 0)); #endif } @@ -4155,14 +4141,13 @@ FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b) { - bit64_union_t d; - d.f64 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0); + double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0); #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_f64( - vsetq_lane_f64(d.f64, vreinterpretq_f64_m128d(a), 0)); + vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0)); #else return vreinterpretq_m128d_s64( - vsetq_lane_s64(d.i64, vreinterpretq_s64_m128d(a), 0)); + vsetq_lane_s64(recast_i64(d), vreinterpretq_s64_m128d(a), 0)); #endif } @@ -4171,10 +4156,10 @@ FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32 FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a) { - bit64_union_t a0, a1; - a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); - a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); - return _mm_set_epi32(0, 0, (int32_t) a1.f64, (int32_t) a0.f64); + double a0, a1; + a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + return _mm_set_epi32(0, 0, (int32_t) a1, (int32_t) a0); } // Convert packed double-precision (64-bit) floating-point elements in a to @@ -4182,10 +4167,10 @@ FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_pi32 FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a) { - bit64_union_t a0, a1; - a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); - a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); - int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0.f64, (int32_t) a1.f64}; + double a0, a1; + a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1}; return vreinterpret_m64_s32(vld1_s32(data)); } @@ -4202,9 +4187,8 @@ FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32 FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a) { - bit64_union_t _a; - _a.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); - return (int32_t) _a.f64; + double _a = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + return (int32_t) _a; } // Convert the lower double-precision (64-bit) floating-point element in a to a @@ -4215,9 +4199,8 @@ FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a) #if defined(__aarch64__) || defined(_M_ARM64) return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0); #else - bit64_union_t _a; - _a.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); - return (int64_t) _a.f64; + double _a = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + return (int64_t) _a; #endif } @@ -4235,14 +4218,13 @@ FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b) return vreinterpretq_m128d_f64( vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - bit64_union_t a0, a1, b0, b1; - a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); - a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); - b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); - b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1); + double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); double c[2]; - c[0] = a0.f64 / b0.f64; - c[1] = a1.f64 / b1.f64; + c[0] = a0 / b0; + c[1] = a1 / b1; return vld1q_f32((float32_t *) c); #endif } @@ -4487,14 +4469,13 @@ FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b) vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #endif #else - bit64_union_t a0, a1, b0, b1; - a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); - a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); - b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); - b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1); + double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); uint64_t d[2]; - d[0] = a0.f64 > b0.f64 ? a0.u64 : b0.u64; - d[1] = a1.f64 > b1.f64 ? a1.u64 : b1.u64; + d[0] = a0 > b0 ? recast_i64(a0) : recast_i64(b0); + d[1] = a1 > b1 ? recast_i64(a1) : recast_i64(b1); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -4509,11 +4490,11 @@ FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b) #if defined(__aarch64__) || defined(_M_ARM64) return _mm_move_sd(a, _mm_max_pd(a, b)); #else - bit64_union_t a0, a1, b0; - a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); - a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); - b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); - double c[2] = {a0.f64 > b0.f64 ? a0.f64 : b0.f64, a1.f64}; + double a0, a1, b0; + a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double c[2] = {a0 > b0 ? a0 : b0, a1}; return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c)); #endif } @@ -4551,14 +4532,13 @@ FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b) vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #endif #else - bit64_union_t a0, a1, b0, b1; - a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); - a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); - b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); - b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1); + double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); uint64_t d[2]; - d[0] = a0.f64 < b0.f64 ? a0.u64 : b0.u64; - d[1] = a1.f64 < b1.f64 ? a1.u64 : b1.u64; + d[0] = a0 < b0 ? recast_i64(a0) : recast_i64(b0); + d[1] = a1 < b1 ? recast_i64(a1) : recast_i64(b1); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } @@ -4572,11 +4552,11 @@ FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b) #if defined(__aarch64__) || defined(_M_ARM64) return _mm_move_sd(a, _mm_min_pd(a, b)); #else - bit64_union_t a0, a1, b0; - a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); - a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); - b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); - double c[2] = {a0.f64 < b0.f64 ? a0.f64 : b0.f64, a1.f64}; + double a0, a1, b0; + a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double c[2] = {a0 < b0 ? a0 : b0, a1}; return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c)); #endif } @@ -4731,14 +4711,13 @@ FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b) return vreinterpretq_m128d_f64( vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - bit64_union_t a0, a1, b0, b1; - a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); - a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); - b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); - b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1); + double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); double c[2]; - c[0] = a0.f64 * b0.f64; - c[1] = a1.f64 * b1.f64; + c[0] = a0 * b0; + c[1] = a1 * b1; return vld1q_f32((float32_t *) c); #endif } @@ -5028,9 +5007,8 @@ FORCE_INLINE __m128d _mm_set1_pd(double d) #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_f64(vdupq_n_f64(d)); #else - bit64_union_t _d; - _d.f64 = d; - return vreinterpretq_m128d_s64(vdupq_n_s64(_d.i64)); + int64_t _d = recast_i64(d); + return vreinterpretq_m128d_s64(vdupq_n_s64(_d)); #endif } @@ -5321,11 +5299,11 @@ FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a) #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a))); #else - bit64_union_t a0, a1; - a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); - a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); - double _a0 = sqrt(a0.f64); - double _a1 = sqrt(a1.f64); + double a0, a1; + a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double _a0 = sqrt(a0); + double _a1 = sqrt(a1); return _mm_set_pd(_a1, _a0); #endif } @@ -5339,10 +5317,10 @@ FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b) #if defined(__aarch64__) || defined(_M_ARM64) return _mm_move_sd(a, _mm_sqrt_pd(b)); #else - bit64_union_t _a, _b; - _a.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); - _b.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); - return _mm_set_pd(_a.f64, sqrt(_b.f64)); + double _a, _b; + _a = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + _b = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + return _mm_set_pd(_a, sqrt(_b)); #endif } @@ -5697,14 +5675,13 @@ FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b) return vreinterpretq_m128d_f64( vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - bit64_union_t a0, a1, b0, b1; - a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); - a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); - b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); - b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1); + double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); double c[2]; - c[0] = a0.f64 - b0.f64; - c[1] = a1.f64 - b1.f64; + c[0] = a0 - b0; + c[1] = a1 - b1; return vld1q_f32((float32_t *) c); #endif } @@ -6008,12 +5985,11 @@ FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b) return vreinterpretq_m128d_f64( vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - bit64_union_t a0, a1, b0, b1; - a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); - a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); - b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); - b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1); - double c[] = {a0.f64 + a1.f64, b0.f64 + b1.f64}; + double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + double c[] = {a0 + a1, b0 + b1}; return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c)); #endif } @@ -6047,12 +6023,11 @@ FORCE_INLINE __m128d _mm_hsub_pd(__m128d a, __m128d b) return vreinterpretq_m128d_f64( vsubq_f64(vuzp1q_f64(_a, _b), vuzp2q_f64(_a, _b))); #else - bit64_union_t a0, a1, b0, b1; - a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); - a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); - b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); - b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1); - double c[] = {a0.f64 - a1.f64, b0.f64 - b1.f64}; + double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + double c[] = {a0 - a1, b0 - b1}; return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c)); #endif } @@ -6848,10 +6823,10 @@ FORCE_INLINE __m128d _mm_ceil_pd(__m128d a) #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a))); #else - bit64_union_t a0, a1; - a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); - a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); - return _mm_set_pd(ceil(a1.f64), ceil(a0.f64)); + double a0, a1; + a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + return _mm_set_pd(ceil(a1), ceil(a0)); #endif } @@ -7059,13 +7034,12 @@ FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm) vgetq_lane_f64(vreinterpretq_f64_m128d(b), 1) : 0; #else - bit64_union_t a0, a1, b0, b1; - a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); - a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); - b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0); - b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1); - double d0 = (imm & 0x10) ? a0.f64 * b0.f64 : 0; - double d1 = (imm & 0x20) ? a1.f64 * b1.f64 : 0; + double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + double d0 = (imm & 0x10) ? a0 * b0 : 0; + double d1 = (imm & 0x20) ? a1 * b1 : 0; #endif __m128d tmp = _mm_set_pd(d1, d0); #endif @@ -7073,10 +7047,9 @@ FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm) #if defined(__aarch64__) || defined(_M_ARM64) double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp)); #else - bit64_union_t _tmp0, _tmp1; - _tmp0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(tmp), 0); - _tmp1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(tmp), 1); - double sum = _tmp0.f64 + _tmp1.f64; + double _tmp0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(tmp), 0)); + double _tmp1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(tmp), 1)); + double sum = _tmp0 + _tmp1; #endif // Conditionally store the sum const __m128d sumMask = @@ -7166,10 +7139,10 @@ FORCE_INLINE __m128d _mm_floor_pd(__m128d a) #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a))); #else - bit64_union_t a0, a1; - a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0); - a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); - return _mm_set_pd(floor(a1.f64), floor(a0.f64)); + double a0, a1; + a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + return _mm_set_pd(floor(a1), floor(a0)); #endif } diff --git a/tests/common.h b/tests/common.h index 1d6dc85c..7978d7b3 100644 --- a/tests/common.h +++ b/tests/common.h @@ -65,27 +65,38 @@ enum result_t { extern int32_t NaN; extern int64_t NaN64; -typedef union bit64_union_t { +#include +static inline double sse2neon_tool_recast_f64(uint64_t u64) +{ double f64; + memcpy(&f64, &u64, sizeof(uint64_t)); + return f64; +} +static inline int64_t sse2neon_tool_recast_i64(double f64) +{ int64_t i64; - uint64_t u64; -} bit64_union_t; -typedef union bit32_union_t { + memcpy(&i64, &f64, sizeof(int64_t)); + return i64; +} +static inline float sse2neon_tool_recast_f32(uint32_t u32) +{ float f32; - int32_t i32; - uint32_t u32; -} bit32_union_t; + memcpy(&f32, &u32, sizeof(uint32_t)); + return f32; +} +static inline float sse2neon_tool_recast_f32(int32_t i32) +{ + float f32; + memcpy(&f32, &i32, sizeof(int32_t)); + return f32; +} static inline float generate_all_ones_float() { - bit32_union_t u; - u.i32 = UINT32_MAX; - return u.f32; + return sse2neon_tool_recast_f32(UINT32_MAX); } static inline double generate_all_ones_double() { - bit64_union_t u; - u.i64 = UINT64_MAX; - return u.f64; + return sse2neon_tool_recast_f64(UINT64_MAX); } #define ALL_BIT_1_32 generate_all_ones_float() #define ALL_BIT_1_64 generate_all_ones_double() diff --git a/tests/impl.cpp b/tests/impl.cpp index 242b6660..839a93d3 100644 --- a/tests/impl.cpp +++ b/tests/impl.cpp @@ -3256,17 +3256,16 @@ result_t test_mm_xor_ps(const SSE2NEONTestImpl &impl, uint32_t iter) const int32_t *_a = (const int32_t *) impl.mTestFloatPointer1; const int32_t *_b = (const int32_t *) impl.mTestFloatPointer2; - bit32_union_t d0, d1, d2, d3; - d0.i32 = _a[0] ^ _b[0]; - d1.i32 = _a[1] ^ _b[1]; - d2.i32 = _a[2] ^ _b[2]; - d3.i32 = _a[3] ^ _b[3]; + float d0 = sse2neon_tool_recast_f32(_a[0] ^ _b[0]); + float d1 = sse2neon_tool_recast_f32(_a[1] ^ _b[1]); + float d2 = sse2neon_tool_recast_f32(_a[2] ^ _b[2]); + float d3 = sse2neon_tool_recast_f32(_a[3] ^ _b[3]); __m128 a = load_m128(_a); __m128 b = load_m128(_b); __m128 c = _mm_xor_ps(a, b); - return validateFloat(c, d0.f32, d1.f32, d2.f32, d3.f32); + return validateFloat(c, d0, d1, d2, d3); } /* SSE2 */ @@ -3553,15 +3552,14 @@ result_t test_mm_and_pd(const SSE2NEONTestImpl &impl, uint32_t iter) const int64_t *_a = (const int64_t *) impl.mTestFloatPointer1; const int64_t *_b = (const int64_t *) impl.mTestFloatPointer2; - bit64_union_t d0, d1; - d0.i64 = _a[0] & _b[0]; - d1.i64 = _a[1] & _b[1]; + double d0 = sse2neon_tool_recast_f64(_a[0] & _b[0]); + double d1 = sse2neon_tool_recast_f64(_a[1] & _b[1]); __m128d a = load_m128d(_a); __m128d b = load_m128d(_b); __m128d c = _mm_and_pd(a, b); - return validateDouble(c, d0.f64, d1.f64); + return validateDouble(c, d0, d1); } result_t test_mm_and_si128(const SSE2NEONTestImpl &impl, uint32_t iter) @@ -3833,59 +3831,58 @@ result_t test_mm_cmpeq_pd(const SSE2NEONTestImpl &impl, uint32_t iter) { const double *_a = (const double *) impl.mTestFloatPointer1; const double *_b = (const double *) impl.mTestFloatPointer2; - bit64_union_t d0, d1; - d0.u64 = (_a[0] == _b[0]) ? UINT64_MAX : 0; - d1.u64 = (_a[1] == _b[1]) ? UINT64_MAX : 0; + + double d0 = (_a[0] == _b[0]) ? sse2neon_tool_recast_f64(UINT64_MAX) : 0; + double d1 = (_a[1] == _b[1]) ? sse2neon_tool_recast_f64(UINT64_MAX) : 0; __m128d a = load_m128d(_a); __m128d b = load_m128d(_b); __m128d c = _mm_cmpeq_pd(a, b); - return validateDouble(c, d0.f64, d1.f64); + return validateDouble(c, d0, d1); } result_t test_mm_cmpeq_sd(const SSE2NEONTestImpl &impl, uint32_t iter) { const double *_a = (const double *) impl.mTestFloatPointer1; const double *_b = (const double *) impl.mTestFloatPointer2; - bit64_union_t d0, d1; - d0.u64 = (_a[0] == _b[0]) ? ~UINT64_C(0) : 0; - d1.u64 = ((const uint64_t *) _a)[1]; + double d0 = (_a[0] == _b[0]) ? ALL_BIT_1_64 : 0; + double d1 = _a[1]; __m128d a = load_m128d(_a); __m128d b = load_m128d(_b); __m128d c = _mm_cmpeq_sd(a, b); - return validateDouble(c, d0.f64, d1.f64); + return validateDouble(c, d0, d1); } result_t test_mm_cmpge_pd(const SSE2NEONTestImpl &impl, uint32_t iter) { const double *_a = (const double *) impl.mTestFloatPointer1; const double *_b = (const double *) impl.mTestFloatPointer2; - bit64_union_t d0, d1; - d0.u64 = (_a[0] >= _b[0]) ? ~UINT64_C(0) : 0; - d1.u64 = (_a[1] >= _b[1]) ? ~UINT64_C(0) : 0; + + double d0 = (_a[0] >= _b[0]) ? ALL_BIT_1_64 : 0; + double d1 = (_a[1] >= _b[1]) ? ALL_BIT_1_64 : 0; __m128d a = load_m128d(_a); __m128d b = load_m128d(_b); __m128d c = _mm_cmpge_pd(a, b); - return validateDouble(c, d0.f64, d1.f64); + return validateDouble(c, d0, d1); } result_t test_mm_cmpge_sd(const SSE2NEONTestImpl &impl, uint32_t iter) { double *_a = (double *) impl.mTestFloatPointer1; double *_b = (double *) impl.mTestFloatPointer2; - bit64_union_t d0, d1; - d0.u64 = (_a[0] >= _b[0]) ? ~UINT64_C(0) : 0; - d1.u64 = ((uint64_t *) _a)[1]; + + double d0 = (_a[0] >= _b[0]) ? ALL_BIT_1_64 : 0; + double d1 = _a[1]; __m128d a = load_m128d(_a); __m128d b = load_m128d(_b); __m128d c = _mm_cmpge_sd(a, b); - return validateDouble(c, d0.f64, d1.f64); + return validateDouble(c, d0, d1); } result_t test_mm_cmpgt_epi16(const SSE2NEONTestImpl &impl, uint32_t iter) @@ -3959,60 +3956,60 @@ result_t test_mm_cmpgt_pd(const SSE2NEONTestImpl &impl, uint32_t iter) { const double *_a = (const double *) impl.mTestFloatPointer1; const double *_b = (const double *) impl.mTestFloatPointer2; - bit64_union_t d0, d1; - d0.u64 = (_a[0] > _b[0]) ? ~UINT64_C(0) : 0; - d1.u64 = (_a[1] > _b[1]) ? ~UINT64_C(0) : 0; + + double d0 = (_a[0] > _b[0]) ? ALL_BIT_1_64 : 0; + double d1 = (_a[1] > _b[1]) ? ALL_BIT_1_64 : 0; __m128d a = load_m128d(_a); __m128d b = load_m128d(_b); __m128d c = _mm_cmpgt_pd(a, b); - return validateDouble(c, d0.f64, d1.f64); + return validateDouble(c, d0, d1); } result_t test_mm_cmpgt_sd(const SSE2NEONTestImpl &impl, uint32_t iter) { double *_a = (double *) impl.mTestFloatPointer1; double *_b = (double *) impl.mTestFloatPointer2; - bit64_union_t d0, d1; - d0.u64 = (_a[0] > _b[0]) ? ~UINT64_C(0) : 0; - d1.u64 = ((uint64_t *) _a)[1]; + + double d0 = (_a[0] > _b[0]) ? ALL_BIT_1_64 : 0; + double d1 = _a[1]; __m128d a = load_m128d(_a); __m128d b = load_m128d(_b); __m128d c = _mm_cmpgt_sd(a, b); - return validateDouble(c, d0.f64, d1.f64); + return validateDouble(c, d0, d1); } result_t test_mm_cmple_pd(const SSE2NEONTestImpl &impl, uint32_t iter) { const double *_a = (const double *) impl.mTestFloatPointer1; const double *_b = (const double *) impl.mTestFloatPointer2; - bit64_union_t d0, d1; - d0.u64 = (_a[0] <= _b[0]) ? ~UINT64_C(0) : 0; - d1.u64 = (_a[1] <= _b[1]) ? ~UINT64_C(0) : 0; + + double d0 = (_a[0] <= _b[0]) ? ALL_BIT_1_64 : 0; + double d1 = (_a[1] <= _b[1]) ? ALL_BIT_1_64 : 0; __m128d a = load_m128d(_a); __m128d b = load_m128d(_b); __m128d c = _mm_cmple_pd(a, b); - return validateDouble(c, d0.f64, d1.f64); + return validateDouble(c, d0, d1); } result_t test_mm_cmple_sd(const SSE2NEONTestImpl &impl, uint32_t iter) { double *_a = (double *) impl.mTestFloatPointer1; double *_b = (double *) impl.mTestFloatPointer2; - bit64_union_t d0, d1; - d0.u64 = (_a[0] <= _b[0]) ? ~UINT64_C(0) : 0; - d1.u64 = ((uint64_t *) _a)[1]; + + double d0 = (_a[0] <= _b[0]) ? ALL_BIT_1_64 : 0; + double d1 = _a[1]; __m128d a = load_m128d(_a); __m128d b = load_m128d(_b); __m128d c = _mm_cmple_sd(a, b); - return validateDouble(c, d0.f64, d1.f64); + return validateDouble(c, d0, d1); } result_t test_mm_cmplt_epi16(const SSE2NEONTestImpl &impl, uint32_t iter) @@ -4085,180 +4082,180 @@ result_t test_mm_cmplt_pd(const SSE2NEONTestImpl &impl, uint32_t iter) { const double *_a = (const double *) impl.mTestFloatPointer1; const double *_b = (const double *) impl.mTestFloatPointer2; - bit64_union_t d0, d1; - d0.u64 = (_a[0] < _b[0]) ? ~UINT64_C(0) : UINT64_C(0); - d1.u64 = (_a[1] < _b[1]) ? ~UINT64_C(0) : UINT64_C(0); + + double d0 = (_a[0] < _b[0]) ? ALL_BIT_1_64 : UINT64_C(0); + double d1 = (_a[1] < _b[1]) ? ALL_BIT_1_64 : UINT64_C(0); __m128d a = load_m128d(_a); __m128d b = load_m128d(_b); __m128d c = _mm_cmplt_pd(a, b); - return validateDouble(c, d0.f64, d1.f64); + return validateDouble(c, d0, d1); } result_t test_mm_cmplt_sd(const SSE2NEONTestImpl &impl, uint32_t iter) { double *_a = (double *) impl.mTestFloatPointer1; double *_b = (double *) impl.mTestFloatPointer2; - bit64_union_t d0, d1; - d0.u64 = (_a[0] < _b[0]) ? ~UINT64_C(0) : 0; - d1.u64 = ((uint64_t *) _a)[1]; + + double d0 = (_a[0] < _b[0]) ? ALL_BIT_1_64 : 0; + double d1 = _a[1]; __m128d a = load_m128d(_a); __m128d b = load_m128d(_b); __m128d c = _mm_cmplt_sd(a, b); - return validateDouble(c, d0.f64, d1.f64); + return validateDouble(c, d0, d1); } result_t test_mm_cmpneq_pd(const SSE2NEONTestImpl &impl, uint32_t iter) { const double *_a = (const double *) impl.mTestFloatPointer1; const double *_b = (const double *) impl.mTestFloatPointer2; - bit64_union_t d0, d1; - d0.u64 = (_a[0] != _b[0]) ? ~UINT64_C(0) : UINT64_C(0); - d1.u64 = (_a[1] != _b[1]) ? ~UINT64_C(0) : UINT64_C(0); + + double d0 = (_a[0] != _b[0]) ? ALL_BIT_1_64 : UINT64_C(0); + double d1 = (_a[1] != _b[1]) ? ALL_BIT_1_64 : UINT64_C(0); __m128d a = load_m128d(_a); __m128d b = load_m128d(_b); __m128d c = _mm_cmpneq_pd(a, b); - return validateDouble(c, d0.f64, d1.f64); + return validateDouble(c, d0, d1); } result_t test_mm_cmpneq_sd(const SSE2NEONTestImpl &impl, uint32_t iter) { double *_a = (double *) impl.mTestFloatPointer1; double *_b = (double *) impl.mTestFloatPointer2; - bit64_union_t d0, d1; - d0.u64 = (_a[0] != _b[0]) ? ~UINT64_C(0) : UINT64_C(0); - d1.u64 = ((int64_t *) _a)[1]; + + double d0 = (_a[0] != _b[0]) ? ALL_BIT_1_64 : UINT64_C(0); + double d1 = _a[1]; __m128d a = load_m128d(_a); __m128d b = load_m128d(_b); __m128d c = _mm_cmpneq_sd(a, b); - return validateDouble(c, d0.f64, d1.f64); + return validateDouble(c, d0, d1); } result_t test_mm_cmpnge_pd(const SSE2NEONTestImpl &impl, uint32_t iter) { const double *_a = (const double *) impl.mTestFloatPointer1; const double *_b = (const double *) impl.mTestFloatPointer2; - bit64_union_t d0, d1; - d0.u64 = !(_a[0] >= _b[0]) ? ~UINT64_C(0) : 0; - d1.u64 = !(_a[1] >= _b[1]) ? ~UINT64_C(0) : 0; + + double d0 = !(_a[0] >= _b[0]) ? ALL_BIT_1_64 : 0; + double d1 = !(_a[1] >= _b[1]) ? ALL_BIT_1_64 : 0; __m128d a = load_m128d(_a); __m128d b = load_m128d(_b); __m128d c = _mm_cmpnge_pd(a, b); - return validateDouble(c, d0.f64, d1.f64); + return validateDouble(c, d0, d1); } result_t test_mm_cmpnge_sd(const SSE2NEONTestImpl &impl, uint32_t iter) { double *_a = (double *) impl.mTestFloatPointer1; double *_b = (double *) impl.mTestFloatPointer2; - bit64_union_t d0, d1; - d0.u64 = !(_a[0] >= _b[0]) ? ~UINT64_C(0) : 0; - d1.u64 = ((uint64_t *) _a)[1]; + + double d0 = !(_a[0] >= _b[0]) ? ALL_BIT_1_64 : 0; + double d1 = _a[1]; __m128d a = load_m128d(_a); __m128d b = load_m128d(_b); __m128d c = _mm_cmpnge_sd(a, b); - return validateDouble(c, d0.f64, d1.f64); + return validateDouble(c, d0, d1); } result_t test_mm_cmpngt_pd(const SSE2NEONTestImpl &impl, uint32_t iter) { const double *_a = (const double *) impl.mTestFloatPointer1; const double *_b = (const double *) impl.mTestFloatPointer2; - bit64_union_t d0, d1; - d0.u64 = !(_a[0] > _b[0]) ? ~UINT64_C(0) : 0; - d1.u64 = !(_a[1] > _b[1]) ? ~UINT64_C(0) : 0; + + double d0 = !(_a[0] > _b[0]) ? ALL_BIT_1_64 : 0; + double d1 = !(_a[1] > _b[1]) ? ALL_BIT_1_64 : 0; __m128d a = load_m128d(_a); __m128d b = load_m128d(_b); __m128d c = _mm_cmpngt_pd(a, b); - return validateDouble(c, d0.f64, d1.f64); + return validateDouble(c, d0, d1); } result_t test_mm_cmpngt_sd(const SSE2NEONTestImpl &impl, uint32_t iter) { double *_a = (double *) impl.mTestFloatPointer1; double *_b = (double *) impl.mTestFloatPointer2; - bit64_union_t d0, d1; - d0.u64 = !(_a[0] > _b[0]) ? ~UINT64_C(0) : 0; - d1.u64 = ((uint64_t *) _a)[1]; + + double d0 = !(_a[0] > _b[0]) ? ALL_BIT_1_64 : 0; + double d1 = _a[1]; __m128d a = load_m128d(_a); __m128d b = load_m128d(_b); __m128d c = _mm_cmpngt_sd(a, b); - return validateDouble(c, d0.f64, d1.f64); + return validateDouble(c, d0, d1); } result_t test_mm_cmpnle_pd(const SSE2NEONTestImpl &impl, uint32_t iter) { const double *_a = (const double *) impl.mTestFloatPointer1; const double *_b = (const double *) impl.mTestFloatPointer2; - bit64_union_t d0, d1; - d0.u64 = !(_a[0] <= _b[0]) ? ~UINT64_C(0) : 0; - d1.u64 = !(_a[1] <= _b[1]) ? ~UINT64_C(0) : 0; + + double d0 = !(_a[0] <= _b[0]) ? ALL_BIT_1_64 : 0; + double d1 = !(_a[1] <= _b[1]) ? ALL_BIT_1_64 : 0; __m128d a = load_m128d(_a); __m128d b = load_m128d(_b); __m128d c = _mm_cmpnle_pd(a, b); - return validateDouble(c, d0.f64, d1.f64); + return validateDouble(c, d0, d1); } result_t test_mm_cmpnle_sd(const SSE2NEONTestImpl &impl, uint32_t iter) { double *_a = (double *) impl.mTestFloatPointer1; double *_b = (double *) impl.mTestFloatPointer2; - bit64_union_t d0, d1; - d0.u64 = !(_a[0] <= _b[0]) ? ~UINT64_C(0) : 0; - d1.u64 = ((uint64_t *) _a)[1]; + + double d0 = !(_a[0] <= _b[0]) ? ALL_BIT_1_64 : 0; + double d1 = _a[1]; __m128d a = load_m128d(_a); __m128d b = load_m128d(_b); __m128d c = _mm_cmpnle_sd(a, b); - return validateDouble(c, d0.f64, d1.f64); + return validateDouble(c, d0, d1); } result_t test_mm_cmpnlt_pd(const SSE2NEONTestImpl &impl, uint32_t iter) { const double *_a = (const double *) impl.mTestFloatPointer1; const double *_b = (const double *) impl.mTestFloatPointer2; - bit64_union_t d0, d1; - d0.u64 = !(_a[0] < _b[0]) ? ~UINT64_C(0) : 0; - d1.u64 = !(_a[1] < _b[1]) ? ~UINT64_C(0) : 0; + + double d0 = !(_a[0] < _b[0]) ? ALL_BIT_1_64 : 0; + double d1 = !(_a[1] < _b[1]) ? ALL_BIT_1_64 : 0; __m128d a = load_m128d(_a); __m128d b = load_m128d(_b); __m128d c = _mm_cmpnlt_pd(a, b); - return validateDouble(c, d0.f64, d1.f64); + return validateDouble(c, d0, d1); } result_t test_mm_cmpnlt_sd(const SSE2NEONTestImpl &impl, uint32_t iter) { double *_a = (double *) impl.mTestFloatPointer1; double *_b = (double *) impl.mTestFloatPointer2; - bit64_union_t d0, d1; - d0.u64 = !(_a[0] < _b[0]) ? ~UINT64_C(0) : 0; - d1.u64 = ((uint64_t *) _a)[1]; + + double d0 = !(_a[0] < _b[0]) ? ALL_BIT_1_64 : 0; + double d1 = _a[1]; __m128d a = load_m128d(_a); __m128d b = load_m128d(_b); __m128d c = _mm_cmpnlt_sd(a, b); - return validateDouble(c, d0.f64, d1.f64); + return validateDouble(c, d0, d1); } result_t test_mm_cmpord_pd(const SSE2NEONTestImpl &impl, uint32_t iter) @@ -5523,15 +5520,15 @@ result_t test_mm_or_pd(const SSE2NEONTestImpl &impl, uint32_t iter) const int64_t *_a = (const int64_t *) impl.mTestFloatPointer1; const int64_t *_b = (const int64_t *) impl.mTestFloatPointer2; - bit64_union_t d0, d1; - d0.i64 = _a[0] | _b[0]; - d1.i64 = _a[1] | _b[1]; + + double d0 = sse2neon_tool_recast_f64(_a[0] | _b[0]); + double d1 = sse2neon_tool_recast_f64(_a[1] | _b[1]); __m128d a = load_m128d(_a); __m128d b = load_m128d(_b); __m128d c = _mm_or_pd(a, b); - return validateDouble(c, d0.f64, d1.f64); + return validateDouble(c, d0, d1); } result_t test_mm_or_si128(const SSE2NEONTestImpl &impl, uint32_t iter) @@ -7112,15 +7109,15 @@ result_t test_mm_xor_pd(const SSE2NEONTestImpl &impl, uint32_t iter) const int64_t *_a = (const int64_t *) impl.mTestFloatPointer1; const int64_t *_b = (const int64_t *) impl.mTestFloatPointer2; - bit64_union_t d0, d1; - d0.i64 = _a[0] ^ _b[0]; - d1.i64 = _a[1] ^ _b[1]; + + double d0 = sse2neon_tool_recast_f64(_a[0] ^ _b[0]); + double d1 = sse2neon_tool_recast_f64(_a[1] ^ _b[1]); __m128d a = load_m128d(_a); __m128d b = load_m128d(_b); __m128d c = _mm_xor_pd(a, b); - return validateDouble(c, d0.f64, d1.f64); + return validateDouble(c, d0, d1); } result_t test_mm_xor_si128(const SSE2NEONTestImpl &impl, uint32_t iter) @@ -8115,9 +8112,8 @@ result_t test_mm_blendv_pd(const SSE2NEONTestImpl &impl, uint32_t iter) for (int i = 0; i < 2; i++) { // signed shift right would return a result which is either all 1's from // negative numbers or all 0's from positive numbers - bit64_union_t m; - m.f64 = _mask[i]; - if (m.i64 >> 63) { + int64_t m = sse2neon_tool_recast_i64(_mask[i]); + if (m >> 63) { _c[i] = _b[i]; } else { _c[i] = _a[i];