From da4b834b5f3b38d9448c5c63b19eb986a55d54f1 Mon Sep 17 00:00:00 2001
From: Yang Hau <yuanyanghau@gmail.com>
Date: Tue, 9 Jul 2024 02:02:05 +0800
Subject: [PATCH] refactor: Use memcpy for type punning

---
 Makefile       |   4 +-
 sse2neon.h     | 537 +++++++++++++++++++++++--------------------------
 tests/common.h |  37 ++--
 tests/impl.cpp | 198 +++++++++---------
 4 files changed, 378 insertions(+), 398 deletions(-)

diff --git a/Makefile b/Makefile
index 999a3a7b..aaee7fe4 100644
--- a/Makefile
+++ b/Makefile
@@ -57,7 +57,7 @@ ARCH_CFLAGS := $(ARCH_CFLAGS)+$(subst $(COMMA),+,$(FEATURE))
 endif
 endif
 
-CXXFLAGS += -Wall -Wcast-qual -I. $(ARCH_CFLAGS) -std=gnu++14
+CXXFLAGS += -Wall -Wcast-qual -I. $(ARCH_CFLAGS) -O3 -std=gnu++14
 LDFLAGS	+= -lm
 OBJS = \
     tests/binding.o \
@@ -77,7 +77,7 @@ $(EXEC): $(OBJS)
 
 check: tests/main
 ifeq ($(processor),$(filter $(processor),aarch64 arm64 arm armv7l))
-	$(CC) $(ARCH_CFLAGS) -c sse2neon.h
+	$(CC) $(ARCH_CFLAGS) -c -O3 sse2neon.h
 endif
 	$(EXEC_WRAPPER) $^
 
diff --git a/sse2neon.h b/sse2neon.h
index d64a36ec..6a8a3773 100644
--- a/sse2neon.h
+++ b/sse2neon.h
@@ -130,6 +130,20 @@
 
 #include <stdint.h>
 #include <stdlib.h>
+#include <string.h>
+
+FORCE_INLINE double recast_f64(uint64_t u64)
+{
+    double f64;
+    memcpy(&f64, &u64, sizeof(uint64_t));
+    return f64;
+}
+FORCE_INLINE int64_t recast_i64(double f64)
+{
+    int64_t i64;
+    memcpy(&i64, &f64, sizeof(uint64_t));
+    return i64;
+}
 
 #if defined(_WIN32)
 /* Definitions for _mm_{malloc,free} are provided by <malloc.h>
@@ -566,17 +580,6 @@ typedef union ALIGN_STRUCT(16) SIMDVec {
 #define _MM_GET_DENORMALS_ZERO_MODE _sse2neon_mm_get_denormals_zero_mode
 #define _MM_SET_DENORMALS_ZERO_MODE _sse2neon_mm_set_denormals_zero_mode
 
-typedef union bit64_union_t {
-    double f64;
-    int64_t i64;
-    uint64_t u64;
-} bit64_union_t;
-typedef union bit32_union_t {
-    float f32;
-    int32_t i32;
-    uint32_t u32;
-} bit32_union_t;
-
 // Function declaration
 // SSE
 FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void);
@@ -2992,14 +2995,13 @@ FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
     return vreinterpretq_m128d_f64(
         vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
-    bit64_union_t a0, a1, b0, b1;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
-    b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
+    double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     double c[2];
-    c[0] = a0.f64 + b0.f64;
-    c[1] = a1.f64 + b1.f64;
+    c[0] = a0 + b0;
+    c[1] = a1 + b1;
     return vld1q_f32((float32_t *) c);
 #endif
 }
@@ -3013,13 +3015,13 @@ FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
 #if defined(__aarch64__) || defined(_M_ARM64)
     return _mm_move_sd(a, _mm_add_pd(a, b));
 #else
-    bit64_union_t a0, a1, b0;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
+    double a0, a1, b0;
+    a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
     double c[2];
-    c[0] = a0.f64 + b0.f64;
-    c[1] = a1.f64;
+    c[0] = a0 + b0;
+    c[1] = a1;
     return vld1q_f32((float32_t *) c);
 #endif
 }
@@ -3273,14 +3275,13 @@ FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
     return vreinterpretq_m128d_u64(
         vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
-    bit64_union_t a0, a1, b0, b1;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
-    b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
+    double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     uint64_t d[2];
-    d[0] = a0.f64 >= b0.f64 ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = a1.f64 >= b1.f64 ? ~UINT64_C(0) : UINT64_C(0);
+    d[0] = a0 >= b0 ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1 >= b1 ? ~UINT64_C(0) : UINT64_C(0);
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3296,13 +3297,13 @@ FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
     return _mm_move_sd(a, _mm_cmpge_pd(a, b));
 #else
     // expand "_mm_cmpge_pd()" to reduce unnecessary operations
-    bit64_union_t a0, a1, b0;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
+    double a0, b0;
+    a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
     uint64_t d[2];
-    d[0] = a0.f64 >= b0.f64 ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = a1.u64;
+    d[0] = a0 >= b0 ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1;
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3344,14 +3345,13 @@ FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
     return vreinterpretq_m128d_u64(
         vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
-    bit64_union_t a0, a1, b0, b1;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
-    b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
+    double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     uint64_t d[2];
-    d[0] = a0.f64 > b0.f64 ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = a1.f64 > b1.f64 ? ~UINT64_C(0) : UINT64_C(0);
+    d[0] = a0 > b0 ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1 > b1 ? ~UINT64_C(0) : UINT64_C(0);
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3367,13 +3367,13 @@ FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
     return _mm_move_sd(a, _mm_cmpgt_pd(a, b));
 #else
     // expand "_mm_cmpge_pd()" to reduce unnecessary operations
-    bit64_union_t a0, a1, b0;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
+    double a0, b0;
+    a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
     uint64_t d[2];
-    d[0] = a0.f64 > b0.f64 ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = a1.u64;
+    d[0] = a0 > b0 ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1;
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3388,14 +3388,13 @@ FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
     return vreinterpretq_m128d_u64(
         vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
-    bit64_union_t a0, a1, b0, b1;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
-    b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
+    double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     uint64_t d[2];
-    d[0] = a0.f64 <= b0.f64 ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = a1.f64 <= b1.f64 ? ~UINT64_C(0) : UINT64_C(0);
+    d[0] = a0 <= b0 ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1 <= b1 ? ~UINT64_C(0) : UINT64_C(0);
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3411,13 +3410,13 @@ FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
     return _mm_move_sd(a, _mm_cmple_pd(a, b));
 #else
     // expand "_mm_cmpge_pd()" to reduce unnecessary operations
-    bit64_union_t a0, a1, b0;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
+    double a0, b0;
+    a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
     uint64_t d[2];
-    d[0] = a0.f64 <= b0.f64 ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = a1.u64;
+    d[0] = a0 <= b0 ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1;
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3462,14 +3461,13 @@ FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
     return vreinterpretq_m128d_u64(
         vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
-    bit64_union_t a0, a1, b0, b1;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
-    b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
+    double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     uint64_t d[2];
-    d[0] = a0.f64 < b0.f64 ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = a1.f64 < b1.f64 ? ~UINT64_C(0) : UINT64_C(0);
+    d[0] = a0 < b0 ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1 < b1 ? ~UINT64_C(0) : UINT64_C(0);
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3484,13 +3482,13 @@ FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
 #if defined(__aarch64__) || defined(_M_ARM64)
     return _mm_move_sd(a, _mm_cmplt_pd(a, b));
 #else
-    bit64_union_t a0, a1, b0;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
+    double a0, b0;
+    a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
     uint64_t d[2];
-    d[0] = a0.f64 < b0.f64 ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = a1.u64;
+    d[0] = a0 < b0 ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1;
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3532,14 +3530,13 @@ FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
         vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
         vdupq_n_u64(UINT64_MAX)));
 #else
-    bit64_union_t a0, a1, b0, b1;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
-    b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
+    double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     uint64_t d[2];
-    d[0] = !(a0.f64 >= b0.f64) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = !(a1.f64 >= b1.f64) ? ~UINT64_C(0) : UINT64_C(0);
+    d[0] = !(a0 >= b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = !(a1 >= b1) ? ~UINT64_C(0) : UINT64_C(0);
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3564,14 +3561,13 @@ FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
         vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
         vdupq_n_u64(UINT64_MAX)));
 #else
-    bit64_union_t a0, a1, b0, b1;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
-    b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
+    double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     uint64_t d[2];
-    d[0] = !(a0.f64 > b0.f64) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = !(a1.f64 > b1.f64) ? ~UINT64_C(0) : UINT64_C(0);
+    d[0] = !(a0 > b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = !(a1 > b1) ? ~UINT64_C(0) : UINT64_C(0);
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3596,14 +3592,13 @@ FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b)
         vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
         vdupq_n_u64(UINT64_MAX)));
 #else
-    bit64_union_t a0, a1, b0, b1;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
-    b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
+    double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     uint64_t d[2];
-    d[0] = !(a0.f64 <= b0.f64) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = !(a1.f64 <= b1.f64) ? ~UINT64_C(0) : UINT64_C(0);
+    d[0] = !(a0 <= b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = !(a1 <= b1) ? ~UINT64_C(0) : UINT64_C(0);
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3628,14 +3623,13 @@ FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b)
         vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
         vdupq_n_u64(UINT64_MAX)));
 #else
-    bit64_union_t a0, a1, b0, b1;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
-    b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
+    double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     uint64_t d[2];
-    d[0] = !(a0.f64 < b0.f64) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = !(a1.f64 < b1.f64) ? ~UINT64_C(0) : UINT64_C(0);
+    d[0] = !(a0 < b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = !(a1 < b1) ? ~UINT64_C(0) : UINT64_C(0);
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3663,14 +3657,13 @@ FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
         vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
     return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b));
 #else
-    bit64_union_t a0, a1, b0, b1;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
-    b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
+    double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     uint64_t d[2];
-    d[0] = (a0.f64 == a0.f64 && b0.f64 == b0.f64) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = (a1.f64 == a1.f64 && b1.f64 == b1.f64) ? ~UINT64_C(0) : UINT64_C(0);
+    d[0] = (a0 == a0 && b0 == b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = (a1 == a1 && b1 == b1) ? ~UINT64_C(0) : UINT64_C(0);
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3685,13 +3678,13 @@ FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)
 #if defined(__aarch64__) || defined(_M_ARM64)
     return _mm_move_sd(a, _mm_cmpord_pd(a, b));
 #else
-    bit64_union_t a0, a1, b0;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
+    double a0, b0;
+    a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
     uint64_t d[2];
-    d[0] = (a0.f64 == a0.f64 && b0.f64 == b0.f64) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = a1.u64;
+    d[0] = (a0 == a0 && b0 == b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1;
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3711,14 +3704,13 @@ FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
     return vreinterpretq_m128d_s32(
         vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b))));
 #else
-    bit64_union_t a0, a1, b0, b1;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
-    b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
+    double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     uint64_t d[2];
-    d[0] = (a0.f64 == a0.f64 && b0.f64 == b0.f64) ? UINT64_C(0) : ~UINT64_C(0);
-    d[1] = (a1.f64 == a1.f64 && b1.f64 == b1.f64) ? UINT64_C(0) : ~UINT64_C(0);
+    d[0] = (a0 == a0 && b0 == b0) ? UINT64_C(0) : ~UINT64_C(0);
+    d[1] = (a1 == a1 && b1 == b1) ? UINT64_C(0) : ~UINT64_C(0);
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3733,13 +3725,13 @@ FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
 #if defined(__aarch64__) || defined(_M_ARM64)
     return _mm_move_sd(a, _mm_cmpunord_pd(a, b));
 #else
-    bit64_union_t a0, a1, b0;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
+    double a0, b0;
+    a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
     uint64_t d[2];
-    d[0] = (a0.f64 == a0.f64 && b0.f64 == b0.f64) ? UINT64_C(0) : ~UINT64_C(0);
-    d[1] = a1.u64;
+    d[0] = (a0 == a0 && b0 == b0) ? UINT64_C(0) : ~UINT64_C(0);
+    d[1] = a1;
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3753,10 +3745,10 @@ FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b)
 #if defined(__aarch64__) || defined(_M_ARM64)
     return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1;
 #else
-    bit64_union_t a0, b0;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
-    return a0.f64 >= b0.f64;
+    double a0, b0;
+    a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    return a0 >= b0;
 #endif
 }
 
@@ -3768,11 +3760,11 @@ FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
 #if defined(__aarch64__) || defined(_M_ARM64)
     return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1;
 #else
-    bit64_union_t a0, b0;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
+    double a0, b0;
+    a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
 
-    return (a0.f64 > b0.f64);
+    return (a0 > b0);
 #endif
 }
 
@@ -3784,11 +3776,11 @@ FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)
 #if defined(__aarch64__) || defined(_M_ARM64)
     return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1;
 #else
-    bit64_union_t a0, b0;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
+    double a0, b0;
+    a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
 
-    return (a0.f64 <= b0.f64);
+    return (a0 <= b0);
 #endif
 }
 
@@ -3800,11 +3792,11 @@ FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
 #if defined(__aarch64__) || defined(_M_ARM64)
     return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1;
 #else
-    bit64_union_t a0, b0;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
+    double a0, b0;
+    a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
 
-    return (a0.f64 < b0.f64);
+    return (a0 < b0);
 #endif
 }
 
@@ -3873,10 +3865,10 @@ FORCE_INLINE_OPTNONE __m128i _mm_cvtpd_epi32(__m128d a)
         vcombine_s32(vmovn_s64(integers), vdup_n_s32(0)));
 #else
     __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
-    bit64_union_t d0, d1;
-    d0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0);
-    d1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 1);
-    return _mm_set_epi32(0, 0, (int32_t) d1.f64, (int32_t) d0.f64);
+    double d0, d1;
+    d0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0));
+    d1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 1));
+    return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0);
 #endif
 }
 
@@ -3886,10 +3878,10 @@ FORCE_INLINE_OPTNONE __m128i _mm_cvtpd_epi32(__m128d a)
 FORCE_INLINE_OPTNONE __m64 _mm_cvtpd_pi32(__m128d a)
 {
     __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
-    bit64_union_t d0, d1;
-    d0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0);
-    d1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 1);
-    int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0.f64, (int32_t) d1.f64};
+    double d0, d1;
+    d0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0));
+    d1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 1));
+    int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1};
     return vreinterpret_m64_s32(vld1_s32(data));
 }
 
@@ -3903,10 +3895,10 @@ FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
     float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
     return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
 #else
-    bit64_union_t a0, a1;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    return _mm_set_ps(0, 0, (float) a1.f64, (float) a0.f64);
+    double a0, a1;
+    a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    return _mm_set_ps(0, 0, (float) a1, (float) a0);
 #endif
 }
 
@@ -4005,9 +3997,8 @@ FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
 #if defined(__aarch64__) || defined(_M_ARM64)
     return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
 #else
-    bit64_union_t _a;
-    _a.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    return _a.f64;
+    double _a = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    return _a;
 #endif
 }
 
@@ -4020,9 +4011,8 @@ FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
     return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
 #else
     __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
-    bit64_union_t ret;
-    ret.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0);
-    return (int32_t) ret.f64;
+    double ret = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0));
+    return (int32_t) ret;
 #endif
 }
 
@@ -4035,9 +4025,8 @@ FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)
     return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
 #else
     __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
-    bit64_union_t ret;
-    ret.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0);
-    return (int64_t) ret.f64;
+    double ret = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0));
+    return (int64_t) ret;
 #endif
 }
 
@@ -4058,10 +4047,9 @@ FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
         vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0),
         vreinterpretq_f32_m128(a), 0));
 #else
-    bit64_union_t b0;
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
+    double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
     return vreinterpretq_m128_f32(
-        vsetq_lane_f32((float) b0.f64, vreinterpretq_f32_m128(a), 0));
+        vsetq_lane_f32((float) b0, vreinterpretq_f32_m128(a), 0));
 #endif
 }
 
@@ -4093,10 +4081,9 @@ FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b)
     return vreinterpretq_m128d_f64(
         vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
 #else
-    bit64_union_t bf;
-    bf.f64 = (double) b;
+    int64_t _b = recast_i64((double) b);
     return vreinterpretq_m128d_s64(
-        vsetq_lane_s64(bf.i64, vreinterpretq_s64_m128d(a), 0));
+        vsetq_lane_s64(_b, vreinterpretq_s64_m128d(a), 0));
 #endif
 }
 
@@ -4122,10 +4109,9 @@ FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b)
     return vreinterpretq_m128d_f64(
         vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
 #else
-    bit64_union_t bf;
-    bf.f64 = (double) b;
+    int64_t bf = recast_i64((double) b);
     return vreinterpretq_m128d_s64(
-        vsetq_lane_s64(bf.i64, vreinterpretq_s64_m128d(a), 0));
+        vsetq_lane_s64(bf, vreinterpretq_s64_m128d(a), 0));
 #endif
 }
 
@@ -4155,14 +4141,13 @@ FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd
 FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
 {
-    bit64_union_t d;
-    d.f64 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
+    double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
 #if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(
-        vsetq_lane_f64(d.f64, vreinterpretq_f64_m128d(a), 0));
+        vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0));
 #else
     return vreinterpretq_m128d_s64(
-        vsetq_lane_s64(d.i64, vreinterpretq_s64_m128d(a), 0));
+        vsetq_lane_s64(recast_i64(d), vreinterpretq_s64_m128d(a), 0));
 #endif
 }
 
@@ -4171,10 +4156,10 @@ FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32
 FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
 {
-    bit64_union_t a0, a1;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    return _mm_set_epi32(0, 0, (int32_t) a1.f64, (int32_t) a0.f64);
+    double a0, a1;
+    a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    return _mm_set_epi32(0, 0, (int32_t) a1, (int32_t) a0);
 }
 
 // Convert packed double-precision (64-bit) floating-point elements in a to
@@ -4182,10 +4167,10 @@ FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_pi32
 FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a)
 {
-    bit64_union_t a0, a1;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0.f64, (int32_t) a1.f64};
+    double a0, a1;
+    a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1};
     return vreinterpret_m64_s32(vld1_s32(data));
 }
 
@@ -4202,9 +4187,8 @@ FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32
 FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a)
 {
-    bit64_union_t _a;
-    _a.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    return (int32_t) _a.f64;
+    double _a = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    return (int32_t) _a;
 }
 
 // Convert the lower double-precision (64-bit) floating-point element in a to a
@@ -4215,9 +4199,8 @@ FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
 #if defined(__aarch64__) || defined(_M_ARM64)
     return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0);
 #else
-    bit64_union_t _a;
-    _a.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    return (int64_t) _a.f64;
+    double _a = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    return (int64_t) _a;
 #endif
 }
 
@@ -4235,14 +4218,13 @@ FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
     return vreinterpretq_m128d_f64(
         vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
-    bit64_union_t a0, a1, b0, b1;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
-    b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
+    double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     double c[2];
-    c[0] = a0.f64 / b0.f64;
-    c[1] = a1.f64 / b1.f64;
+    c[0] = a0 / b0;
+    c[1] = a1 / b1;
     return vld1q_f32((float32_t *) c);
 #endif
 }
@@ -4487,14 +4469,13 @@ FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
         vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #endif
 #else
-    bit64_union_t a0, a1, b0, b1;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
-    b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
+    double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     uint64_t d[2];
-    d[0] = a0.f64 > b0.f64 ? a0.u64 : b0.u64;
-    d[1] = a1.f64 > b1.f64 ? a1.u64 : b1.u64;
+    d[0] = a0 > b0 ? recast_i64(a0) : recast_i64(b0);
+    d[1] = a1 > b1 ? recast_i64(a1) : recast_i64(b1);
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -4509,11 +4490,11 @@ FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)
 #if defined(__aarch64__) || defined(_M_ARM64)
     return _mm_move_sd(a, _mm_max_pd(a, b));
 #else
-    bit64_union_t a0, a1, b0;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
-    double c[2] = {a0.f64 > b0.f64 ? a0.f64 : b0.f64, a1.f64};
+    double a0, a1, b0;
+    a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double c[2] = {a0 > b0 ? a0 : b0, a1};
     return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
 #endif
 }
@@ -4551,14 +4532,13 @@ FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
         vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #endif
 #else
-    bit64_union_t a0, a1, b0, b1;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
-    b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
+    double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     uint64_t d[2];
-    d[0] = a0.f64 < b0.f64 ? a0.u64 : b0.u64;
-    d[1] = a1.f64 < b1.f64 ? a1.u64 : b1.u64;
+    d[0] = a0 < b0 ? recast_i64(a0) : recast_i64(b0);
+    d[1] = a1 < b1 ? recast_i64(a1) : recast_i64(b1);
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
 }
@@ -4572,11 +4552,11 @@ FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)
 #if defined(__aarch64__) || defined(_M_ARM64)
     return _mm_move_sd(a, _mm_min_pd(a, b));
 #else
-    bit64_union_t a0, a1, b0;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
-    double c[2] = {a0.f64 < b0.f64 ? a0.f64 : b0.f64, a1.f64};
+    double a0, a1, b0;
+    a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double c[2] = {a0 < b0 ? a0 : b0, a1};
     return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
 #endif
 }
@@ -4731,14 +4711,13 @@ FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
     return vreinterpretq_m128d_f64(
         vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
-    bit64_union_t a0, a1, b0, b1;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
-    b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
+    double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     double c[2];
-    c[0] = a0.f64 * b0.f64;
-    c[1] = a1.f64 * b1.f64;
+    c[0] = a0 * b0;
+    c[1] = a1 * b1;
     return vld1q_f32((float32_t *) c);
 #endif
 }
@@ -5028,9 +5007,8 @@ FORCE_INLINE __m128d _mm_set1_pd(double d)
 #if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(vdupq_n_f64(d));
 #else
-    bit64_union_t _d;
-    _d.f64 = d;
-    return vreinterpretq_m128d_s64(vdupq_n_s64(_d.i64));
+    int64_t _d = recast_i64(d);
+    return vreinterpretq_m128d_s64(vdupq_n_s64(_d));
 #endif
 }
 
@@ -5321,11 +5299,11 @@ FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
 #if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a)));
 #else
-    bit64_union_t a0, a1;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    double _a0 = sqrt(a0.f64);
-    double _a1 = sqrt(a1.f64);
+    double a0, a1;
+    a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double _a0 = sqrt(a0);
+    double _a1 = sqrt(a1);
     return _mm_set_pd(_a1, _a0);
 #endif
 }
@@ -5339,10 +5317,10 @@ FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b)
 #if defined(__aarch64__) || defined(_M_ARM64)
     return _mm_move_sd(a, _mm_sqrt_pd(b));
 #else
-    bit64_union_t _a, _b;
-    _a.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    _b.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
-    return _mm_set_pd(_a.f64, sqrt(_b.f64));
+    double _a, _b;
+    _a = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    _b = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    return _mm_set_pd(_a, sqrt(_b));
 #endif
 }
 
@@ -5697,14 +5675,13 @@ FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
     return vreinterpretq_m128d_f64(
         vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
-    bit64_union_t a0, a1, b0, b1;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
-    b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
+    double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     double c[2];
-    c[0] = a0.f64 - b0.f64;
-    c[1] = a1.f64 - b1.f64;
+    c[0] = a0 - b0;
+    c[1] = a1 - b1;
     return vld1q_f32((float32_t *) c);
 #endif
 }
@@ -6008,12 +5985,11 @@ FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
     return vreinterpretq_m128d_f64(
         vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
-    bit64_union_t a0, a1, b0, b1;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
-    b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
-    double c[] = {a0.f64 + a1.f64, b0.f64 + b1.f64};
+    double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
+    double c[] = {a0 + a1, b0 + b1};
     return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
 #endif
 }
@@ -6047,12 +6023,11 @@ FORCE_INLINE __m128d _mm_hsub_pd(__m128d a, __m128d b)
     return vreinterpretq_m128d_f64(
         vsubq_f64(vuzp1q_f64(_a, _b), vuzp2q_f64(_a, _b)));
 #else
-    bit64_union_t a0, a1, b0, b1;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
-    b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
-    double c[] = {a0.f64 - a1.f64, b0.f64 - b1.f64};
+    double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
+    double c[] = {a0 - a1, b0 - b1};
     return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
 #endif
 }
@@ -6848,10 +6823,10 @@ FORCE_INLINE __m128d _mm_ceil_pd(__m128d a)
 #if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a)));
 #else
-    bit64_union_t a0, a1;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    return _mm_set_pd(ceil(a1.f64), ceil(a0.f64));
+    double a0, a1;
+    a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    return _mm_set_pd(ceil(a1), ceil(a0));
 #endif
 }
 
@@ -7059,13 +7034,12 @@ FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)
                                    vgetq_lane_f64(vreinterpretq_f64_m128d(b), 1)
                              : 0;
 #else
-    bit64_union_t a0, a1, b0, b1;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
-    b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
-    double d0 = (imm & 0x10) ? a0.f64 * b0.f64 : 0;
-    double d1 = (imm & 0x20) ? a1.f64 * b1.f64 : 0;
+    double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
+    double d0 = (imm & 0x10) ? a0 * b0 : 0;
+    double d1 = (imm & 0x20) ? a1 * b1 : 0;
 #endif
     __m128d tmp = _mm_set_pd(d1, d0);
 #endif
@@ -7073,10 +7047,9 @@ FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)
 #if defined(__aarch64__) || defined(_M_ARM64)
     double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp));
 #else
-    bit64_union_t _tmp0, _tmp1;
-    _tmp0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(tmp), 0);
-    _tmp1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(tmp), 1);
-    double sum = _tmp0.f64 + _tmp1.f64;
+    double _tmp0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(tmp), 0));
+    double _tmp1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(tmp), 1));
+    double sum = _tmp0 + _tmp1;
 #endif
     // Conditionally store the sum
     const __m128d sumMask =
@@ -7166,10 +7139,10 @@ FORCE_INLINE __m128d _mm_floor_pd(__m128d a)
 #if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a)));
 #else
-    bit64_union_t a0, a1;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    return _mm_set_pd(floor(a1.f64), floor(a0.f64));
+    double a0, a1;
+    a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    return _mm_set_pd(floor(a1), floor(a0));
 #endif
 }
 
diff --git a/tests/common.h b/tests/common.h
index 1d6dc85c..7978d7b3 100644
--- a/tests/common.h
+++ b/tests/common.h
@@ -65,27 +65,38 @@ enum result_t {
 extern int32_t NaN;
 extern int64_t NaN64;
 
-typedef union bit64_union_t {
+#include <string.h>
+static inline double sse2neon_tool_recast_f64(uint64_t u64)
+{
     double f64;
+    memcpy(&f64, &u64, sizeof(uint64_t));
+    return f64;
+}
+static inline int64_t sse2neon_tool_recast_i64(double f64)
+{
     int64_t i64;
-    uint64_t u64;
-} bit64_union_t;
-typedef union bit32_union_t {
+    memcpy(&i64, &f64, sizeof(int64_t));
+    return i64;
+}
+static inline float sse2neon_tool_recast_f32(uint32_t u32)
+{
     float f32;
-    int32_t i32;
-    uint32_t u32;
-} bit32_union_t;
+    memcpy(&f32, &u32, sizeof(uint32_t));
+    return f32;
+}
+static inline float sse2neon_tool_recast_f32(int32_t i32)
+{
+    float f32;
+    memcpy(&f32, &i32, sizeof(int32_t));
+    return f32;
+}
 static inline float generate_all_ones_float()
 {
-    bit32_union_t u;
-    u.i32 = UINT32_MAX;
-    return u.f32;
+    return sse2neon_tool_recast_f32(UINT32_MAX);
 }
 static inline double generate_all_ones_double()
 {
-    bit64_union_t u;
-    u.i64 = UINT64_MAX;
-    return u.f64;
+    return sse2neon_tool_recast_f64(UINT64_MAX);
 }
 #define ALL_BIT_1_32 generate_all_ones_float()
 #define ALL_BIT_1_64 generate_all_ones_double()
diff --git a/tests/impl.cpp b/tests/impl.cpp
index 242b6660..839a93d3 100644
--- a/tests/impl.cpp
+++ b/tests/impl.cpp
@@ -3256,17 +3256,16 @@ result_t test_mm_xor_ps(const SSE2NEONTestImpl &impl, uint32_t iter)
     const int32_t *_a = (const int32_t *) impl.mTestFloatPointer1;
     const int32_t *_b = (const int32_t *) impl.mTestFloatPointer2;
 
-    bit32_union_t d0, d1, d2, d3;
-    d0.i32 = _a[0] ^ _b[0];
-    d1.i32 = _a[1] ^ _b[1];
-    d2.i32 = _a[2] ^ _b[2];
-    d3.i32 = _a[3] ^ _b[3];
+    float d0 = sse2neon_tool_recast_f32(_a[0] ^ _b[0]);
+    float d1 = sse2neon_tool_recast_f32(_a[1] ^ _b[1]);
+    float d2 = sse2neon_tool_recast_f32(_a[2] ^ _b[2]);
+    float d3 = sse2neon_tool_recast_f32(_a[3] ^ _b[3]);
 
     __m128 a = load_m128(_a);
     __m128 b = load_m128(_b);
     __m128 c = _mm_xor_ps(a, b);
 
-    return validateFloat(c, d0.f32, d1.f32, d2.f32, d3.f32);
+    return validateFloat(c, d0, d1, d2, d3);
 }
 
 /* SSE2 */
@@ -3553,15 +3552,14 @@ result_t test_mm_and_pd(const SSE2NEONTestImpl &impl, uint32_t iter)
     const int64_t *_a = (const int64_t *) impl.mTestFloatPointer1;
     const int64_t *_b = (const int64_t *) impl.mTestFloatPointer2;
 
-    bit64_union_t d0, d1;
-    d0.i64 = _a[0] & _b[0];
-    d1.i64 = _a[1] & _b[1];
+    double d0 = sse2neon_tool_recast_f64(_a[0] & _b[0]);
+    double d1 = sse2neon_tool_recast_f64(_a[1] & _b[1]);
 
     __m128d a = load_m128d(_a);
     __m128d b = load_m128d(_b);
     __m128d c = _mm_and_pd(a, b);
 
-    return validateDouble(c, d0.f64, d1.f64);
+    return validateDouble(c, d0, d1);
 }
 
 result_t test_mm_and_si128(const SSE2NEONTestImpl &impl, uint32_t iter)
@@ -3833,59 +3831,58 @@ result_t test_mm_cmpeq_pd(const SSE2NEONTestImpl &impl, uint32_t iter)
 {
     const double *_a = (const double *) impl.mTestFloatPointer1;
     const double *_b = (const double *) impl.mTestFloatPointer2;
-    bit64_union_t d0, d1;
-    d0.u64 = (_a[0] == _b[0]) ? UINT64_MAX : 0;
-    d1.u64 = (_a[1] == _b[1]) ? UINT64_MAX : 0;
+    
+    double d0 = (_a[0] == _b[0]) ? sse2neon_tool_recast_f64(UINT64_MAX) : 0;
+    double d1 = (_a[1] == _b[1]) ? sse2neon_tool_recast_f64(UINT64_MAX) : 0;
 
     __m128d a = load_m128d(_a);
     __m128d b = load_m128d(_b);
     __m128d c = _mm_cmpeq_pd(a, b);
-    return validateDouble(c, d0.f64, d1.f64);
+    return validateDouble(c, d0, d1);
 }
 
 result_t test_mm_cmpeq_sd(const SSE2NEONTestImpl &impl, uint32_t iter)
 {
     const double *_a = (const double *) impl.mTestFloatPointer1;
     const double *_b = (const double *) impl.mTestFloatPointer2;
-    bit64_union_t d0, d1;
-    d0.u64 = (_a[0] == _b[0]) ? ~UINT64_C(0) : 0;
-    d1.u64 = ((const uint64_t *) _a)[1];
+    double d0 = (_a[0] == _b[0]) ? ALL_BIT_1_64 : 0;
+    double d1 = _a[1];
 
     __m128d a = load_m128d(_a);
     __m128d b = load_m128d(_b);
     __m128d c = _mm_cmpeq_sd(a, b);
 
-    return validateDouble(c, d0.f64, d1.f64);
+    return validateDouble(c, d0, d1);
 }
 
 result_t test_mm_cmpge_pd(const SSE2NEONTestImpl &impl, uint32_t iter)
 {
     const double *_a = (const double *) impl.mTestFloatPointer1;
     const double *_b = (const double *) impl.mTestFloatPointer2;
-    bit64_union_t d0, d1;
-    d0.u64 = (_a[0] >= _b[0]) ? ~UINT64_C(0) : 0;
-    d1.u64 = (_a[1] >= _b[1]) ? ~UINT64_C(0) : 0;
+
+    double d0 = (_a[0] >= _b[0]) ? ALL_BIT_1_64 : 0;
+    double d1 = (_a[1] >= _b[1]) ? ALL_BIT_1_64 : 0;
 
     __m128d a = load_m128d(_a);
     __m128d b = load_m128d(_b);
     __m128d c = _mm_cmpge_pd(a, b);
 
-    return validateDouble(c, d0.f64, d1.f64);
+    return validateDouble(c, d0, d1);
 }
 
 result_t test_mm_cmpge_sd(const SSE2NEONTestImpl &impl, uint32_t iter)
 {
     double *_a = (double *) impl.mTestFloatPointer1;
     double *_b = (double *) impl.mTestFloatPointer2;
-    bit64_union_t d0, d1;
-    d0.u64 = (_a[0] >= _b[0]) ? ~UINT64_C(0) : 0;
-    d1.u64 = ((uint64_t *) _a)[1];
+    
+    double d0 = (_a[0] >= _b[0]) ? ALL_BIT_1_64 : 0;
+    double d1 = _a[1];
 
     __m128d a = load_m128d(_a);
     __m128d b = load_m128d(_b);
     __m128d c = _mm_cmpge_sd(a, b);
 
-    return validateDouble(c, d0.f64, d1.f64);
+    return validateDouble(c, d0, d1);
 }
 
 result_t test_mm_cmpgt_epi16(const SSE2NEONTestImpl &impl, uint32_t iter)
@@ -3959,60 +3956,60 @@ result_t test_mm_cmpgt_pd(const SSE2NEONTestImpl &impl, uint32_t iter)
 {
     const double *_a = (const double *) impl.mTestFloatPointer1;
     const double *_b = (const double *) impl.mTestFloatPointer2;
-    bit64_union_t d0, d1;
-    d0.u64 = (_a[0] > _b[0]) ? ~UINT64_C(0) : 0;
-    d1.u64 = (_a[1] > _b[1]) ? ~UINT64_C(0) : 0;
+    
+    double d0 = (_a[0] > _b[0]) ? ALL_BIT_1_64 : 0;
+    double d1 = (_a[1] > _b[1]) ? ALL_BIT_1_64 : 0;
 
     __m128d a = load_m128d(_a);
     __m128d b = load_m128d(_b);
     __m128d c = _mm_cmpgt_pd(a, b);
 
-    return validateDouble(c, d0.f64, d1.f64);
+    return validateDouble(c, d0, d1);
 }
 
 result_t test_mm_cmpgt_sd(const SSE2NEONTestImpl &impl, uint32_t iter)
 {
     double *_a = (double *) impl.mTestFloatPointer1;
     double *_b = (double *) impl.mTestFloatPointer2;
-    bit64_union_t d0, d1;
-    d0.u64 = (_a[0] > _b[0]) ? ~UINT64_C(0) : 0;
-    d1.u64 = ((uint64_t *) _a)[1];
+    
+    double d0 = (_a[0] > _b[0]) ? ALL_BIT_1_64 : 0;
+    double d1 = _a[1];
 
     __m128d a = load_m128d(_a);
     __m128d b = load_m128d(_b);
     __m128d c = _mm_cmpgt_sd(a, b);
 
-    return validateDouble(c, d0.f64, d1.f64);
+    return validateDouble(c, d0, d1);
 }
 
 result_t test_mm_cmple_pd(const SSE2NEONTestImpl &impl, uint32_t iter)
 {
     const double *_a = (const double *) impl.mTestFloatPointer1;
     const double *_b = (const double *) impl.mTestFloatPointer2;
-    bit64_union_t d0, d1;
-    d0.u64 = (_a[0] <= _b[0]) ? ~UINT64_C(0) : 0;
-    d1.u64 = (_a[1] <= _b[1]) ? ~UINT64_C(0) : 0;
+    
+    double d0 = (_a[0] <= _b[0]) ? ALL_BIT_1_64 : 0;
+    double d1 = (_a[1] <= _b[1]) ? ALL_BIT_1_64 : 0;
 
     __m128d a = load_m128d(_a);
     __m128d b = load_m128d(_b);
     __m128d c = _mm_cmple_pd(a, b);
 
-    return validateDouble(c, d0.f64, d1.f64);
+    return validateDouble(c, d0, d1);
 }
 
 result_t test_mm_cmple_sd(const SSE2NEONTestImpl &impl, uint32_t iter)
 {
     double *_a = (double *) impl.mTestFloatPointer1;
     double *_b = (double *) impl.mTestFloatPointer2;
-    bit64_union_t d0, d1;
-    d0.u64 = (_a[0] <= _b[0]) ? ~UINT64_C(0) : 0;
-    d1.u64 = ((uint64_t *) _a)[1];
+    
+    double d0 = (_a[0] <= _b[0]) ? ALL_BIT_1_64 : 0;
+    double d1 = _a[1];
 
     __m128d a = load_m128d(_a);
     __m128d b = load_m128d(_b);
     __m128d c = _mm_cmple_sd(a, b);
 
-    return validateDouble(c, d0.f64, d1.f64);
+    return validateDouble(c, d0, d1);
 }
 
 result_t test_mm_cmplt_epi16(const SSE2NEONTestImpl &impl, uint32_t iter)
@@ -4085,180 +4082,180 @@ result_t test_mm_cmplt_pd(const SSE2NEONTestImpl &impl, uint32_t iter)
 {
     const double *_a = (const double *) impl.mTestFloatPointer1;
     const double *_b = (const double *) impl.mTestFloatPointer2;
-    bit64_union_t d0, d1;
-    d0.u64 = (_a[0] < _b[0]) ? ~UINT64_C(0) : UINT64_C(0);
-    d1.u64 = (_a[1] < _b[1]) ? ~UINT64_C(0) : UINT64_C(0);
+    
+    double d0 = (_a[0] < _b[0]) ? ALL_BIT_1_64 : UINT64_C(0);
+    double d1 = (_a[1] < _b[1]) ? ALL_BIT_1_64 : UINT64_C(0);
 
     __m128d a = load_m128d(_a);
     __m128d b = load_m128d(_b);
     __m128d c = _mm_cmplt_pd(a, b);
 
-    return validateDouble(c, d0.f64, d1.f64);
+    return validateDouble(c, d0, d1);
 }
 
 result_t test_mm_cmplt_sd(const SSE2NEONTestImpl &impl, uint32_t iter)
 {
     double *_a = (double *) impl.mTestFloatPointer1;
     double *_b = (double *) impl.mTestFloatPointer2;
-    bit64_union_t d0, d1;
-    d0.u64 = (_a[0] < _b[0]) ? ~UINT64_C(0) : 0;
-    d1.u64 = ((uint64_t *) _a)[1];
+    
+    double d0 = (_a[0] < _b[0]) ? ALL_BIT_1_64 : 0;
+    double d1 = _a[1];
 
     __m128d a = load_m128d(_a);
     __m128d b = load_m128d(_b);
     __m128d c = _mm_cmplt_sd(a, b);
 
-    return validateDouble(c, d0.f64, d1.f64);
+    return validateDouble(c, d0, d1);
 }
 
 result_t test_mm_cmpneq_pd(const SSE2NEONTestImpl &impl, uint32_t iter)
 {
     const double *_a = (const double *) impl.mTestFloatPointer1;
     const double *_b = (const double *) impl.mTestFloatPointer2;
-    bit64_union_t d0, d1;
-    d0.u64 = (_a[0] != _b[0]) ? ~UINT64_C(0) : UINT64_C(0);
-    d1.u64 = (_a[1] != _b[1]) ? ~UINT64_C(0) : UINT64_C(0);
+    
+    double d0 = (_a[0] != _b[0]) ? ALL_BIT_1_64 : UINT64_C(0);
+    double d1 = (_a[1] != _b[1]) ? ALL_BIT_1_64 : UINT64_C(0);
 
     __m128d a = load_m128d(_a);
     __m128d b = load_m128d(_b);
     __m128d c = _mm_cmpneq_pd(a, b);
 
-    return validateDouble(c, d0.f64, d1.f64);
+    return validateDouble(c, d0, d1);
 }
 
 result_t test_mm_cmpneq_sd(const SSE2NEONTestImpl &impl, uint32_t iter)
 {
     double *_a = (double *) impl.mTestFloatPointer1;
     double *_b = (double *) impl.mTestFloatPointer2;
-    bit64_union_t d0, d1;
-    d0.u64 = (_a[0] != _b[0]) ? ~UINT64_C(0) : UINT64_C(0);
-    d1.u64 = ((int64_t *) _a)[1];
+    
+    double d0 = (_a[0] != _b[0]) ? ALL_BIT_1_64 : UINT64_C(0);
+    double d1 = _a[1];
 
     __m128d a = load_m128d(_a);
     __m128d b = load_m128d(_b);
     __m128d c = _mm_cmpneq_sd(a, b);
 
-    return validateDouble(c, d0.f64, d1.f64);
+    return validateDouble(c, d0, d1);
 }
 
 result_t test_mm_cmpnge_pd(const SSE2NEONTestImpl &impl, uint32_t iter)
 {
     const double *_a = (const double *) impl.mTestFloatPointer1;
     const double *_b = (const double *) impl.mTestFloatPointer2;
-    bit64_union_t d0, d1;
-    d0.u64 = !(_a[0] >= _b[0]) ? ~UINT64_C(0) : 0;
-    d1.u64 = !(_a[1] >= _b[1]) ? ~UINT64_C(0) : 0;
+    
+    double d0 = !(_a[0] >= _b[0]) ? ALL_BIT_1_64 : 0;
+    double d1 = !(_a[1] >= _b[1]) ? ALL_BIT_1_64 : 0;
 
     __m128d a = load_m128d(_a);
     __m128d b = load_m128d(_b);
     __m128d c = _mm_cmpnge_pd(a, b);
 
-    return validateDouble(c, d0.f64, d1.f64);
+    return validateDouble(c, d0, d1);
 }
 
 result_t test_mm_cmpnge_sd(const SSE2NEONTestImpl &impl, uint32_t iter)
 {
     double *_a = (double *) impl.mTestFloatPointer1;
     double *_b = (double *) impl.mTestFloatPointer2;
-    bit64_union_t d0, d1;
-    d0.u64 = !(_a[0] >= _b[0]) ? ~UINT64_C(0) : 0;
-    d1.u64 = ((uint64_t *) _a)[1];
+    
+    double d0 = !(_a[0] >= _b[0]) ? ALL_BIT_1_64 : 0;
+    double d1 = _a[1];
 
     __m128d a = load_m128d(_a);
     __m128d b = load_m128d(_b);
     __m128d c = _mm_cmpnge_sd(a, b);
 
-    return validateDouble(c, d0.f64, d1.f64);
+    return validateDouble(c, d0, d1);
 }
 
 result_t test_mm_cmpngt_pd(const SSE2NEONTestImpl &impl, uint32_t iter)
 {
     const double *_a = (const double *) impl.mTestFloatPointer1;
     const double *_b = (const double *) impl.mTestFloatPointer2;
-    bit64_union_t d0, d1;
-    d0.u64 = !(_a[0] > _b[0]) ? ~UINT64_C(0) : 0;
-    d1.u64 = !(_a[1] > _b[1]) ? ~UINT64_C(0) : 0;
+    
+    double d0 = !(_a[0] > _b[0]) ? ALL_BIT_1_64 : 0;
+    double d1 = !(_a[1] > _b[1]) ? ALL_BIT_1_64 : 0;
 
     __m128d a = load_m128d(_a);
     __m128d b = load_m128d(_b);
     __m128d c = _mm_cmpngt_pd(a, b);
 
-    return validateDouble(c, d0.f64, d1.f64);
+    return validateDouble(c, d0, d1);
 }
 
 result_t test_mm_cmpngt_sd(const SSE2NEONTestImpl &impl, uint32_t iter)
 {
     double *_a = (double *) impl.mTestFloatPointer1;
     double *_b = (double *) impl.mTestFloatPointer2;
-    bit64_union_t d0, d1;
-    d0.u64 = !(_a[0] > _b[0]) ? ~UINT64_C(0) : 0;
-    d1.u64 = ((uint64_t *) _a)[1];
+    
+    double d0 = !(_a[0] > _b[0]) ? ALL_BIT_1_64 : 0;
+    double d1 = _a[1];
 
     __m128d a = load_m128d(_a);
     __m128d b = load_m128d(_b);
     __m128d c = _mm_cmpngt_sd(a, b);
 
-    return validateDouble(c, d0.f64, d1.f64);
+    return validateDouble(c, d0, d1);
 }
 
 result_t test_mm_cmpnle_pd(const SSE2NEONTestImpl &impl, uint32_t iter)
 {
     const double *_a = (const double *) impl.mTestFloatPointer1;
     const double *_b = (const double *) impl.mTestFloatPointer2;
-    bit64_union_t d0, d1;
-    d0.u64 = !(_a[0] <= _b[0]) ? ~UINT64_C(0) : 0;
-    d1.u64 = !(_a[1] <= _b[1]) ? ~UINT64_C(0) : 0;
+    
+    double d0 = !(_a[0] <= _b[0]) ? ALL_BIT_1_64 : 0;
+    double d1 = !(_a[1] <= _b[1]) ? ALL_BIT_1_64 : 0;
 
     __m128d a = load_m128d(_a);
     __m128d b = load_m128d(_b);
     __m128d c = _mm_cmpnle_pd(a, b);
 
-    return validateDouble(c, d0.f64, d1.f64);
+    return validateDouble(c, d0, d1);
 }
 
 result_t test_mm_cmpnle_sd(const SSE2NEONTestImpl &impl, uint32_t iter)
 {
     double *_a = (double *) impl.mTestFloatPointer1;
     double *_b = (double *) impl.mTestFloatPointer2;
-    bit64_union_t d0, d1;
-    d0.u64 = !(_a[0] <= _b[0]) ? ~UINT64_C(0) : 0;
-    d1.u64 = ((uint64_t *) _a)[1];
+    
+    double d0 = !(_a[0] <= _b[0]) ? ALL_BIT_1_64 : 0;
+    double d1 = _a[1];
 
     __m128d a = load_m128d(_a);
     __m128d b = load_m128d(_b);
     __m128d c = _mm_cmpnle_sd(a, b);
 
-    return validateDouble(c, d0.f64, d1.f64);
+    return validateDouble(c, d0, d1);
 }
 
 result_t test_mm_cmpnlt_pd(const SSE2NEONTestImpl &impl, uint32_t iter)
 {
     const double *_a = (const double *) impl.mTestFloatPointer1;
     const double *_b = (const double *) impl.mTestFloatPointer2;
-    bit64_union_t d0, d1;
-    d0.u64 = !(_a[0] < _b[0]) ? ~UINT64_C(0) : 0;
-    d1.u64 = !(_a[1] < _b[1]) ? ~UINT64_C(0) : 0;
+    
+    double d0 = !(_a[0] < _b[0]) ? ALL_BIT_1_64 : 0;
+    double d1 = !(_a[1] < _b[1]) ? ALL_BIT_1_64 : 0;
 
     __m128d a = load_m128d(_a);
     __m128d b = load_m128d(_b);
     __m128d c = _mm_cmpnlt_pd(a, b);
 
-    return validateDouble(c, d0.f64, d1.f64);
+    return validateDouble(c, d0, d1);
 }
 
 result_t test_mm_cmpnlt_sd(const SSE2NEONTestImpl &impl, uint32_t iter)
 {
     double *_a = (double *) impl.mTestFloatPointer1;
     double *_b = (double *) impl.mTestFloatPointer2;
-    bit64_union_t d0, d1;
-    d0.u64 = !(_a[0] < _b[0]) ? ~UINT64_C(0) : 0;
-    d1.u64 = ((uint64_t *) _a)[1];
+    
+    double d0 = !(_a[0] < _b[0]) ? ALL_BIT_1_64 : 0;
+    double d1 = _a[1];
 
     __m128d a = load_m128d(_a);
     __m128d b = load_m128d(_b);
     __m128d c = _mm_cmpnlt_sd(a, b);
 
-    return validateDouble(c, d0.f64, d1.f64);
+    return validateDouble(c, d0, d1);
 }
 
 result_t test_mm_cmpord_pd(const SSE2NEONTestImpl &impl, uint32_t iter)
@@ -5523,15 +5520,15 @@ result_t test_mm_or_pd(const SSE2NEONTestImpl &impl, uint32_t iter)
     const int64_t *_a = (const int64_t *) impl.mTestFloatPointer1;
     const int64_t *_b = (const int64_t *) impl.mTestFloatPointer2;
 
-    bit64_union_t d0, d1;
-    d0.i64 = _a[0] | _b[0];
-    d1.i64 = _a[1] | _b[1];
+    
+    double d0 = sse2neon_tool_recast_f64(_a[0] | _b[0]);
+    double d1 = sse2neon_tool_recast_f64(_a[1] | _b[1]);
 
     __m128d a = load_m128d(_a);
     __m128d b = load_m128d(_b);
     __m128d c = _mm_or_pd(a, b);
 
-    return validateDouble(c, d0.f64, d1.f64);
+    return validateDouble(c, d0, d1);
 }
 
 result_t test_mm_or_si128(const SSE2NEONTestImpl &impl, uint32_t iter)
@@ -7112,15 +7109,15 @@ result_t test_mm_xor_pd(const SSE2NEONTestImpl &impl, uint32_t iter)
     const int64_t *_a = (const int64_t *) impl.mTestFloatPointer1;
     const int64_t *_b = (const int64_t *) impl.mTestFloatPointer2;
 
-    bit64_union_t d0, d1;
-    d0.i64 = _a[0] ^ _b[0];
-    d1.i64 = _a[1] ^ _b[1];
+    
+    double d0 = sse2neon_tool_recast_f64(_a[0] ^ _b[0]);
+    double d1 = sse2neon_tool_recast_f64(_a[1] ^ _b[1]);
 
     __m128d a = load_m128d(_a);
     __m128d b = load_m128d(_b);
     __m128d c = _mm_xor_pd(a, b);
 
-    return validateDouble(c, d0.f64, d1.f64);
+    return validateDouble(c, d0, d1);
 }
 
 result_t test_mm_xor_si128(const SSE2NEONTestImpl &impl, uint32_t iter)
@@ -8115,9 +8112,8 @@ result_t test_mm_blendv_pd(const SSE2NEONTestImpl &impl, uint32_t iter)
     for (int i = 0; i < 2; i++) {
         // signed shift right would return a result which is either all 1's from
         // negative numbers or all 0's from positive numbers
-        bit64_union_t m;
-        m.f64 = _mask[i];
-        if (m.i64 >> 63) {
+        int64_t m = sse2neon_tool_recast_i64(_mask[i]);
+        if (m >> 63) {
             _c[i] = _b[i];
         } else {
             _c[i] = _a[i];