From 09b21b859f8603359e69ae9142a470532c9d35bd Mon Sep 17 00:00:00 2001 From: Yang Hau Date: Fri, 28 Jun 2024 22:52:57 +0800 Subject: [PATCH] fix: Fix strict-aliasing errors --- sse2neon.h | 562 ++++++++++++++++++++++++++++++------------------- tests/common.h | 40 +++- tests/impl.cpp | 211 +++++++++---------- 3 files changed, 486 insertions(+), 327 deletions(-) diff --git a/sse2neon.h b/sse2neon.h index 2b12721b..45da43ff 100644 --- a/sse2neon.h +++ b/sse2neon.h @@ -130,6 +130,20 @@ #include #include +#include + +FORCE_INLINE double sse2neon_recast_u64_f64(uint64_t u64) +{ + double f64; + memcpy(&f64, &u64, sizeof(uint64_t)); + return f64; +} +FORCE_INLINE int64_t sse2neon_recast_f64_s64(double f64) +{ + int64_t i64; + memcpy(&i64, &f64, sizeof(uint64_t)); + return i64; +} #if defined(_WIN32) /* Definitions for _mm_{malloc,free} are provided by @@ -2444,7 +2458,7 @@ FORCE_INLINE __m128 _mm_set_ps1(float _w) // the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, // _MM_ROUND_TOWARD_ZERO // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE -FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding) +FORCE_INLINE_OPTNONE void _MM_SET_ROUNDING_MODE(int rounding) { union { fpcr_bitfield field; @@ -2981,11 +2995,17 @@ FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b) return vreinterpretq_m128d_f64( vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - double *da = (double *) &a; - double *db = (double *) &b; + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); double c[2]; - c[0] = da[0] + db[0]; - c[1] = da[1] + db[1]; + c[0] = a0 + b0; + c[1] = a1 + b1; return vld1q_f32((float32_t *) c); #endif } @@ -2999,11 +3019,13 @@ FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b) #if defined(__aarch64__) || defined(_M_ARM64) return _mm_move_sd(a, _mm_add_pd(a, b)); #else - double *da = (double *) &a; - double *db = (double *) &b; + double a0, a1, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); double c[2]; - c[0] = da[0] + db[0]; - c[1] = da[1]; + c[0] = a0 + b0; + c[1] = a1; return vld1q_f32((float32_t *) c); #endif } @@ -3257,13 +3279,17 @@ FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b) return vreinterpretq_m128d_u64( vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); uint64_t d[2]; - d[0] = (*(double *) &a0) >= (*(double *) &b0) ? 
~UINT64_C(0) : UINT64_C(0); - d[1] = (*(double *) &a1) >= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = a0 >= b0 ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1 >= b1 ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3279,11 +3305,12 @@ FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b) return _mm_move_sd(a, _mm_cmpge_pd(a, b)); #else // expand "_mm_cmpge_pd()" to reduce unnecessary operations - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); uint64_t d[2]; - d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = a0 >= b0 ? ~UINT64_C(0) : UINT64_C(0); d[1] = a1; return vreinterpretq_m128d_u64(vld1q_u64(d)); @@ -3326,13 +3353,17 @@ FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b) return vreinterpretq_m128d_u64( vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); uint64_t d[2]; - d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); - d[1] = (*(double *) &a1) > (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = a0 > b0 ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1 > b1 ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3348,11 +3379,12 @@ FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b) return _mm_move_sd(a, _mm_cmpgt_pd(a, b)); #else // expand "_mm_cmpge_pd()" to reduce unnecessary operations - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); uint64_t d[2]; - d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = a0 > b0 ? 
~UINT64_C(0) : UINT64_C(0); d[1] = a1; return vreinterpretq_m128d_u64(vld1q_u64(d)); @@ -3368,13 +3400,17 @@ FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b) return vreinterpretq_m128d_u64( vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); uint64_t d[2]; - d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); - d[1] = (*(double *) &a1) <= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = a0 <= b0 ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1 <= b1 ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3390,11 +3426,12 @@ FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b) return _mm_move_sd(a, _mm_cmple_pd(a, b)); #else // expand "_mm_cmpge_pd()" to reduce unnecessary operations - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); uint64_t d[2]; - d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = a0 <= b0 ? ~UINT64_C(0) : UINT64_C(0); d[1] = a1; return vreinterpretq_m128d_u64(vld1q_u64(d)); @@ -3440,13 +3477,17 @@ FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b) return vreinterpretq_m128d_u64( vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); uint64_t d[2]; - d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); - d[1] = (*(double *) &a1) < (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = a0 < b0 ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1 < b1 ? 
~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3461,11 +3502,12 @@ FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b) #if defined(__aarch64__) || defined(_M_ARM64) return _mm_move_sd(a, _mm_cmplt_pd(a, b)); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); uint64_t d[2]; - d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = a0 < b0 ? ~UINT64_C(0) : UINT64_C(0); d[1] = a1; return vreinterpretq_m128d_u64(vld1q_u64(d)); @@ -3508,15 +3550,17 @@ FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b) vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), vdupq_n_u64(UINT64_MAX))); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); uint64_t d[2]; - d[0] = - !((*(double *) &a0) >= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0); - d[1] = - !((*(double *) &a1) >= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = !(a0 >= b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = !(a1 >= b1) ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3541,15 +3585,17 @@ FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b) vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), vdupq_n_u64(UINT64_MAX))); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); uint64_t d[2]; - d[0] = - !((*(double *) &a0) > (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0); - d[1] = - !((*(double *) &a1) > (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = !(a0 > b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = !(a1 > b1) ? 
~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3574,15 +3620,17 @@ FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b) vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), vdupq_n_u64(UINT64_MAX))); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); uint64_t d[2]; - d[0] = - !((*(double *) &a0) <= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0); - d[1] = - !((*(double *) &a1) <= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = !(a0 <= b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = !(a1 <= b1) ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3607,15 +3655,17 @@ FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b) vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), vdupq_n_u64(UINT64_MAX))); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); uint64_t d[2]; - d[0] = - !((*(double *) &a0) < (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0); - d[1] = - !((*(double *) &a1) < (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0); + d[0] = !(a0 < b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = !(a1 < b1) ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3643,19 +3693,17 @@ FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b) vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b)); return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b)); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); uint64_t d[2]; - d[0] = ((*(double *) &a0) == (*(double *) &a0) && - (*(double *) &b0) == (*(double *) &b0)) - ? ~UINT64_C(0) - : UINT64_C(0); - d[1] = ((*(double *) &a1) == (*(double *) &a1) && - (*(double *) &b1) == (*(double *) &b1)) - ? ~UINT64_C(0) - : UINT64_C(0); + d[0] = (a0 == a0 && b0 == b0) ? 
~UINT64_C(0) : UINT64_C(0); + d[1] = (a1 == a1 && b1 == b1) ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3670,14 +3718,12 @@ FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b) #if defined(__aarch64__) || defined(_M_ARM64) return _mm_move_sd(a, _mm_cmpord_pd(a, b)); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); uint64_t d[2]; - d[0] = ((*(double *) &a0) == (*(double *) &a0) && - (*(double *) &b0) == (*(double *) &b0)) - ? ~UINT64_C(0) - : UINT64_C(0); + d[0] = (a0 == a0 && b0 == b0) ? ~UINT64_C(0) : UINT64_C(0); d[1] = a1; return vreinterpretq_m128d_u64(vld1q_u64(d)); @@ -3698,19 +3744,17 @@ FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b) return vreinterpretq_m128d_s32( vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b)))); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); uint64_t d[2]; - d[0] = ((*(double *) &a0) == (*(double *) &a0) && - (*(double *) &b0) == (*(double *) &b0)) - ? UINT64_C(0) - : ~UINT64_C(0); - d[1] = ((*(double *) &a1) == (*(double *) &a1) && - (*(double *) &b1) == (*(double *) &b1)) - ? UINT64_C(0) - : ~UINT64_C(0); + d[0] = (a0 == a0 && b0 == b0) ? UINT64_C(0) : ~UINT64_C(0); + d[1] = (a1 == a1 && b1 == b1) ? UINT64_C(0) : ~UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif @@ -3725,14 +3769,12 @@ FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b) #if defined(__aarch64__) || defined(_M_ARM64) return _mm_move_sd(a, _mm_cmpunord_pd(a, b)); #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); uint64_t d[2]; - d[0] = ((*(double *) &a0) == (*(double *) &a0) && - (*(double *) &b0) == (*(double *) &b0)) - ? UINT64_C(0) - : ~UINT64_C(0); + d[0] = (a0 == a0 && b0 == b0) ? 
UINT64_C(0) : ~UINT64_C(0); d[1] = a1; return vreinterpretq_m128d_u64(vld1q_u64(d)); @@ -3747,10 +3789,10 @@ FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b) #if defined(__aarch64__) || defined(_M_ARM64) return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1; #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - - return (*(double *) &a0 >= *(double *) &b0); + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + return a0 >= b0; #endif } @@ -3762,10 +3804,11 @@ FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b) #if defined(__aarch64__) || defined(_M_ARM64) return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1; #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); - return (*(double *) &a0 > *(double *) &b0); + return a0 > b0; #endif } @@ -3777,10 +3820,11 @@ FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b) #if defined(__aarch64__) || defined(_M_ARM64) return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1; #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); - return (*(double *) &a0 <= *(double *) &b0); + return a0 <= b0; #endif } @@ -3792,10 +3836,11 @@ FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b) #if defined(__aarch64__) || defined(_M_ARM64) return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1; #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); - return (*(double *) &a0 < *(double *) &b0); + return a0 < b0; #endif } @@ -3864,8 +3909,11 @@ FORCE_INLINE_OPTNONE __m128i _mm_cvtpd_epi32(__m128d a) vcombine_s32(vmovn_s64(integers), vdup_n_s32(0))); #else __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); - double d0 = ((double *) &rnd)[0]; - double d1 = ((double *) &rnd)[1]; + double d0, d1; + d0 = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0)); + d1 = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 1)); return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0); #endif } @@ -3876,8 +3924,11 @@ FORCE_INLINE_OPTNONE __m128i _mm_cvtpd_epi32(__m128d a) FORCE_INLINE_OPTNONE __m64 _mm_cvtpd_pi32(__m128d a) { __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); - double d0 = ((double *) &rnd)[0]; - double d1 = ((double *) &rnd)[1]; + double d0, d1; + d0 = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0)); + d1 = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 1)); int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1}; return vreinterpret_m64_s32(vld1_s32(data)); } @@ -3892,9 +3943,10 @@ FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a) float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a)); 
return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0))); #else - float a0 = (float) ((double *) &a)[0]; - float a1 = (float) ((double *) &a)[1]; - return _mm_set_ps(0, 0, a1, a0); + double a0, a1; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + return _mm_set_ps(0, 0, (float) a1, (float) a0); #endif } @@ -3993,7 +4045,9 @@ FORCE_INLINE double _mm_cvtsd_f64(__m128d a) #if defined(__aarch64__) || defined(_M_ARM64) return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0); #else - return ((double *) &a)[0]; + double _a = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + return _a; #endif } @@ -4006,7 +4060,8 @@ FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a) return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0); #else __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); - double ret = ((double *) &rnd)[0]; + double ret = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0)); return (int32_t) ret; #endif } @@ -4020,7 +4075,8 @@ FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a) return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0); #else __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); - double ret = ((double *) &rnd)[0]; + double ret = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0)); return (int64_t) ret; #endif } @@ -4042,8 +4098,10 @@ FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b) vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0), vreinterpretq_f32_m128(a), 0)); #else - return vreinterpretq_m128_f32(vsetq_lane_f32((float) ((double *) &b)[0], - vreinterpretq_f32_m128(a), 0)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + return vreinterpretq_m128_f32( + vsetq_lane_f32((float) b0, vreinterpretq_f32_m128(a), 0)); #endif } @@ -4075,9 +4133,9 @@ FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b) return vreinterpretq_m128d_f64( vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0)); #else - double bf = (double) b; + int64_t _b = sse2neon_recast_f64_s64((double) b); return vreinterpretq_m128d_s64( - vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0)); + vsetq_lane_s64(_b, vreinterpretq_s64_m128d(a), 0)); #endif } @@ -4103,9 +4161,9 @@ FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b) return vreinterpretq_m128d_f64( vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0)); #else - double bf = (double) b; + int64_t _b = sse2neon_recast_f64_s64((double) b); return vreinterpretq_m128d_s64( - vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0)); + vsetq_lane_s64(_b, vreinterpretq_s64_m128d(a), 0)); #endif } @@ -4140,8 +4198,8 @@ FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b) return vreinterpretq_m128d_f64( vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0)); #else - return vreinterpretq_m128d_s64( - vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0)); + return vreinterpretq_m128d_s64(vsetq_lane_s64( + sse2neon_recast_f64_s64(d), vreinterpretq_s64_m128d(a), 0)); #endif } @@ -4150,18 +4208,20 @@ FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32 FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a) { - double a0 = ((double *) &a)[0]; - double a1 = ((double *) &a)[1]; + double a0, a1; + a0 = 
sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); return _mm_set_epi32(0, 0, (int32_t) a1, (int32_t) a0); } // Convert packed double-precision (64-bit) floating-point elements in a to // packed 32-bit integers with truncation, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_pi32 -FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a) +FORCE_INLINE_OPTNONE __m64 _mm_cvttpd_pi32(__m128d a) { - double a0 = ((double *) &a)[0]; - double a1 = ((double *) &a)[1]; + double a0, a1; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1}; return vreinterpret_m64_s32(vld1_s32(data)); } @@ -4179,8 +4239,9 @@ FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32 FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a) { - double ret = *((double *) &a); - return (int32_t) ret; + double _a = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + return (int32_t) _a; } // Convert the lower double-precision (64-bit) floating-point element in a to a @@ -4191,8 +4252,9 @@ FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a) #if defined(__aarch64__) || defined(_M_ARM64) return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0); #else - double ret = *((double *) &a); - return (int64_t) ret; + double _a = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + return (int64_t) _a; #endif } @@ -4210,11 +4272,17 @@ FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b) return vreinterpretq_m128d_f64( vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - double *da = (double *) &a; - double *db = (double *) &b; + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); double c[2]; - c[0] = da[0] / db[0]; - c[1] = da[1] / db[1]; + c[0] = a0 / b0; + c[1] = a1 / b1; return vld1q_f32((float32_t *) c); #endif } @@ -4459,15 +4527,19 @@ FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b) vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #endif #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); - uint64_t d[2]; - d[0] = (*(double *) &a0) > (*(double *) &b0) ? a0 : b0; - d[1] = (*(double *) &a1) > (*(double *) &b1) ? a1 : b1; + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + int64_t d[2]; + d[0] = a0 > b0 ? sse2neon_recast_f64_s64(a0) : sse2neon_recast_f64_s64(b0); + d[1] = a1 > b1 ? 
sse2neon_recast_f64_s64(a1) : sse2neon_recast_f64_s64(b1); - return vreinterpretq_m128d_u64(vld1q_u64(d)); + return vreinterpretq_m128d_s64(vld1q_s64(d)); #endif } @@ -4480,9 +4552,11 @@ FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b) #if defined(__aarch64__) || defined(_M_ARM64) return _mm_move_sd(a, _mm_max_pd(a, b)); #else - double *da = (double *) &a; - double *db = (double *) &b; - double c[2] = {da[0] > db[0] ? da[0] : db[0], da[1]}; + double a0, a1, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double c[2] = {a0 > b0 ? a0 : b0, a1}; return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c)); #endif } @@ -4520,14 +4594,18 @@ FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b) vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #endif #else - uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); - uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); - uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); - uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); - uint64_t d[2]; - d[0] = (*(double *) &a0) < (*(double *) &b0) ? a0 : b0; - d[1] = (*(double *) &a1) < (*(double *) &b1) ? a1 : b1; - return vreinterpretq_m128d_u64(vld1q_u64(d)); + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + int64_t d[2]; + d[0] = a0 < b0 ? sse2neon_recast_f64_s64(a0) : sse2neon_recast_f64_s64(b0); + d[1] = a1 < b1 ? sse2neon_recast_f64_s64(a1) : sse2neon_recast_f64_s64(b1); + return vreinterpretq_m128d_s64(vld1q_s64(d)); #endif } @@ -4540,9 +4618,11 @@ FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b) #if defined(__aarch64__) || defined(_M_ARM64) return _mm_move_sd(a, _mm_min_pd(a, b)); #else - double *da = (double *) &a; - double *db = (double *) &b; - double c[2] = {da[0] < db[0] ? da[0] : db[0], da[1]}; + double a0, a1, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double c[2] = {a0 < b0 ? 
a0 : b0, a1}; return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c)); #endif } @@ -4697,11 +4777,17 @@ FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b) return vreinterpretq_m128d_f64( vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - double *da = (double *) &a; - double *db = (double *) &b; + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); double c[2]; - c[0] = da[0] * db[0]; - c[1] = da[1] * db[1]; + c[0] = a0 * b0; + c[1] = a1 * b1; return vld1q_f32((float32_t *) c); #endif } @@ -4991,7 +5077,8 @@ FORCE_INLINE __m128d _mm_set1_pd(double d) #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_f64(vdupq_n_f64(d)); #else - return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d)); + int64_t _d = sse2neon_recast_f64_s64(d); + return vreinterpretq_m128d_s64(vdupq_n_s64(_d)); #endif } @@ -5282,9 +5369,12 @@ FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a) #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a))); #else - double a0 = sqrt(((double *) &a)[0]); - double a1 = sqrt(((double *) &a)[1]); - return _mm_set_pd(a1, a0); + double a0, a1; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double _a0 = sqrt(a0); + double _a1 = sqrt(a1); + return _mm_set_pd(_a1, _a0); #endif } @@ -5297,7 +5387,10 @@ FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b) #if defined(__aarch64__) || defined(_M_ARM64) return _mm_move_sd(a, _mm_sqrt_pd(b)); #else - return _mm_set_pd(((double *) &a)[1], sqrt(((double *) &b)[0])); + double _a, _b; + _a = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + _b = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + return _mm_set_pd(_a, sqrt(_b)); #endif } @@ -5652,11 +5745,17 @@ FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b) return vreinterpretq_m128d_f64( vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - double *da = (double *) &a; - double *db = (double *) &b; + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); double c[2]; - c[0] = da[0] - db[0]; - c[1] = da[1] - db[1]; + c[0] = a0 - b0; + c[1] = a1 - b1; return vld1q_f32((float32_t *) c); #endif } @@ -5960,9 +6059,15 @@ FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b) return vreinterpretq_m128d_f64( vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else - double *da = (double *) &a; - double *db = (double *) &b; - double c[] = {da[0] + da[1], db[0] + db[1]}; + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + 
double c[] = {a0 + a1, b0 + b1}; return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c)); #endif } @@ -5988,17 +6093,23 @@ FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b) // Horizontally subtract adjacent pairs of double-precision (64-bit) // floating-point elements in a and b, and pack the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pd -FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b) +FORCE_INLINE __m128d _mm_hsub_pd(__m128d a, __m128d b) { #if defined(__aarch64__) || defined(_M_ARM64) - float64x2_t a = vreinterpretq_f64_m128d(_a); - float64x2_t b = vreinterpretq_f64_m128d(_b); + float64x2_t _a = vreinterpretq_f64_m128d(a); + float64x2_t _b = vreinterpretq_f64_m128d(b); return vreinterpretq_m128d_f64( - vsubq_f64(vuzp1q_f64(a, b), vuzp2q_f64(a, b))); -#else - double *da = (double *) &_a; - double *db = (double *) &_b; - double c[] = {da[0] - da[1], db[0] - db[1]}; + vsubq_f64(vuzp1q_f64(_a, _b), vuzp2q_f64(_a, _b))); +#else + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + double c[] = {a0 - a1, b0 - b1}; return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c)); #endif } @@ -6794,8 +6905,10 @@ FORCE_INLINE __m128d _mm_ceil_pd(__m128d a) #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a))); #else - double *f = (double *) &a; - return _mm_set_pd(ceil(f[1]), ceil(f[0])); + double a0, a1; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + return _mm_set_pd(ceil(a1), ceil(a0)); #endif } @@ -7003,8 +7116,16 @@ FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm) vgetq_lane_f64(vreinterpretq_f64_m128d(b), 1) : 0; #else - double d0 = (imm & 0x10) ? ((double *) &a)[0] * ((double *) &b)[0] : 0; - double d1 = (imm & 0x20) ? ((double *) &a)[1] * ((double *) &b)[1] : 0; + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + double d0 = (imm & 0x10) ? a0 * b0 : 0; + double d1 = (imm & 0x20) ? 
a1 * b1 : 0; #endif __m128d tmp = _mm_set_pd(d1, d0); #endif @@ -7012,7 +7133,11 @@ FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm) #if defined(__aarch64__) || defined(_M_ARM64) double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp)); #else - double sum = *((double *) &tmp) + *(((double *) &tmp) + 1); + double _tmp0 = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(tmp), 0)); + double _tmp1 = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(tmp), 1)); + double sum = _tmp0 + _tmp1; #endif // Conditionally store the sum const __m128d sumMask = @@ -7102,8 +7227,10 @@ FORCE_INLINE __m128d _mm_floor_pd(__m128d a) #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a))); #else - double *f = (double *) &a; - return _mm_set_pd(floor(f[1]), floor(f[0])); + double a0, a1; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + return _mm_set_pd(floor(a1), floor(a0)); #endif } @@ -9219,7 +9346,8 @@ FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a) #endif } -FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag) +FORCE_INLINE_OPTNONE void _sse2neon_mm_set_denormals_zero_mode( + unsigned int flag) { // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting, // regardless of the value of the FZ bit. diff --git a/tests/common.h b/tests/common.h index 163d4e68..d578e065 100644 --- a/tests/common.h +++ b/tests/common.h @@ -64,8 +64,44 @@ enum result_t { }; extern int32_t NaN; extern int64_t NaN64; -#define ALL_BIT_1_32 (*(float *) &NaN) -#define ALL_BIT_1_64 (*(double *) &NaN64) + +#if defined(__GNUC__) && !defined(__clang__) +#pragma push_macro("OPTNONE") +#define OPTNONE __attribute__((optimize("O0"))) +#elif defined(__clang__) +#pragma push_macro("OPTNONE") +#define OPTNONE __attribute__((optnone)) +#else +#define OPTNONE +#endif + +#include +static inline double sse2neon_tool_recast_f64(uint64_t u64) +{ + double f64; + memcpy(&f64, &u64, sizeof(uint64_t)); + return f64; +} +static inline int64_t sse2neon_tool_recast_i64(double f64) +{ + int64_t i64; + memcpy(&i64, &f64, sizeof(int64_t)); + return i64; +} +static inline float sse2neon_tool_recast_f32(uint32_t u32) +{ + float f32; + memcpy(&f32, &u32, sizeof(uint32_t)); + return f32; +} +static inline float sse2neon_tool_recast_f32(int32_t i32) +{ + float f32; + memcpy(&f32, &i32, sizeof(int32_t)); + return f32; +} +#define ALL_BIT_1_32 sse2neon_tool_recast_f32(UINT32_MAX) +#define ALL_BIT_1_64 sse2neon_tool_recast_f64(UINT64_MAX) template result_t validate128(T a, T b) diff --git a/tests/impl.cpp b/tests/impl.cpp index 74330f5c..ecd4635f 100644 --- a/tests/impl.cpp +++ b/tests/impl.cpp @@ -431,7 +431,7 @@ template __m128i load_m128i(const T *p) { __m128 a = _mm_loadu_ps((const float *) p); - __m128i ia = *(const __m128i *) &a; + __m128i ia = _mm_castps_si128(a); return ia; } @@ -461,7 +461,7 @@ result_t do_mm_store_ps(float *p, float x, float y, float z, float w) result_t do_mm_store_ps(int32_t *p, int32_t x, int32_t y, int32_t z, int32_t w) { __m128i a = _mm_set_epi32(x, y, z, w); - _mm_store_ps((float *) p, *(const __m128 *) &a); + _mm_store_ps((float *) p, _mm_castsi128_ps(a)); ASSERT_RETURN(p[0] == w); ASSERT_RETURN(p[1] == z); ASSERT_RETURN(p[2] == y); @@ -850,7 +850,7 @@ result_t test_mm_and_ps(const SSE2NEONTestImpl &impl, uint32_t iter) r[2] = ia[2] & ib[2]; r[3] = ia[3] & ib[3]; __m128i ret = 
do_mm_set_epi32(r[3], r[2], r[1], r[0]); - result_t res = VALIDATE_INT32_M128(*(const __m128i *) &c, r); + result_t res = VALIDATE_INT32_M128(_mm_castps_si128(c), r); if (res) { res = VALIDATE_INT32_M128(ret, r); } @@ -879,7 +879,7 @@ result_t test_mm_andnot_ps(const SSE2NEONTestImpl &impl, uint32_t iter) r[3] = ~ia[3] & ib[3]; __m128i ret = do_mm_set_epi32(r[3], r[2], r[1], r[0]); result_t res = TEST_FAIL; - res = VALIDATE_INT32_M128(*(const __m128i *) &c, r); + res = VALIDATE_INT32_M128(_mm_castps_si128(c), r); if (res) { res = VALIDATE_INT32_M128(ret, r); } @@ -938,7 +938,7 @@ result_t test_mm_cmpeq_ps(const SSE2NEONTestImpl &impl, uint32_t iter) result[3] = _a[3] == _b[3] ? -1 : 0; __m128 ret = _mm_cmpeq_ps(a, b); - __m128i iret = *(const __m128i *) &ret; + __m128i iret = _mm_castps_si128(ret); return VALIDATE_INT32_M128(iret, result); } @@ -973,7 +973,7 @@ result_t test_mm_cmpge_ps(const SSE2NEONTestImpl &impl, uint32_t iter) result[3] = _a[3] >= _b[3] ? -1 : 0; __m128 ret = _mm_cmpge_ps(a, b); - __m128i iret = *(const __m128i *) &ret; + __m128i iret = _mm_castps_si128(ret); return VALIDATE_INT32_M128(iret, result); } @@ -1008,7 +1008,7 @@ result_t test_mm_cmpgt_ps(const SSE2NEONTestImpl &impl, uint32_t iter) result[3] = _a[3] > _b[3] ? -1 : 0; __m128 ret = _mm_cmpgt_ps(a, b); - __m128i iret = *(const __m128i *) &ret; + __m128i iret = _mm_castps_si128(ret); return VALIDATE_INT32_M128(iret, result); } @@ -1043,7 +1043,7 @@ result_t test_mm_cmple_ps(const SSE2NEONTestImpl &impl, uint32_t iter) result[3] = _a[3] <= _b[3] ? -1 : 0; __m128 ret = _mm_cmple_ps(a, b); - __m128i iret = *(const __m128i *) &ret; + __m128i iret = _mm_castps_si128(ret); return VALIDATE_INT32_M128(iret, result); } @@ -1078,7 +1078,7 @@ result_t test_mm_cmplt_ps(const SSE2NEONTestImpl &impl, uint32_t iter) result[3] = _a[3] < _b[3] ? -1 : 0; __m128 ret = _mm_cmplt_ps(a, b); - __m128i iret = *(const __m128i *) &ret; + __m128i iret = _mm_castps_si128(ret); return VALIDATE_INT32_M128(iret, result); } @@ -1114,7 +1114,7 @@ result_t test_mm_cmpneq_ps(const SSE2NEONTestImpl &impl, uint32_t iter) result[3] = _a[3] != _b[3] ? 
-1 : 0; __m128 ret = _mm_cmpneq_ps(a, b); - __m128i iret = *(const __m128i *) &ret; + __m128i iret = _mm_castps_si128(ret); return VALIDATE_INT32_M128(iret, result); } @@ -2519,7 +2519,7 @@ result_t test_mm_or_ps(const SSE2NEONTestImpl &impl, uint32_t iter) r[2] = ia[2] | ib[2]; r[3] = ia[3] | ib[3]; __m128i ret = do_mm_set_epi32(r[3], r[2], r[1], r[0]); - result_t res = VALIDATE_INT32_M128(*(const __m128i *) &c, r); + result_t res = VALIDATE_INT32_M128(_mm_castps_si128(c), r); if (res) { res = VALIDATE_INT32_M128(ret, r); } @@ -2751,7 +2751,8 @@ result_t test_mm_set_ps1(const SSE2NEONTestImpl &impl, uint32_t iter) return validateFloat(ret, a, a, a, a); } -result_t test_mm_set_rounding_mode(const SSE2NEONTestImpl &impl, uint32_t iter) +OPTNONE result_t test_mm_set_rounding_mode(const SSE2NEONTestImpl &impl, + uint32_t iter) { const float *_a = impl.mTestFloatPointer1; result_t res_toward_zero, res_to_neg_inf, res_to_pos_inf, res_nearest; @@ -2980,7 +2981,7 @@ result_t test_mm_store_ps(const SSE2NEONTestImpl &impl, uint32_t iter) int32_t z = impl.mTestInts[iter + 2]; int32_t w = impl.mTestInts[iter + 3]; __m128i a = _mm_set_epi32(x, y, z, w); - _mm_store_ps((float *) p, *(const __m128 *) &a); + _mm_store_ps((float *) p, _mm_castsi128_ps(a)); ASSERT_RETURN(p[0] == w); ASSERT_RETURN(p[1] == z); ASSERT_RETURN(p[2] == y); @@ -3255,18 +3256,16 @@ result_t test_mm_xor_ps(const SSE2NEONTestImpl &impl, uint32_t iter) { const int32_t *_a = (const int32_t *) impl.mTestFloatPointer1; const int32_t *_b = (const int32_t *) impl.mTestFloatPointer2; - - int32_t d0 = _a[0] ^ _b[0]; - int32_t d1 = _a[1] ^ _b[1]; - int32_t d2 = _a[2] ^ _b[2]; - int32_t d3 = _a[3] ^ _b[3]; + float d0 = sse2neon_tool_recast_f32(_a[0] ^ _b[0]); + float d1 = sse2neon_tool_recast_f32(_a[1] ^ _b[1]); + float d2 = sse2neon_tool_recast_f32(_a[2] ^ _b[2]); + float d3 = sse2neon_tool_recast_f32(_a[3] ^ _b[3]); __m128 a = load_m128(_a); __m128 b = load_m128(_b); __m128 c = _mm_xor_ps(a, b); - return validateFloat(c, *((float *) &d0), *((float *) &d1), - *((float *) &d2), *((float *) &d3)); + return validateFloat(c, d0, d1, d2, d3); } /* SSE2 */ @@ -3552,15 +3551,14 @@ result_t test_mm_and_pd(const SSE2NEONTestImpl &impl, uint32_t iter) { const int64_t *_a = (const int64_t *) impl.mTestFloatPointer1; const int64_t *_b = (const int64_t *) impl.mTestFloatPointer2; - - int64_t d0 = _a[0] & _b[0]; - int64_t d1 = _a[1] & _b[1]; + double d0 = sse2neon_tool_recast_f64(_a[0] & _b[0]); + double d1 = sse2neon_tool_recast_f64(_a[1] & _b[1]); __m128d a = load_m128d(_a); __m128d b = load_m128d(_b); __m128d c = _mm_and_pd(a, b); - return validateDouble(c, *((double *) &d0), *((double *) &d1)); + return validateDouble(c, d0, d1); } result_t test_mm_and_si128(const SSE2NEONTestImpl &impl, uint32_t iter) @@ -3569,8 +3567,8 @@ result_t test_mm_and_si128(const SSE2NEONTestImpl &impl, uint32_t iter) const int32_t *_b = impl.mTestIntPointer2; __m128i a = load_m128i(_a); __m128i b = load_m128i(_b); - __m128 fc = _mm_and_ps(*(const __m128 *) &a, *(const __m128 *) &b); - __m128i c = *(const __m128i *) &fc; + __m128 fc = _mm_and_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b)); + __m128i c = _mm_castps_si128(fc); // now for the assertion... 
const uint32_t *ia = (const uint32_t *) &a; const uint32_t *ib = (const uint32_t *) &b; @@ -3603,7 +3601,7 @@ result_t test_mm_andnot_pd(const SSE2NEONTestImpl &impl, uint32_t iter) const uint64_t *ib = (const uint64_t *) &b; uint64_t r0 = ~ia[0] & ib[0]; uint64_t r1 = ~ia[1] & ib[1]; - return validateUInt64(*(const __m128i *) &c, r0, r1); + return validateUInt64(_mm_castpd_si128(c), r0, r1); } result_t test_mm_andnot_si128(const SSE2NEONTestImpl &impl, uint32_t iter) @@ -3612,8 +3610,8 @@ result_t test_mm_andnot_si128(const SSE2NEONTestImpl &impl, uint32_t iter) const int32_t *_b = impl.mTestIntPointer2; __m128i a = load_m128i(_a); __m128i b = load_m128i(_b); - __m128 fc = _mm_andnot_ps(*(const __m128 *) &a, *(const __m128 *) &b); - __m128i c = *(const __m128i *) &fc; + __m128 fc = _mm_andnot_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b)); + __m128i c = _mm_castps_si128(fc); // now for the assertion... const uint32_t *ia = (const uint32_t *) &a; const uint32_t *ib = (const uint32_t *) &b; @@ -3832,55 +3830,55 @@ result_t test_mm_cmpeq_pd(const SSE2NEONTestImpl &impl, uint32_t iter) { const double *_a = (const double *) impl.mTestFloatPointer1; const double *_b = (const double *) impl.mTestFloatPointer2; - uint64_t d0 = (_a[0] == _b[0]) ? 0xffffffffffffffff : 0; - uint64_t d1 = (_a[1] == _b[1]) ? 0xffffffffffffffff : 0; + double d0 = (_a[0] == _b[0]) ? sse2neon_tool_recast_f64(UINT64_MAX) : 0; + double d1 = (_a[1] == _b[1]) ? sse2neon_tool_recast_f64(UINT64_MAX) : 0; __m128d a = load_m128d(_a); __m128d b = load_m128d(_b); __m128d c = _mm_cmpeq_pd(a, b); - return validateDouble(c, *(double *) &d0, *(double *) &d1); + return validateDouble(c, d0, d1); } result_t test_mm_cmpeq_sd(const SSE2NEONTestImpl &impl, uint32_t iter) { const double *_a = (const double *) impl.mTestFloatPointer1; const double *_b = (const double *) impl.mTestFloatPointer2; - const uint64_t d0 = (_a[0] == _b[0]) ? ~UINT64_C(0) : 0; - const uint64_t d1 = ((const uint64_t *) _a)[1]; + double d0 = (_a[0] == _b[0]) ? ALL_BIT_1_64 : 0; + double d1 = _a[1]; __m128d a = load_m128d(_a); __m128d b = load_m128d(_b); __m128d c = _mm_cmpeq_sd(a, b); - return validateDouble(c, *(const double *) &d0, *(const double *) &d1); + return validateDouble(c, d0, d1); } result_t test_mm_cmpge_pd(const SSE2NEONTestImpl &impl, uint32_t iter) { const double *_a = (const double *) impl.mTestFloatPointer1; const double *_b = (const double *) impl.mTestFloatPointer2; - uint64_t d0 = (_a[0] >= _b[0]) ? ~UINT64_C(0) : 0; - uint64_t d1 = (_a[1] >= _b[1]) ? ~UINT64_C(0) : 0; + double d0 = (_a[0] >= _b[0]) ? ALL_BIT_1_64 : 0; + double d1 = (_a[1] >= _b[1]) ? ALL_BIT_1_64 : 0; __m128d a = load_m128d(_a); __m128d b = load_m128d(_b); __m128d c = _mm_cmpge_pd(a, b); - return validateDouble(c, *(double *) &d0, *(double *) &d1); + return validateDouble(c, d0, d1); } result_t test_mm_cmpge_sd(const SSE2NEONTestImpl &impl, uint32_t iter) { double *_a = (double *) impl.mTestFloatPointer1; double *_b = (double *) impl.mTestFloatPointer2; - uint64_t d0 = (_a[0] >= _b[0]) ? ~UINT64_C(0) : 0; - uint64_t d1 = ((uint64_t *) _a)[1]; + double d0 = (_a[0] >= _b[0]) ? 
ALL_BIT_1_64 : 0; + double d1 = _a[1]; __m128d a = load_m128d(_a); __m128d b = load_m128d(_b); __m128d c = _mm_cmpge_sd(a, b); - return validateDouble(c, *(double *) &d0, *(double *) &d1); + return validateDouble(c, d0, d1); } result_t test_mm_cmpgt_epi16(const SSE2NEONTestImpl &impl, uint32_t iter) @@ -3954,56 +3952,56 @@ result_t test_mm_cmpgt_pd(const SSE2NEONTestImpl &impl, uint32_t iter) { const double *_a = (const double *) impl.mTestFloatPointer1; const double *_b = (const double *) impl.mTestFloatPointer2; - uint64_t d0 = (_a[0] > _b[0]) ? ~UINT64_C(0) : 0; - uint64_t d1 = (_a[1] > _b[1]) ? ~UINT64_C(0) : 0; + double d0 = (_a[0] > _b[0]) ? ALL_BIT_1_64 : 0; + double d1 = (_a[1] > _b[1]) ? ALL_BIT_1_64 : 0; __m128d a = load_m128d(_a); __m128d b = load_m128d(_b); __m128d c = _mm_cmpgt_pd(a, b); - return validateDouble(c, *(double *) &d0, *(double *) &d1); + return validateDouble(c, d0, d1); } result_t test_mm_cmpgt_sd(const SSE2NEONTestImpl &impl, uint32_t iter) { double *_a = (double *) impl.mTestFloatPointer1; double *_b = (double *) impl.mTestFloatPointer2; - uint64_t d0 = (_a[0] > _b[0]) ? ~UINT64_C(0) : 0; - uint64_t d1 = ((uint64_t *) _a)[1]; + double d0 = (_a[0] > _b[0]) ? ALL_BIT_1_64 : 0; + double d1 = _a[1]; __m128d a = load_m128d(_a); __m128d b = load_m128d(_b); __m128d c = _mm_cmpgt_sd(a, b); - return validateDouble(c, *(double *) &d0, *(double *) &d1); + return validateDouble(c, d0, d1); } result_t test_mm_cmple_pd(const SSE2NEONTestImpl &impl, uint32_t iter) { const double *_a = (const double *) impl.mTestFloatPointer1; const double *_b = (const double *) impl.mTestFloatPointer2; - uint64_t d0 = (_a[0] <= _b[0]) ? ~UINT64_C(0) : 0; - uint64_t d1 = (_a[1] <= _b[1]) ? ~UINT64_C(0) : 0; + double d0 = (_a[0] <= _b[0]) ? ALL_BIT_1_64 : 0; + double d1 = (_a[1] <= _b[1]) ? ALL_BIT_1_64 : 0; __m128d a = load_m128d(_a); __m128d b = load_m128d(_b); __m128d c = _mm_cmple_pd(a, b); - return validateDouble(c, *(double *) &d0, *(double *) &d1); + return validateDouble(c, d0, d1); } result_t test_mm_cmple_sd(const SSE2NEONTestImpl &impl, uint32_t iter) { double *_a = (double *) impl.mTestFloatPointer1; double *_b = (double *) impl.mTestFloatPointer2; - uint64_t d0 = (_a[0] <= _b[0]) ? ~UINT64_C(0) : 0; - uint64_t d1 = ((uint64_t *) _a)[1]; + double d0 = (_a[0] <= _b[0]) ? ALL_BIT_1_64 : 0; + double d1 = _a[1]; __m128d a = load_m128d(_a); __m128d b = load_m128d(_b); __m128d c = _mm_cmple_sd(a, b); - return validateDouble(c, *(double *) &d0, *(double *) &d1); + return validateDouble(c, d0, d1); } result_t test_mm_cmplt_epi16(const SSE2NEONTestImpl &impl, uint32_t iter) @@ -4076,171 +4074,168 @@ result_t test_mm_cmplt_pd(const SSE2NEONTestImpl &impl, uint32_t iter) { const double *_a = (const double *) impl.mTestFloatPointer1; const double *_b = (const double *) impl.mTestFloatPointer2; - - int64_t f0 = (_a[0] < _b[0]) ? ~UINT64_C(0) : UINT64_C(0); - int64_t f1 = (_a[1] < _b[1]) ? ~UINT64_C(0) : UINT64_C(0); + double d0 = (_a[0] < _b[0]) ? ALL_BIT_1_64 : UINT64_C(0); + double d1 = (_a[1] < _b[1]) ? ALL_BIT_1_64 : UINT64_C(0); __m128d a = load_m128d(_a); __m128d b = load_m128d(_b); __m128d c = _mm_cmplt_pd(a, b); - return validateDouble(c, *(double *) &f0, *(double *) &f1); + return validateDouble(c, d0, d1); } result_t test_mm_cmplt_sd(const SSE2NEONTestImpl &impl, uint32_t iter) { double *_a = (double *) impl.mTestFloatPointer1; double *_b = (double *) impl.mTestFloatPointer2; - uint64_t d0 = (_a[0] < _b[0]) ? 
~UINT64_C(0) : 0; - uint64_t d1 = ((uint64_t *) _a)[1]; + double d0 = (_a[0] < _b[0]) ? ALL_BIT_1_64 : 0; + double d1 = _a[1]; __m128d a = load_m128d(_a); __m128d b = load_m128d(_b); __m128d c = _mm_cmplt_sd(a, b); - return validateDouble(c, *(double *) &d0, *(double *) &d1); + return validateDouble(c, d0, d1); } result_t test_mm_cmpneq_pd(const SSE2NEONTestImpl &impl, uint32_t iter) { const double *_a = (const double *) impl.mTestFloatPointer1; const double *_b = (const double *) impl.mTestFloatPointer2; - - int64_t f0 = (_a[0] != _b[0]) ? ~UINT64_C(0) : UINT64_C(0); - int64_t f1 = (_a[1] != _b[1]) ? ~UINT64_C(0) : UINT64_C(0); + double d0 = (_a[0] != _b[0]) ? ALL_BIT_1_64 : UINT64_C(0); + double d1 = (_a[1] != _b[1]) ? ALL_BIT_1_64 : UINT64_C(0); __m128d a = load_m128d(_a); __m128d b = load_m128d(_b); __m128d c = _mm_cmpneq_pd(a, b); - return validateDouble(c, *(double *) &f0, *(double *) &f1); + return validateDouble(c, d0, d1); } result_t test_mm_cmpneq_sd(const SSE2NEONTestImpl &impl, uint32_t iter) { double *_a = (double *) impl.mTestFloatPointer1; double *_b = (double *) impl.mTestFloatPointer2; - - int64_t f0 = (_a[0] != _b[0]) ? ~UINT64_C(0) : UINT64_C(0); - int64_t f1 = ((int64_t *) _a)[1]; + double d0 = (_a[0] != _b[0]) ? ALL_BIT_1_64 : UINT64_C(0); + double d1 = _a[1]; __m128d a = load_m128d(_a); __m128d b = load_m128d(_b); __m128d c = _mm_cmpneq_sd(a, b); - return validateDouble(c, *(double *) &f0, *(double *) &f1); + return validateDouble(c, d0, d1); } result_t test_mm_cmpnge_pd(const SSE2NEONTestImpl &impl, uint32_t iter) { const double *_a = (const double *) impl.mTestFloatPointer1; const double *_b = (const double *) impl.mTestFloatPointer2; - uint64_t d0 = !(_a[0] >= _b[0]) ? ~UINT64_C(0) : 0; - uint64_t d1 = !(_a[1] >= _b[1]) ? ~UINT64_C(0) : 0; + double d0 = !(_a[0] >= _b[0]) ? ALL_BIT_1_64 : 0; + double d1 = !(_a[1] >= _b[1]) ? ALL_BIT_1_64 : 0; __m128d a = load_m128d(_a); __m128d b = load_m128d(_b); __m128d c = _mm_cmpnge_pd(a, b); - return validateDouble(c, *(double *) &d0, *(double *) &d1); + return validateDouble(c, d0, d1); } result_t test_mm_cmpnge_sd(const SSE2NEONTestImpl &impl, uint32_t iter) { double *_a = (double *) impl.mTestFloatPointer1; double *_b = (double *) impl.mTestFloatPointer2; - uint64_t d0 = !(_a[0] >= _b[0]) ? ~UINT64_C(0) : 0; - uint64_t d1 = ((uint64_t *) _a)[1]; + double d0 = !(_a[0] >= _b[0]) ? ALL_BIT_1_64 : 0; + double d1 = _a[1]; __m128d a = load_m128d(_a); __m128d b = load_m128d(_b); __m128d c = _mm_cmpnge_sd(a, b); - return validateDouble(c, *(double *) &d0, *(double *) &d1); + return validateDouble(c, d0, d1); } result_t test_mm_cmpngt_pd(const SSE2NEONTestImpl &impl, uint32_t iter) { const double *_a = (const double *) impl.mTestFloatPointer1; const double *_b = (const double *) impl.mTestFloatPointer2; - uint64_t d0 = !(_a[0] > _b[0]) ? ~UINT64_C(0) : 0; - uint64_t d1 = !(_a[1] > _b[1]) ? ~UINT64_C(0) : 0; + double d0 = !(_a[0] > _b[0]) ? ALL_BIT_1_64 : 0; + double d1 = !(_a[1] > _b[1]) ? ALL_BIT_1_64 : 0; __m128d a = load_m128d(_a); __m128d b = load_m128d(_b); __m128d c = _mm_cmpngt_pd(a, b); - return validateDouble(c, *(double *) &d0, *(double *) &d1); + return validateDouble(c, d0, d1); } result_t test_mm_cmpngt_sd(const SSE2NEONTestImpl &impl, uint32_t iter) { double *_a = (double *) impl.mTestFloatPointer1; double *_b = (double *) impl.mTestFloatPointer2; - uint64_t d0 = !(_a[0] > _b[0]) ? ~UINT64_C(0) : 0; - uint64_t d1 = ((uint64_t *) _a)[1]; + double d0 = !(_a[0] > _b[0]) ? 
ALL_BIT_1_64 : 0; + double d1 = _a[1]; __m128d a = load_m128d(_a); __m128d b = load_m128d(_b); __m128d c = _mm_cmpngt_sd(a, b); - return validateDouble(c, *(double *) &d0, *(double *) &d1); + return validateDouble(c, d0, d1); } result_t test_mm_cmpnle_pd(const SSE2NEONTestImpl &impl, uint32_t iter) { const double *_a = (const double *) impl.mTestFloatPointer1; const double *_b = (const double *) impl.mTestFloatPointer2; - uint64_t d0 = !(_a[0] <= _b[0]) ? ~UINT64_C(0) : 0; - uint64_t d1 = !(_a[1] <= _b[1]) ? ~UINT64_C(0) : 0; + double d0 = !(_a[0] <= _b[0]) ? ALL_BIT_1_64 : 0; + double d1 = !(_a[1] <= _b[1]) ? ALL_BIT_1_64 : 0; __m128d a = load_m128d(_a); __m128d b = load_m128d(_b); __m128d c = _mm_cmpnle_pd(a, b); - return validateDouble(c, *(double *) &d0, *(double *) &d1); + return validateDouble(c, d0, d1); } result_t test_mm_cmpnle_sd(const SSE2NEONTestImpl &impl, uint32_t iter) { double *_a = (double *) impl.mTestFloatPointer1; double *_b = (double *) impl.mTestFloatPointer2; - uint64_t d0 = !(_a[0] <= _b[0]) ? ~UINT64_C(0) : 0; - uint64_t d1 = ((uint64_t *) _a)[1]; + double d0 = !(_a[0] <= _b[0]) ? ALL_BIT_1_64 : 0; + double d1 = _a[1]; __m128d a = load_m128d(_a); __m128d b = load_m128d(_b); __m128d c = _mm_cmpnle_sd(a, b); - return validateDouble(c, *(double *) &d0, *(double *) &d1); + return validateDouble(c, d0, d1); } result_t test_mm_cmpnlt_pd(const SSE2NEONTestImpl &impl, uint32_t iter) { const double *_a = (const double *) impl.mTestFloatPointer1; const double *_b = (const double *) impl.mTestFloatPointer2; - uint64_t d0 = !(_a[0] < _b[0]) ? ~UINT64_C(0) : 0; - uint64_t d1 = !(_a[1] < _b[1]) ? ~UINT64_C(0) : 0; + double d0 = !(_a[0] < _b[0]) ? ALL_BIT_1_64 : 0; + double d1 = !(_a[1] < _b[1]) ? ALL_BIT_1_64 : 0; __m128d a = load_m128d(_a); __m128d b = load_m128d(_b); __m128d c = _mm_cmpnlt_pd(a, b); - return validateDouble(c, *(double *) &d0, *(double *) &d1); + return validateDouble(c, d0, d1); } result_t test_mm_cmpnlt_sd(const SSE2NEONTestImpl &impl, uint32_t iter) { double *_a = (double *) impl.mTestFloatPointer1; double *_b = (double *) impl.mTestFloatPointer2; - uint64_t d0 = !(_a[0] < _b[0]) ? ~UINT64_C(0) : 0; - uint64_t d1 = ((uint64_t *) _a)[1]; + double d0 = !(_a[0] < _b[0]) ? 
ALL_BIT_1_64 : 0; + double d1 = _a[1]; __m128d a = load_m128d(_a); __m128d b = load_m128d(_b); __m128d c = _mm_cmpnlt_sd(a, b); - return validateDouble(c, *(double *) &d0, *(double *) &d1); + return validateDouble(c, d0, d1); } result_t test_mm_cmpord_pd(const SSE2NEONTestImpl &impl, uint32_t iter) @@ -4444,7 +4439,8 @@ result_t test_mm_cvtepi32_ps(const SSE2NEONTestImpl &impl, uint32_t iter) return validateFloat(ret, trun[0], trun[1], trun[2], trun[3]); } -result_t test_mm_cvtpd_epi32(const SSE2NEONTestImpl &impl, uint32_t iter) +OPTNONE result_t test_mm_cvtpd_epi32(const SSE2NEONTestImpl &impl, + uint32_t iter) { const double *_a = (const double *) impl.mTestFloatPointer1; int32_t d[2] = {}; @@ -5504,15 +5500,14 @@ result_t test_mm_or_pd(const SSE2NEONTestImpl &impl, uint32_t iter) { const int64_t *_a = (const int64_t *) impl.mTestFloatPointer1; const int64_t *_b = (const int64_t *) impl.mTestFloatPointer2; - - int64_t d0 = _a[0] | _b[0]; - int64_t d1 = _a[1] | _b[1]; + double d0 = sse2neon_tool_recast_f64(_a[0] | _b[0]); + double d1 = sse2neon_tool_recast_f64(_a[1] | _b[1]); __m128d a = load_m128d(_a); __m128d b = load_m128d(_b); __m128d c = _mm_or_pd(a, b); - return validateDouble(c, *((double *) &d0), *((double *) &d1)); + return validateDouble(c, d0, d1); } result_t test_mm_or_si128(const SSE2NEONTestImpl &impl, uint32_t iter) @@ -5521,8 +5516,8 @@ result_t test_mm_or_si128(const SSE2NEONTestImpl &impl, uint32_t iter) const int32_t *_b = impl.mTestIntPointer2; __m128i a = load_m128i(_a); __m128i b = load_m128i(_b); - __m128 fc = _mm_or_ps(*(const __m128 *) &a, *(const __m128 *) &b); - __m128i c = *(const __m128i *) &fc; + __m128 fc = _mm_or_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b)); + __m128i c = _mm_castps_si128(fc); // now for the assertion... 
const uint32_t *ia = (const uint32_t *) &a; const uint32_t *ib = (const uint32_t *) &b; @@ -7092,15 +7087,14 @@ result_t test_mm_xor_pd(const SSE2NEONTestImpl &impl, uint32_t iter) { const int64_t *_a = (const int64_t *) impl.mTestFloatPointer1; const int64_t *_b = (const int64_t *) impl.mTestFloatPointer2; - - int64_t d0 = _a[0] ^ _b[0]; - int64_t d1 = _a[1] ^ _b[1]; + double d0 = sse2neon_tool_recast_f64(_a[0] ^ _b[0]); + double d1 = sse2neon_tool_recast_f64(_a[1] ^ _b[1]); __m128d a = load_m128d(_a); __m128d b = load_m128d(_b); __m128d c = _mm_xor_pd(a, b); - return validateDouble(c, *((double *) &d0), *((double *) &d1)); + return validateDouble(c, d0, d1); } result_t test_mm_xor_si128(const SSE2NEONTestImpl &impl, uint32_t iter) @@ -8095,7 +8089,8 @@ result_t test_mm_blendv_pd(const SSE2NEONTestImpl &impl, uint32_t iter) for (int i = 0; i < 2; i++) { // signed shift right would return a result which is either all 1's from // negative numbers or all 0's from positive numbers - if ((*(const int64_t *) (_mask + i)) >> 63) { + int64_t m = sse2neon_tool_recast_i64(_mask[i]); + if (m >> 63) { _c[i] = _b[i]; } else { _c[i] = _a[i]; @@ -8425,7 +8420,7 @@ result_t test_mm_cvtepu8_epi64(const SSE2NEONTestImpl &impl, uint32_t iter) MM_DP_PD_TEST_CASE_WITH(0x22); \ MM_DP_PD_TEST_CASE_WITH(0x23); -result_t test_mm_dp_pd(const SSE2NEONTestImpl &impl, uint32_t iter) +OPTNONE result_t test_mm_dp_pd(const SSE2NEONTestImpl &impl, uint32_t iter) { GENERATE_MM_DP_PD_TEST_CASES return TEST_SUCCESS; @@ -8460,7 +8455,7 @@ result_t test_mm_dp_pd(const SSE2NEONTestImpl &impl, uint32_t iter) MM_DP_PS_TEST_CASE_WITH(0x23); \ MM_DP_PS_TEST_CASE_WITH(0xB5); -result_t test_mm_dp_ps(const SSE2NEONTestImpl &impl, uint32_t iter) +OPTNONE result_t test_mm_dp_ps(const SSE2NEONTestImpl &impl, uint32_t iter) { GENERATE_MM_DP_PS_TEST_CASES return TEST_SUCCESS; @@ -11819,8 +11814,8 @@ result_t test_mm_popcnt_u64(const SSE2NEONTestImpl &impl, uint32_t iter) return TEST_SUCCESS; } -result_t test_mm_set_denormals_zero_mode(const SSE2NEONTestImpl &impl, - uint32_t iter) +OPTNONE result_t test_mm_set_denormals_zero_mode(const SSE2NEONTestImpl &impl, + uint32_t iter) { result_t res_set_denormals_zero_on, res_set_denormals_zero_off; float factor = 2;
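The expected-value computations in the hunks above lean on three test-side helpers -- ALL_BIT_1_64, sse2neon_tool_recast_f64() and sse2neon_tool_recast_i64() -- that come from the tests/common.h portion of this patch, which is not visible in this excerpt. As a reading aid, the sketch below shows one plausible memcpy-based shape for them; only the names are taken from the hunks, and the signatures, the ALL_BIT_1_64 definition, and the driver are assumptions, not the patch's actual definitions.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Assumed shape of the helpers called in the hunks above; only the names are
 * taken from the patch, the bodies here are illustrative. */
static inline double sse2neon_tool_recast_f64(int64_t i64)
{
    double f64;
    memcpy(&f64, &i64, sizeof(f64));
    return f64;
}

static inline int64_t sse2neon_tool_recast_i64(double f64)
{
    int64_t i64;
    memcpy(&i64, &f64, sizeof(i64));
    return i64;
}

/* Assumed definition: the all-ones 64-bit pattern viewed as a double (a NaN),
 * which is what a true SSE2 double-comparison lane contains. */
#define ALL_BIT_1_64 sse2neon_tool_recast_f64((int64_t) -1)

int main(void)
{
    /* Mirrors the expected-value computation in test_mm_cmpgt_pd: build the
     * all-ones lane as a double value instead of dereferencing a uint64_t
     * through a pointer into the double array. */
    double a0 = 1.0, b0 = 0.5;
    double d0 = (a0 > b0) ? ALL_BIT_1_64 : 0;

    /* Mirrors test_mm_blendv_pd: read a double's bit pattern without the
     * (const int64_t *) cast that violated strict aliasing. */
    int64_t m = sse2neon_tool_recast_i64(d0);
    printf("lane bits = %016llx\n", (unsigned long long) (uint64_t) m);
    return (uint64_t) m == UINT64_C(0xFFFFFFFFFFFFFFFF) ? 0 : 1;
}

With helpers of this shape, every expected lane is produced and consumed as a plain double value, which is why the validateDouble() calls in the hunks above no longer need the *(double *) &d0 reinterpretations.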