diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index cd11423e..a143629c 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -56,10 +56,10 @@ jobs:
     strategy:
       matrix:
         arch_with_features: [
-          {arch: armv7, feature: none, arch_cflags: none},
-          {arch: aarch64, feature: none, arch_cflags: none},
-          {arch: aarch64, feature: crypto+crc, arch_cflags: none},
-          {arch: armv7, feature: none, arch_cflags: '-mcpu=cortex-a32 -mfpu=neon-fp-armv8'}
+          # {arch: armv7, feature: none, arch_cflags: none},
+          {arch: aarch64, feature: none, arch_cflags: -O3},
+          # {arch: aarch64, feature: crypto+crc, arch_cflags: none},
+          # {arch: armv7, feature: none, arch_cflags: '-mcpu=cortex-a32 -mfpu=neon-fp-armv8'}
         ]
         cxx_compiler: [g++-10, clang++-11]
     steps:
@@ -80,9 +80,7 @@ jobs:
             apt-get install -q -y "${{ matrix.cxx_compiler }}" make
             apt-get install -q -y gcc
           run: |
-            make FEATURE="${{ matrix.arch_with_features.feature }} -O0" check
-            make FEATURE="${{ matrix.arch_with_features.feature }} -O1" check
-            make FEATURE="${{ matrix.arch_with_features.feature }} -O2" check
+            make FEATURE=${{ matrix.arch_with_features.feature }} check
 
   host-win-msvc:
     runs-on: windows-2022
diff --git a/sse2neon.h b/sse2neon.h
index 8ecfc753..0791cfbd 100644
--- a/sse2neon.h
+++ b/sse2neon.h
@@ -566,6 +566,17 @@ typedef union ALIGN_STRUCT(16) SIMDVec {
 #define _MM_GET_DENORMALS_ZERO_MODE _sse2neon_mm_get_denormals_zero_mode
 #define _MM_SET_DENORMALS_ZERO_MODE _sse2neon_mm_set_denormals_zero_mode
 
+typedef union bit64_union_t {
+    double f64;
+    int64_t i64;
+    uint64_t u64;
+} bit64_union_t;
+typedef union bit32_union_t {
+    float f32;
+    int32_t i32;
+    uint32_t u32;
+} bit32_union_t;
+
 // Function declaration
 // SSE
 FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void);
@@ -3257,13 +3268,14 @@ FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
     return vreinterpretq_m128d_u64(
         vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    bit64_union_t a0, a1, b0, b1;
+    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
+    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
+    b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
     uint64_t d[2];
-    d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = (*(double *) &a1) >= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
+    d[0] = a0.f64 >= b0.f64 ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1.f64 >= b1.f64 ? ~UINT64_C(0) : UINT64_C(0);
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
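[Annotation, not part of the patch] The heart of this change is swapping pointer type punning for union type punning. A minimal standalone sketch of the two idioms: reading a uint64_t's bits through a double *, as the removed lines did, violates strict aliasing and is undefined behavior, while writing one union member and reading another is well defined in C11 (and widely supported as an extension in C++).

    #include <stdint.h>
    #include <stdio.h>

    typedef union {
        double f64;
        uint64_t u64;
    } pun64; /* same shape as the patch's bit64_union_t */

    int main(void)
    {
        uint64_t bits = UINT64_C(0x3FF0000000000000); /* bit pattern of 1.0 */

        /* Removed idiom (undefined behavior): *(double *) &bits */
        /* Added idiom (well defined): */
        pun64 p;
        p.u64 = bits;          /* store through one member... */
        printf("%f\n", p.f64); /* ...load through another: prints 1.000000 */
        return 0;
    }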
@@ -3279,12 +3291,13 @@ FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
     return _mm_move_sd(a, _mm_cmpge_pd(a, b));
 #else
     // expand "_mm_cmpge_pd()" to reduce unnecessary operations
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    bit64_union_t a0, a1, b0;
+    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
+    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
     uint64_t d[2];
-    d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = a1;
+    d[0] = a0.f64 >= b0.f64 ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1.u64;
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3326,13 +3339,14 @@ FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
     return vreinterpretq_m128d_u64(
         vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    bit64_union_t a0, a1, b0, b1;
+    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
+    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
+    b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
     uint64_t d[2];
-    d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = (*(double *) &a1) > (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
+    d[0] = a0.f64 > b0.f64 ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1.f64 > b1.f64 ? ~UINT64_C(0) : UINT64_C(0);
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3348,12 +3362,13 @@ FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
     return _mm_move_sd(a, _mm_cmpgt_pd(a, b));
 #else
     // expand "_mm_cmpge_pd()" to reduce unnecessary operations
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    bit64_union_t a0, a1, b0;
+    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
+    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
     uint64_t d[2];
-    d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = a1;
+    d[0] = a0.f64 > b0.f64 ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1.u64;
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
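[Annotation, not part of the patch] For readers unfamiliar with the convention these hunks preserve: SSE compare intrinsics do not return booleans, they return a per-lane mask of all ones (predicate holds) or all zeros, which callers feed into bitwise blends. A minimal scalar model of one lane, assuming nothing beyond the standard headers:

    #include <stdint.h>
    #include <stdio.h>

    /* One lane of a packed compare: 0xFFFFFFFFFFFFFFFF or 0. */
    static uint64_t lane_mask_ge(double x, double y)
    {
        return x >= y ? ~UINT64_C(0) : UINT64_C(0);
    }

    int main(void)
    {
        printf("%016llx\n", (unsigned long long) lane_mask_ge(2.0, 1.0)); /* ffffffffffffffff */
        printf("%016llx\n", (unsigned long long) lane_mask_ge(0.0, 1.0)); /* 0000000000000000 */
        return 0;
    }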
@@ -3368,13 +3383,14 @@ FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
     return vreinterpretq_m128d_u64(
         vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    bit64_union_t a0, a1, b0, b1;
+    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
+    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
+    b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
     uint64_t d[2];
-    d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = (*(double *) &a1) <= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
+    d[0] = a0.f64 <= b0.f64 ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1.f64 <= b1.f64 ? ~UINT64_C(0) : UINT64_C(0);
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3390,12 +3406,13 @@ FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
     return _mm_move_sd(a, _mm_cmple_pd(a, b));
 #else
     // expand "_mm_cmpge_pd()" to reduce unnecessary operations
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    bit64_union_t a0, a1, b0;
+    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
+    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
     uint64_t d[2];
-    d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = a1;
+    d[0] = a0.f64 <= b0.f64 ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1.u64;
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3440,13 +3457,14 @@ FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
     return vreinterpretq_m128d_u64(
         vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    bit64_union_t a0, a1, b0, b1;
+    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
+    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
+    b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
     uint64_t d[2];
-    d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = (*(double *) &a1) < (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
+    d[0] = a0.f64 < b0.f64 ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1.f64 < b1.f64 ? ~UINT64_C(0) : UINT64_C(0);
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
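[Annotation, not part of the patch] The `_sd` hunks implement the scalar-compare convention: only lane 0 is compared, and lane 1 of `a` is passed through bit-for-bit, which is why the new code copies `a1.u64` rather than a mask. A hypothetical two-lane model of `_mm_cmplt_sd` (names are illustrative, not from the patch):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Hypothetical scalar model of _mm_cmplt_sd: compare lane 0 only, and
     * pass a's upper lane through bit-for-bit (even if it is a NaN). */
    static void cmplt_sd_model(const double a[2], const double b[2], uint64_t d[2])
    {
        d[0] = a[0] < b[0] ? ~UINT64_C(0) : UINT64_C(0);
        memcpy(&d[1], &a[1], sizeof d[1]);
    }

    int main(void)
    {
        double a[2] = {1.0, 42.0}, b[2] = {2.0, -1.0};
        uint64_t d[2];
        cmplt_sd_model(a, b, d);
        printf("%016llx %016llx\n", (unsigned long long) d[0],
               (unsigned long long) d[1]);
        return 0;
    }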
@@ -3461,12 +3479,13 @@ FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
 #if defined(__aarch64__) || defined(_M_ARM64)
     return _mm_move_sd(a, _mm_cmplt_pd(a, b));
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    bit64_union_t a0, a1, b0;
+    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
+    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
     uint64_t d[2];
-    d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = a1;
+    d[0] = a0.f64 < b0.f64 ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1.u64;
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3508,15 +3527,14 @@ FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
         vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
         vdupq_n_u64(UINT64_MAX)));
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    bit64_union_t a0, a1, b0, b1;
+    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
+    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
+    b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
     uint64_t d[2];
-    d[0] =
-        !((*(double *) &a0) >= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] =
-        !((*(double *) &a1) >= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
+    d[0] = !(a0.f64 >= b0.f64) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = !(a1.f64 >= b1.f64) ? ~UINT64_C(0) : UINT64_C(0);
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3541,15 +3559,14 @@ FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
         vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
         vdupq_n_u64(UINT64_MAX)));
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    bit64_union_t a0, a1, b0, b1;
+    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
+    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
+    b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
     uint64_t d[2];
-    d[0] =
-        !((*(double *) &a0) > (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] =
-        !((*(double *) &a1) > (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
+    d[0] = !(a0.f64 > b0.f64) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = !(a1.f64 > b1.f64) ? ~UINT64_C(0) : UINT64_C(0);
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
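[Annotation, not part of the patch] The `cmpn*` fallbacks above and below are written as `!(a OP b)` rather than as the complementary comparison, and that is deliberate: the two differ when an operand is NaN, where every ordered comparison is false. A quick demonstration using only standard C:

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        double a = NAN, b = 1.0;
        printf("a < b    : %d\n", a < b);     /* 0: ordered compare fails on NaN */
        printf("!(a >= b): %d\n", !(a >= b)); /* 1: NOT-GE is true for NaN */
        return 0;
    }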
@@ -3574,15 +3591,14 @@ FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b)
         vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
         vdupq_n_u64(UINT64_MAX)));
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    bit64_union_t a0, a1, b0, b1;
+    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
+    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
+    b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
     uint64_t d[2];
-    d[0] =
-        !((*(double *) &a0) <= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] =
-        !((*(double *) &a1) <= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
+    d[0] = !(a0.f64 <= b0.f64) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = !(a1.f64 <= b1.f64) ? ~UINT64_C(0) : UINT64_C(0);
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3607,15 +3623,14 @@ FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b)
         vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
         vdupq_n_u64(UINT64_MAX)));
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    bit64_union_t a0, a1, b0, b1;
+    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
+    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
+    b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
     uint64_t d[2];
-    d[0] =
-        !((*(double *) &a0) < (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] =
-        !((*(double *) &a1) < (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
+    d[0] = !(a0.f64 < b0.f64) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = !(a1.f64 < b1.f64) ? ~UINT64_C(0) : UINT64_C(0);
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3643,19 +3658,14 @@ FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
         vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
     return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b));
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    bit64_union_t a0, a1, b0, b1;
+    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
+    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
+    b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
     uint64_t d[2];
-    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
-            (*(double *) &b0) == (*(double *) &b0))
-               ? ~UINT64_C(0)
-               : UINT64_C(0);
-    d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
-            (*(double *) &b1) == (*(double *) &b1))
-               ? ~UINT64_C(0)
-               : UINT64_C(0);
+    d[0] = (a0.f64 == a0.f64 && b0.f64 == b0.f64) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = (a1.f64 == a1.f64 && b1.f64 == b1.f64) ? ~UINT64_C(0) : UINT64_C(0);
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
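[Annotation, not part of the patch] `_mm_cmpord_pd` above (and `_mm_cmpunord_pd` below) reduce "ordered" to self-comparison: under IEEE 754, `x == x` is false exactly when `x` is NaN. A one-lane sketch:

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        double x = NAN, y = 2.0;
        printf("ordered   : %d\n", x == x && y == y);    /* 0 */
        printf("unordered : %d\n", !(x == x && y == y)); /* 1 */
        return 0;
    }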
@@ -3670,15 +3680,13 @@ FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)
 #if defined(__aarch64__) || defined(_M_ARM64)
     return _mm_move_sd(a, _mm_cmpord_pd(a, b));
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    bit64_union_t a0, a1, b0;
+    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
+    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
     uint64_t d[2];
-    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
-            (*(double *) &b0) == (*(double *) &b0))
-               ? ~UINT64_C(0)
-               : UINT64_C(0);
-    d[1] = a1;
+    d[0] = (a0.f64 == a0.f64 && b0.f64 == b0.f64) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1.u64;
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3698,19 +3706,14 @@ FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
     return vreinterpretq_m128d_s32(
         vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b))));
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    bit64_union_t a0, a1, b0, b1;
+    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
+    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
+    b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
     uint64_t d[2];
-    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
-            (*(double *) &b0) == (*(double *) &b0))
-               ? UINT64_C(0)
-               : ~UINT64_C(0);
-    d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
-            (*(double *) &b1) == (*(double *) &b1))
-               ? UINT64_C(0)
-               : ~UINT64_C(0);
+    d[0] = (a0.f64 == a0.f64 && b0.f64 == b0.f64) ? UINT64_C(0) : ~UINT64_C(0);
+    d[1] = (a1.f64 == a1.f64 && b1.f64 == b1.f64) ? UINT64_C(0) : ~UINT64_C(0);
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3725,15 +3728,13 @@ FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
 #if defined(__aarch64__) || defined(_M_ARM64)
     return _mm_move_sd(a, _mm_cmpunord_pd(a, b));
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    bit64_union_t a0, a1, b0;
+    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
+    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
     uint64_t d[2];
-    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
-            (*(double *) &b0) == (*(double *) &b0))
-               ? UINT64_C(0)
-               : ~UINT64_C(0);
-    d[1] = a1;
+    d[0] = (a0.f64 == a0.f64 && b0.f64 == b0.f64) ? UINT64_C(0) : ~UINT64_C(0);
+    d[1] = a1.u64;
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3747,10 +3748,10 @@ FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b)
 #if defined(__aarch64__) || defined(_M_ARM64)
     return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1;
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-
-    return (*(double *) &a0 >= *(double *) &b0);
+    bit64_union_t a0, b0;
+    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
+    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
+    return a0.f64 >= b0.f64;
 #endif
 }
 
@@ -3762,10 +3763,11 @@ FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
 #if defined(__aarch64__) || defined(_M_ARM64)
     return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1;
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    bit64_union_t a0, b0;
+    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
+    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
 
-    return (*(double *) &a0 > *(double *) &b0);
+    return (a0.f64 > b0.f64);
 #endif
 }
 
@@ -3777,10 +3779,11 @@ FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)
 #if defined(__aarch64__) || defined(_M_ARM64)
     return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1;
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    bit64_union_t a0, b0;
+    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
+    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
 
-    return (*(double *) &a0 <= *(double *) &b0);
+    return (a0.f64 <= b0.f64);
 #endif
 }
 
@@ -3792,10 +3795,11 @@ FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
 #if defined(__aarch64__) || defined(_M_ARM64)
     return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1;
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    bit64_union_t a0, b0;
+    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
+    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
 
-    return (*(double *) &a0 < *(double *) &b0);
+    return (a0.f64 < b0.f64);
 #endif
 }
 
@@ -3892,9 +3896,10 @@ FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
     float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
     return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
 #else
-    float a0 = (float) ((double *) &a)[0];
-    float a1 = (float) ((double *) &a)[1];
-    return _mm_set_ps(0, 0, a1, a0);
+    bit64_union_t a0, a1;
+    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
+    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    return _mm_set_ps(0, 0, (float) a1.f64, (float) a0.f64);
 #endif
 }
 
@@ -3993,7 +3998,9 @@ FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
 #if defined(__aarch64__) || defined(_M_ARM64)
     return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
 #else
-    return ((double *) &a)[0];
+    bit64_union_t _a;
+    _a.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
+    return _a.f64;
 #endif
 }
 
@@ -4006,8 +4013,9 @@ FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
     return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
 #else
     __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
-    double ret = ((double *) &rnd)[0];
-    return (int32_t) ret;
+    bit64_union_t ret;
+    ret.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0);
+    return (int32_t) ret.f64;
 #endif
 }
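[Annotation, not part of the patch] `_mm_cvtsd_si32` above and `_mm_cvtsd_si64` below round according to the current rounding mode (hence the `_mm_round_pd(a, _MM_FROUND_CUR_DIRECTION)` step before the cast), while the `_mm_cvttsd_*` variants later in the diff truncate, which is what a plain C cast already does. A sketch of the difference in standard C:

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        double d = 2.7;
        /* cvtsd-style: round under the current mode (nearest by default) */
        printf("cvtsd : %d\n", (int) rint(d)); /* 3 */
        /* cvttsd-style: truncate toward zero, like a C cast */
        printf("cvttsd: %d\n", (int) d);       /* 2 */
        return 0;
    }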
@@ -4020,8 +4028,9 @@ FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)
     return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
 #else
     __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
-    double ret = ((double *) &rnd)[0];
-    return (int64_t) ret;
+    bit64_union_t ret;
+    ret.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0);
+    return (int64_t) ret.f64;
 #endif
 }
 
@@ -4042,8 +4051,10 @@ FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
         vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0),
         vreinterpretq_f32_m128(a), 0));
 #else
-    return vreinterpretq_m128_f32(vsetq_lane_f32((float) ((double *) &b)[0],
-                                                 vreinterpretq_f32_m128(a), 0));
+    bit64_union_t b0;
+    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32((float) b0.f64, vreinterpretq_f32_m128(a), 0));
 #endif
 }
 
@@ -4075,9 +4086,10 @@ FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b)
     return vreinterpretq_m128d_f64(
         vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
 #else
-    double bf = (double) b;
+    bit64_union_t bf;
+    bf.f64 = (double) b;
     return vreinterpretq_m128d_s64(
-        vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
+        vsetq_lane_s64(bf.i64, vreinterpretq_s64_m128d(a), 0));
 #endif
 }
 
@@ -4103,9 +4115,10 @@ FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b)
     return vreinterpretq_m128d_f64(
         vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
 #else
-    double bf = (double) b;
+    bit64_union_t bf;
+    bf.f64 = (double) b;
     return vreinterpretq_m128d_s64(
-        vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
+        vsetq_lane_s64(bf.i64, vreinterpretq_s64_m128d(a), 0));
 #endif
 }
 
@@ -4135,13 +4148,14 @@ FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd
 FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
 {
-    double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
+    bit64_union_t d;
+    d.f64 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
 #if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(
-        vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0));
+        vsetq_lane_f64(d.f64, vreinterpretq_f64_m128d(a), 0));
 #else
     return vreinterpretq_m128d_s64(
-        vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0));
+        vsetq_lane_s64(d.i64, vreinterpretq_s64_m128d(a), 0));
 #endif
 }
 
@@ -4150,9 +4164,10 @@ FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32
 FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
 {
-    double a0 = ((double *) &a)[0];
-    double a1 = ((double *) &a)[1];
-    return _mm_set_epi32(0, 0, (int32_t) a1, (int32_t) a0);
+    bit64_union_t a0, a1;
+    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
+    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    return _mm_set_epi32(0, 0, (int32_t) a1.f64, (int32_t) a0.f64);
 }
 
 // Convert packed double-precision (64-bit) floating-point elements in a to
@@ -4160,9 +4175,10 @@ FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_pi32
 FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a)
 {
-    double a0 = ((double *) &a)[0];
-    double a1 = ((double *) &a)[1];
-    int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1};
+    bit64_union_t a0, a1;
+    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
+    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0.f64, (int32_t) a1.f64};
 
     return vreinterpret_m64_s32(vld1_s32(data));
 }
@@ -4179,8 +4195,9 @@ FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32
 FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a)
 {
-    double ret = *((double *) &a);
-    return (int32_t) ret;
+    bit64_union_t _a;
+    _a.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
+    return (int32_t) _a.f64;
 }
 
 // Convert the lower double-precision (64-bit) floating-point element in a to a
@@ -4191,8 +4208,9 @@ FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
 #if defined(__aarch64__) || defined(_M_ARM64)
     return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0);
 #else
-    double ret = *((double *) &a);
-    return (int64_t) ret;
+    bit64_union_t _a;
+    _a.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
+    return (int64_t) _a.f64;
 #endif
 }
 
@@ -4210,11 +4228,14 @@ FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
     return vreinterpretq_m128d_f64(
         vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
-    double *da = (double *) &a;
-    double *db = (double *) &b;
+    bit64_union_t a0, a1, b0, b1;
+    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
+    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
+    b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
     double c[2];
-    c[0] = da[0] / db[0];
-    c[1] = da[1] / db[1];
+    c[0] = a0.f64 / b0.f64;
+    c[1] = a1.f64 / b1.f64;
     return vld1q_f32((float32_t *) c);
 #endif
 }
@@ -4459,13 +4480,14 @@ FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
         vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #endif
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    bit64_union_t a0, a1, b0, b1;
+    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
+    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
+    b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
     uint64_t d[2];
-    d[0] = (*(double *) &a0) > (*(double *) &b0) ? a0 : b0;
-    d[1] = (*(double *) &a1) > (*(double *) &b1) ? a1 : b1;
+    d[0] = a0.f64 > b0.f64 ? a0.u64 : b0.u64;
+    d[1] = a1.f64 > b1.f64 ? a1.u64 : b1.u64;
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
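[Annotation, not part of the patch] The max/min fallbacks keep SSE's asymmetric semantics: `MAXPD` returns the second operand whenever the comparison is false, ties and NaN cases included, which the ternary `a0.f64 > b0.f64 ? a0.u64 : b0.u64` reproduces; copying `.u64` also keeps the chosen operand's exact bit pattern, -0.0 included. A scalar model of one lane:

    #include <math.h>
    #include <stdio.h>

    /* Scalar model of one MAXPD lane: b wins unless a is strictly greater. */
    static double maxpd_lane(double a, double b)
    {
        return a > b ? a : b;
    }

    int main(void)
    {
        printf("%f\n", maxpd_lane(NAN, 1.0)); /* 1.0: NaN compare is false */
        printf("%f\n", maxpd_lane(1.0, NAN)); /* nan: second operand returned */
        return 0;
    }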
@@ -4520,13 +4542,14 @@ FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
         vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #endif
 #else
-    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
-    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
-    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
-    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    bit64_union_t a0, a1, b0, b1;
+    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
+    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
+    b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
     uint64_t d[2];
-    d[0] = (*(double *) &a0) < (*(double *) &b0) ? a0 : b0;
-    d[1] = (*(double *) &a1) < (*(double *) &b1) ? a1 : b1;
+    d[0] = a0.f64 < b0.f64 ? a0.u64 : b0.u64;
+    d[1] = a1.f64 < b1.f64 ? a1.u64 : b1.u64;
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
 }
@@ -4991,7 +5014,9 @@ FORCE_INLINE __m128d _mm_set1_pd(double d)
 #if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(vdupq_n_f64(d));
 #else
-    return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d));
+    bit64_union_t _d;
+    _d.f64 = d;
+    return vreinterpretq_m128d_s64(vdupq_n_s64(_d.i64));
 #endif
 }
 
@@ -5282,9 +5307,12 @@ FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
 #if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a)));
 #else
-    double a0 = sqrt(((double *) &a)[0]);
-    double a1 = sqrt(((double *) &a)[1]);
-    return _mm_set_pd(a1, a0);
+    bit64_union_t a0, a1;
+    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
+    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    double _a0 = sqrt(a0.f64);
+    double _a1 = sqrt(a1.f64);
+    return _mm_set_pd(_a1, _a0);
 #endif
 }
 
@@ -5297,7 +5325,10 @@ FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b)
 #if defined(__aarch64__) || defined(_M_ARM64)
     return _mm_move_sd(a, _mm_sqrt_pd(b));
 #else
-    return _mm_set_pd(((double *) &a)[1], sqrt(((double *) &b)[0]));
+    bit64_union_t _a, _b;
+    _a.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    _b.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
+    return _mm_set_pd(_a.f64, sqrt(_b.f64));
 #endif
 }
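[Annotation, not part of the patch] `_mm_dp_pd` below is driven entirely by its immediate: bits 4 and 5 select which lane products enter the sum, and bits 0 and 1 then gate where the sum is written. A scalar sketch of that control logic (illustrative names, not from the patch):

    #include <stdio.h>

    /* Scalar model of _mm_dp_pd's immediate handling. */
    static void dp_pd_model(const double a[2], const double b[2], int imm,
                            double out[2])
    {
        double d0 = (imm & 0x10) ? a[0] * b[0] : 0; /* bit 4: include lane 0 */
        double d1 = (imm & 0x20) ? a[1] * b[1] : 0; /* bit 5: include lane 1 */
        double sum = d0 + d1;
        out[0] = (imm & 0x1) ? sum : 0; /* bit 0: write sum to lane 0 */
        out[1] = (imm & 0x2) ? sum : 0; /* bit 1: write sum to lane 1 */
    }

    int main(void)
    {
        double a[2] = {1, 2}, b[2] = {3, 4}, r[2];
        dp_pd_model(a, b, 0x31, r);    /* both products, store to lane 0 only */
        printf("%f %f\n", r[0], r[1]); /* 11.000000 0.000000 */
        return 0;
    }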
@@ -7003,8 +7034,13 @@ FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)
                   vgetq_lane_f64(vreinterpretq_f64_m128d(b), 1)
             : 0;
 #else
-    double d0 = (imm & 0x10) ? ((double *) &a)[0] * ((double *) &b)[0] : 0;
-    double d1 = (imm & 0x20) ? ((double *) &a)[1] * ((double *) &b)[1] : 0;
+    bit64_union_t a0, a1, b0, b1;
+    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
+    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
+    b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
+    double d0 = (imm & 0x10) ? a0.f64 * b0.f64 : 0;
+    double d1 = (imm & 0x20) ? a1.f64 * b1.f64 : 0;
 #endif
     __m128d tmp = _mm_set_pd(d1, d0);
 #endif
@@ -7012,7 +7048,10 @@ FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)
 #if defined(__aarch64__) || defined(_M_ARM64)
     double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp));
 #else
-    double sum = *((double *) &tmp) + *(((double *) &tmp) + 1);
+    bit64_union_t _tmp0, _tmp1;
+    _tmp0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(tmp), 0);
+    _tmp1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(tmp), 1);
+    double sum = _tmp0.f64 + _tmp1.f64;
 #endif
     // Conditionally store the sum
     const __m128d sumMask =
diff --git a/tests/common.h b/tests/common.h
index 163d4e68..668f10ae 100644
--- a/tests/common.h
+++ b/tests/common.h
@@ -64,8 +64,26 @@ enum result_t {
 };
 extern int32_t NaN;
 extern int64_t NaN64;
-#define ALL_BIT_1_32 (*(float *) &NaN)
-#define ALL_BIT_1_64 (*(double *) &NaN64)
+static inline float generate_all_ones_float()
+{
+    union {
+        uint32_t i;
+        float f;
+    } u;
+    u.i = UINT32_MAX;
+    return u.f;
+}
+static inline double generate_all_ones_double()
+{
+    union {
+        uint64_t i;
+        double f;
+    } u;
+    u.i = UINT64_MAX;
+    return u.f;
+}
+#define ALL_BIT_1_32 generate_all_ones_float()
+#define ALL_BIT_1_64 generate_all_ones_double()
 
 template <typename T>
 result_t validate128(T a, T b)
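[Annotation, not part of the patch] The test helpers get the same treatment: `ALL_BIT_1_32`/`ALL_BIT_1_64` now build the all-ones pattern through a union instead of punning the `NaN`/`NaN64` globals through a mismatched pointer. Usage stays identical; a hedged sketch of how a caller might verify the result (the check goes through raw bits, since the all-ones pattern is itself a NaN and never compares equal to itself as a double):

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    /* Same shape as the new generate_all_ones_double() helper. */
    static double all_ones_double(void)
    {
        union {
            uint64_t i;
            double f;
        } u;
        u.i = UINT64_MAX;
        return u.f;
    }

    int main(void)
    {
        double expected = all_ones_double();
        uint64_t bits;
        memcpy(&bits, &expected, sizeof bits);
        assert(bits == UINT64_MAX); /* compare bits, not doubles: NaN != NaN */
        return 0;
    }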
diff --git a/tests/impl.cpp b/tests/impl.cpp
index 74330f5c..11e8fa05 100644
--- a/tests/impl.cpp
+++ b/tests/impl.cpp
@@ -461,7 +461,7 @@ result_t do_mm_store_ps(float *p, float x, float y, float z, float w)
 result_t do_mm_store_ps(int32_t *p, int32_t x, int32_t y, int32_t z, int32_t w)
 {
     __m128i a = _mm_set_epi32(x, y, z, w);
-    _mm_store_ps((float *) p, *(const __m128 *) &a);
+    _mm_store_ps((float *) p, _mm_castsi128_ps(a));
     ASSERT_RETURN(p[0] == w);
     ASSERT_RETURN(p[1] == z);
     ASSERT_RETURN(p[2] == y);
@@ -2519,7 +2519,7 @@ result_t test_mm_or_ps(const SSE2NEONTestImpl &impl, uint32_t iter)
     r[2] = ia[2] | ib[2];
     r[3] = ia[3] | ib[3];
     __m128i ret = do_mm_set_epi32(r[3], r[2], r[1], r[0]);
-    result_t res = VALIDATE_INT32_M128(*(const __m128i *) &c, r);
+    result_t res = VALIDATE_INT32_M128(_mm_castps_si128(c), r);
     if (res) {
         res = VALIDATE_INT32_M128(ret, r);
     }
@@ -2980,7 +2980,7 @@ result_t test_mm_store_ps(const SSE2NEONTestImpl &impl, uint32_t iter)
     int32_t z = impl.mTestInts[iter + 2];
     int32_t w = impl.mTestInts[iter + 3];
     __m128i a = _mm_set_epi32(x, y, z, w);
-    _mm_store_ps((float *) p, *(const __m128 *) &a);
+    _mm_store_ps((float *) p, _mm_castsi128_ps(a));
     ASSERT_RETURN(p[0] == w);
     ASSERT_RETURN(p[1] == z);
     ASSERT_RETURN(p[2] == y);
@@ -3569,7 +3569,7 @@ result_t test_mm_and_si128(const SSE2NEONTestImpl &impl, uint32_t iter)
     const int32_t *_b = impl.mTestIntPointer2;
     __m128i a = load_m128i(_a);
     __m128i b = load_m128i(_b);
-    __m128 fc = _mm_and_ps(*(const __m128 *) &a, *(const __m128 *) &b);
+    __m128 fc = _mm_and_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b));
     __m128i c = *(const __m128i *) &fc;
     // now for the assertion...
     const uint32_t *ia = (const uint32_t *) &a;
@@ -3612,7 +3612,7 @@ result_t test_mm_andnot_si128(const SSE2NEONTestImpl &impl, uint32_t iter)
     const int32_t *_b = impl.mTestIntPointer2;
     __m128i a = load_m128i(_a);
     __m128i b = load_m128i(_b);
-    __m128 fc = _mm_andnot_ps(*(const __m128 *) &a, *(const __m128 *) &b);
+    __m128 fc = _mm_andnot_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b));
     __m128i c = *(const __m128i *) &fc;
     // now for the assertion...
     const uint32_t *ia = (const uint32_t *) &a;
@@ -5521,7 +5521,7 @@ result_t test_mm_or_si128(const SSE2NEONTestImpl &impl, uint32_t iter)
     const int32_t *_b = impl.mTestIntPointer2;
     __m128i a = load_m128i(_a);
     __m128i b = load_m128i(_b);
-    __m128 fc = _mm_or_ps(*(const __m128 *) &a, *(const __m128 *) &b);
+    __m128 fc = _mm_or_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b));
     __m128i c = *(const __m128i *) &fc;
     // now for the assertion...
     const uint32_t *ia = (const uint32_t *) &a;
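[Annotation, not part of the patch] The test-suite hunks replace `*(const __m128 *) &a` with `_mm_castsi128_ps(a)`: the cast intrinsics are the sanctioned zero-cost way to reinterpret a vector register between element types, and sse2neon maps them onto NEON's `vreinterpretq_*` family. A small usage sketch (compiles against sse2neon on ARM, or against `<emmintrin.h>` on x86):

    #include <emmintrin.h> /* or "sse2neon.h" on ARM */

    /* Reinterpret integer lanes as float lanes without going through memory
     * or an aliasing-violating pointer cast; no instruction is emitted. */
    static __m128 int_bits_as_float(__m128i v)
    {
        return _mm_castsi128_ps(v);
    }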