diff --git a/sse2neon.h b/sse2neon.h
index 8ecfc753..a33ef301 100644
--- a/sse2neon.h
+++ b/sse2neon.h
@@ -2444,7 +2444,7 @@ FORCE_INLINE __m128 _mm_set_ps1(float _w)
 // the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
 // _MM_ROUND_TOWARD_ZERO
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE
-FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
+FORCE_INLINE_OPTNONE void _MM_SET_ROUNDING_MODE(int rounding)
 {
     union {
         fpcr_bitfield field;
@@ -4158,7 +4158,7 @@ FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
 // Convert packed double-precision (64-bit) floating-point elements in a to
 // packed 32-bit integers with truncation, and store the results in dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_pi32
-FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a)
+FORCE_INLINE_OPTNONE __m64 _mm_cvttpd_pi32(__m128d a)
 {
     double a0 = ((double *) &a)[0];
     double a1 = ((double *) &a)[1];
@@ -9219,7 +9219,7 @@ FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
 #endif
 }
 
-FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
+FORCE_INLINE_OPTNONE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
 {
     // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
     // regardless of the value of the FZ bit.
diff --git a/tests/common.h b/tests/common.h
index 163d4e68..fe8cb0b3 100644
--- a/tests/common.h
+++ b/tests/common.h
@@ -67,6 +67,16 @@ extern int64_t NaN64;
 #define ALL_BIT_1_32 (*(float *) &NaN)
 #define ALL_BIT_1_64 (*(double *) &NaN64)
 
+#if defined(__GNUC__) && !defined(__clang__)  // OPTNONE for GCC (clang excluded here)
+#pragma push_macro("OPTNONE")  // save any earlier OPTNONE definition
+#define OPTNONE __attribute__((optimize("O0")))  // GCC: build the function at O0
+#elif defined(__clang__)  // clang uses its own attribute spelling
+#pragma push_macro("OPTNONE")  // save any earlier OPTNONE definition
+#define OPTNONE __attribute__((optnone))  // clang: do not optimize the function
+#else  // any other compiler
+#define OPTNONE  // expands to nothing
+#endif  // NOTE(review): no matching pop_macro("OPTNONE") visible in this hunk - confirm
+
 template <typename T>
 result_t validate128(T a, T b)
 {
diff --git a/tests/impl.cpp b/tests/impl.cpp
index 74330f5c..48a5398c 100644
--- a/tests/impl.cpp
+++ b/tests/impl.cpp
@@ -2751,7 +2751,7 @@ result_t test_mm_set_ps1(const SSE2NEONTestImpl &impl, uint32_t iter)
     return validateFloat(ret, a, a, a, a);
 }
 
-result_t test_mm_set_rounding_mode(const SSE2NEONTestImpl &impl, uint32_t iter)
+OPTNONE result_t test_mm_set_rounding_mode(const SSE2NEONTestImpl &impl, uint32_t iter)
 {
     const float *_a = impl.mTestFloatPointer1;
     result_t res_toward_zero, res_to_neg_inf, res_to_pos_inf, res_nearest;
@@ -4444,7 +4444,7 @@ result_t test_mm_cvtepi32_ps(const SSE2NEONTestImpl &impl, uint32_t iter)
     return validateFloat(ret, trun[0], trun[1], trun[2], trun[3]);
 }
 
-result_t test_mm_cvtpd_epi32(const SSE2NEONTestImpl &impl, uint32_t iter)
+OPTNONE result_t test_mm_cvtpd_epi32(const SSE2NEONTestImpl &impl, uint32_t iter)
 {
     const double *_a = (const double *) impl.mTestFloatPointer1;
     int32_t d[2] = {};
@@ -8425,7 +8425,7 @@ result_t test_mm_cvtepu8_epi64(const SSE2NEONTestImpl &impl, uint32_t iter)
     MM_DP_PD_TEST_CASE_WITH(0x22);   \
     MM_DP_PD_TEST_CASE_WITH(0x23);
 
-result_t test_mm_dp_pd(const SSE2NEONTestImpl &impl, uint32_t iter)
+OPTNONE result_t test_mm_dp_pd(const SSE2NEONTestImpl &impl, uint32_t iter)
 {
     GENERATE_MM_DP_PD_TEST_CASES
     return TEST_SUCCESS;
@@ -8460,7 +8460,7 @@ result_t test_mm_dp_pd(const SSE2NEONTestImpl &impl, uint32_t iter)
     MM_DP_PS_TEST_CASE_WITH(0x23);   \
     MM_DP_PS_TEST_CASE_WITH(0xB5);
 
-result_t test_mm_dp_ps(const SSE2NEONTestImpl &impl, uint32_t iter)
+OPTNONE result_t test_mm_dp_ps(const SSE2NEONTestImpl &impl, uint32_t iter)
 {
     GENERATE_MM_DP_PS_TEST_CASES
     return TEST_SUCCESS;
@@ -11819,8 +11819,8 @@ result_t test_mm_popcnt_u64(const SSE2NEONTestImpl &impl, uint32_t iter)
     return TEST_SUCCESS;
 }
 
-result_t test_mm_set_denormals_zero_mode(const SSE2NEONTestImpl &impl,
-                                         uint32_t iter)
+OPTNONE result_t test_mm_set_denormals_zero_mode(const SSE2NEONTestImpl &impl,
+                                                 uint32_t iter)
 {
     result_t res_set_denormals_zero_on, res_set_denormals_zero_off;
     float factor = 2;