diff --git a/sse2neon.h b/sse2neon.h index 8ecfc753..a33ef301 100644 --- a/sse2neon.h +++ b/sse2neon.h @@ -2444,7 +2444,7 @@ FORCE_INLINE __m128 _mm_set_ps1(float _w) // the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, // _MM_ROUND_TOWARD_ZERO // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE -FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding) +FORCE_INLINE_OPTNONE void _MM_SET_ROUNDING_MODE(int rounding) { union { fpcr_bitfield field; @@ -4158,7 +4158,7 @@ FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a) // Convert packed double-precision (64-bit) floating-point elements in a to // packed 32-bit integers with truncation, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_pi32 -FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a) +FORCE_INLINE_OPTNONE __m64 _mm_cvttpd_pi32(__m128d a) { double a0 = ((double *) &a)[0]; double a1 = ((double *) &a)[1]; @@ -9219,7 +9219,7 @@ FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a) #endif } -FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag) +FORCE_INLINE_OPTNONE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag) { // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting, // regardless of the value of the FZ bit. 
diff --git a/tests/common.h b/tests/common.h index 163d4e68..fe8cb0b3 100644 --- a/tests/common.h +++ b/tests/common.h @@ -67,6 +67,16 @@ extern int64_t NaN64; #define ALL_BIT_1_32 (*(float *) &NaN) #define ALL_BIT_1_64 (*(double *) &NaN64) +#if defined(__GNUC__) && !defined(__clang__) +#pragma push_macro("OPTNONE") +#define OPTNONE __attribute__((optimize("O0"))) +#elif defined(__clang__) +#pragma push_macro("OPTNONE") +#define OPTNONE __attribute__((optnone)) +#else +#define OPTNONE +#endif + template <typename T> result_t validate128(T a, T b) { diff --git a/tests/impl.cpp b/tests/impl.cpp index 74330f5c..48a5398c 100644 --- a/tests/impl.cpp +++ b/tests/impl.cpp @@ -2751,7 +2751,7 @@ result_t test_mm_set_ps1(const SSE2NEONTestImpl &impl, uint32_t iter) return validateFloat(ret, a, a, a, a); } -result_t test_mm_set_rounding_mode(const SSE2NEONTestImpl &impl, uint32_t iter) +OPTNONE result_t test_mm_set_rounding_mode(const SSE2NEONTestImpl &impl, uint32_t iter) { const float *_a = impl.mTestFloatPointer1; result_t res_toward_zero, res_to_neg_inf, res_to_pos_inf, res_nearest; @@ -4444,7 +4444,7 @@ result_t test_mm_cvtepi32_ps(const SSE2NEONTestImpl &impl, uint32_t iter) return validateFloat(ret, trun[0], trun[1], trun[2], trun[3]); } -result_t test_mm_cvtpd_epi32(const SSE2NEONTestImpl &impl, uint32_t iter) +OPTNONE result_t test_mm_cvtpd_epi32(const SSE2NEONTestImpl &impl, uint32_t iter) { const double *_a = (const double *) impl.mTestFloatPointer1; int32_t d[2] = {}; @@ -8425,7 +8425,7 @@ result_t test_mm_cvtepu8_epi64(const SSE2NEONTestImpl &impl, uint32_t iter) MM_DP_PD_TEST_CASE_WITH(0x22); \ MM_DP_PD_TEST_CASE_WITH(0x23); -result_t test_mm_dp_pd(const SSE2NEONTestImpl &impl, uint32_t iter) +OPTNONE result_t test_mm_dp_pd(const SSE2NEONTestImpl &impl, uint32_t iter) { GENERATE_MM_DP_PD_TEST_CASES return TEST_SUCCESS; @@ -8460,7 +8460,7 @@ result_t test_mm_dp_pd(const SSE2NEONTestImpl &impl, uint32_t iter) MM_DP_PS_TEST_CASE_WITH(0x23); \ MM_DP_PS_TEST_CASE_WITH(0xB5); 
-result_t test_mm_dp_ps(const SSE2NEONTestImpl &impl, uint32_t iter) +OPTNONE result_t test_mm_dp_ps(const SSE2NEONTestImpl &impl, uint32_t iter) { GENERATE_MM_DP_PS_TEST_CASES return TEST_SUCCESS; @@ -11819,8 +11819,8 @@ result_t test_mm_popcnt_u64(const SSE2NEONTestImpl &impl, uint32_t iter) return TEST_SUCCESS; } -result_t test_mm_set_denormals_zero_mode(const SSE2NEONTestImpl &impl, - uint32_t iter) +OPTNONE result_t test_mm_set_denormals_zero_mode(const SSE2NEONTestImpl &impl, + uint32_t iter) { result_t res_set_denormals_zero_on, res_set_denormals_zero_off; float factor = 2;