From c917fce25418217312fffb58fe7e96da4a6a5e9c Mon Sep 17 00:00:00 2001 From: Yang Hau Date: Sun, 22 Sep 2024 03:21:02 +0200 Subject: [PATCH] refactor: Use fesetround() and fegetround() Setting/getting roudning mode directly through fpcr with volatile keyword could be unstable. Therefore we use the C99 fesetround()/fegetround() here to ensure the behavior. --- sse2neon.h | 76 +++++++++++++++----------------------------------- tests/impl.cpp | 4 ++- 2 files changed, 26 insertions(+), 54 deletions(-) diff --git a/sse2neon.h b/sse2neon.h index c2228667..1bc4e92b 100644 --- a/sse2neon.h +++ b/sse2neon.h @@ -114,7 +114,6 @@ #warning "Optimization may cause potential errors in sse2neon. see #648" #endif - /* C language does not allow initializing a variable with a function call. */ #ifdef __cplusplus #define _sse2neon_const static const @@ -122,6 +121,7 @@ #define _sse2neon_const const #endif +#include #include #include #include @@ -1840,25 +1840,17 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode(void) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void) { - union { - fpcr_bitfield field; -#if defined(__aarch64__) || defined(_M_ARM64) - uint64_t value; -#else - uint32_t value; -#endif - } r; - -#if defined(__aarch64__) || defined(_M_ARM64) - r.value = _sse2neon_get_fpcr(); -#else - __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ -#endif - - if (r.field.bit22) { - return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP; - } else { - return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST; + switch (fegetround()) { + case FE_TONEAREST: + return _MM_ROUND_NEAREST; + case FE_DOWNWARD: + return _MM_ROUND_DOWN; + case FE_UPWARD: + return _MM_ROUND_UP; + case FE_TOWARDZERO: + return _MM_ROUND_TOWARD_ZERO; + default: // FIXME + return _MM_ROUND_TOWARD_ZERO; } } @@ -2454,44 +2446,23 @@ FORCE_INLINE __m128 _mm_set_ps1(float _w) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding) { - union { - fpcr_bitfield field; -#if defined(__aarch64__) || defined(_M_ARM64) - uint64_t value; -#else - uint32_t value; -#endif - } r; - -#if defined(__aarch64__) || defined(_M_ARM64) - r.value = _sse2neon_get_fpcr(); -#else - __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ -#endif - switch (rounding) { - case _MM_ROUND_TOWARD_ZERO: - r.field.bit22 = 1; - r.field.bit23 = 1; + case _MM_ROUND_NEAREST: + rounding = FE_TONEAREST; break; case _MM_ROUND_DOWN: - r.field.bit22 = 0; - r.field.bit23 = 1; + rounding = FE_DOWNWARD; break; case _MM_ROUND_UP: - r.field.bit22 = 1; - r.field.bit23 = 0; + rounding = FE_UPWARD; break; - default: //_MM_ROUND_NEAREST - r.field.bit22 = 0; - r.field.bit23 = 0; + case _MM_ROUND_TOWARD_ZERO: + rounding = FE_TOWARDZERO; + break; + default: // FIXME + rounding = FE_TOWARDZERO; } - -#if defined(__aarch64__) || defined(_M_ARM64) - _sse2neon_set_fpcr(r.value); -#else - __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */ -#endif + fesetround(rounding); } // Copy single-precision (32-bit) floating-point element a to the lower element @@ -9340,8 +9311,7 @@ FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a) #endif } -FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode( - unsigned int flag) +FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag) { // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting, // regardless of the value of the FZ bit. diff --git a/tests/impl.cpp b/tests/impl.cpp index f29a4ae0..2cc09e30 100644 --- a/tests/impl.cpp +++ b/tests/impl.cpp @@ -4793,7 +4793,8 @@ result_t test_mm_cvttpd_epi32(const SSE2NEONTestImpl &impl, uint32_t iter) return validateInt32(ret, d0, d1, 0, 0); } -OPTNONE result_t test_mm_cvttpd_pi32(const SSE2NEONTestImpl &impl, uint32_t iter) +OPTNONE result_t test_mm_cvttpd_pi32(const SSE2NEONTestImpl &impl, + uint32_t iter) { const double *_a = (const double *) impl.mTestFloatPointer1; @@ -8957,6 +8958,7 @@ OPTNONE result_t test_mm_round_pd(const SSE2NEONTestImpl &impl, uint32_t iter) __m128d ret; __m128d a = load_m128d(_a); + switch (iter & 0x7) { case 0: d[0] = bankersRounding(_a[0]);