Skip to content

Commit

Permalink
refactor: Use fesetround() and fegetround()
Browse files Browse the repository at this point in the history
  • Loading branch information
howjmay committed Sep 22, 2024
1 parent d1fe9f2 commit 74da60f
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 193 deletions.
239 changes: 48 additions & 191 deletions sse2neon.h
Original file line number Diff line number Diff line change
Expand Up @@ -114,14 +114,14 @@
#warning "Optimization may cause potential errors in sse2neon. see #648"
#endif


/* C language does not allow initializing a variable with a function call. */
#ifdef __cplusplus
#define _sse2neon_const static const
#else
#define _sse2neon_const const
#endif

#include <fenv.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
Expand Down Expand Up @@ -193,10 +193,7 @@ FORCE_INLINE int64_t sse2neon_recast_f64_s64(double f64)
#define _sse2neon_return(ret) return ret
#endif

#define _sse2neon_init(...) \
{ \
__VA_ARGS__ \
}
#define _sse2neon_init(...) {__VA_ARGS__}

/* Compiler barrier */
#if defined(_MSC_VER) && !defined(__clang__)
Expand Down Expand Up @@ -568,12 +565,6 @@ typedef union ALIGN_STRUCT(16) SIMDVec {
#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])
#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n])

/* SSE macros */
#define _MM_GET_FLUSH_ZERO_MODE _sse2neon_mm_get_flush_zero_mode
#define _MM_SET_FLUSH_ZERO_MODE _sse2neon_mm_set_flush_zero_mode
#define _MM_GET_DENORMALS_ZERO_MODE _sse2neon_mm_get_denormals_zero_mode
#define _MM_SET_DENORMALS_ZERO_MODE _sse2neon_mm_set_denormals_zero_mode

// Function declaration
// SSE
FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void);
Expand Down Expand Up @@ -1806,32 +1797,8 @@ FORCE_INLINE void _sse2neon_set_fpcr(uint64_t value)
#if defined(_MSC_VER) && !defined(__clang__)
_WriteStatusReg(ARM64_FPCR, value);
#else
__asm__ __volatile__("msr FPCR, %0" ::"r"(value)); /* write */
#endif
}

// Macro: Get the flush zero bits from the MXCSR control and status register.
// The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or
// _MM_FLUSH_ZERO_OFF
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE
FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode(void)
{
union {
fpcr_bitfield field;
#if defined(__aarch64__) || defined(_M_ARM64)
uint64_t value;
#else
uint32_t value;
#endif
} r;

#if defined(__aarch64__) || defined(_M_ARM64)
r.value = _sse2neon_get_fpcr();
#else
__asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
__asm__ __volatile__("msr FPCR, %0" ::"r"(value)); /* write */
#endif

return r.field.bit24 ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF;
}

// Macro: Get the rounding mode bits from the MXCSR control and status register.
Expand All @@ -1840,25 +1807,17 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode(void)
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE
FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void)
{
union {
fpcr_bitfield field;
#if defined(__aarch64__) || defined(_M_ARM64)
uint64_t value;
#else
uint32_t value;
#endif
} r;

#if defined(__aarch64__) || defined(_M_ARM64)
r.value = _sse2neon_get_fpcr();
#else
__asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
#endif

if (r.field.bit22) {
return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP;
} else {
return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST;
switch (fegetround()) {
case FE_TONEAREST:
return _MM_ROUND_NEAREST;
case FE_DOWNWARD:
return _MM_ROUND_DOWN;
case FE_UPWARD:
return _MM_ROUND_UP;
case FE_TOWARDZERO:
return _MM_ROUND_TOWARD_ZERO;
default: // FIXME
return _MM_ROUND_TOWARD_ZERO;
}
}

Expand Down Expand Up @@ -2398,38 +2357,6 @@ FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
vset_lane_u16((int) vget_lane_u64(t, 0), vdup_n_u16(0), 0));
}

// Macro: Set the flush zero bits of the MXCSR control and status register to
// the value in unsigned 32-bit integer a. The flush zero may contain any of the
// following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE
FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)
{
// AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
// regardless of the value of the FZ bit.
union {
fpcr_bitfield field;
#if defined(__aarch64__) || defined(_M_ARM64)
uint64_t value;
#else
uint32_t value;
#endif
} r;

#if defined(__aarch64__) || defined(_M_ARM64)
r.value = _sse2neon_get_fpcr();
#else
__asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
#endif

r.field.bit24 = (flag & _MM_FLUSH_ZERO_MASK) == _MM_FLUSH_ZERO_ON;

#if defined(__aarch64__) || defined(_M_ARM64)
_sse2neon_set_fpcr(r.value);
#else
__asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
#endif
}

// Set packed single-precision (32-bit) floating-point elements in dst with the
// supplied values.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps
Expand All @@ -2454,44 +2381,23 @@ FORCE_INLINE __m128 _mm_set_ps1(float _w)
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE
FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
{
union {
fpcr_bitfield field;
#if defined(__aarch64__) || defined(_M_ARM64)
uint64_t value;
#else
uint32_t value;
#endif
} r;

#if defined(__aarch64__) || defined(_M_ARM64)
r.value = _sse2neon_get_fpcr();
#else
__asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
#endif

switch (rounding) {
case _MM_ROUND_TOWARD_ZERO:
r.field.bit22 = 1;
r.field.bit23 = 1;
case _MM_ROUND_NEAREST:
rounding = FE_TONEAREST;
break;
case _MM_ROUND_DOWN:
r.field.bit22 = 0;
r.field.bit23 = 1;
rounding = FE_DOWNWARD;
break;
case _MM_ROUND_UP:
r.field.bit22 = 1;
r.field.bit23 = 0;
rounding = FE_UPWARD;
break;
default: //_MM_ROUND_NEAREST
r.field.bit22 = 0;
r.field.bit23 = 0;
case _MM_ROUND_TOWARD_ZERO:
rounding = FE_TOWARDZERO;
break;
default: // FIXME
rounding = FE_TOWARDZERO;
}

#if defined(__aarch64__) || defined(_M_ARM64)
_sse2neon_set_fpcr(r.value);
#else
__asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
#endif
fesetround(rounding);
}

// Copy single-precision (32-bit) floating-point element a to the lower element
Expand Down Expand Up @@ -4990,11 +4896,11 @@ FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
signed char b1,
signed char b0)
{
int8_t ALIGN_STRUCT(16)
data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
(int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
(int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
(int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
int8_t ALIGN_STRUCT(16) data[16] = {
(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
(int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
(int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
(int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
return (__m128i) vld1q_s8(data);
}

Expand Down Expand Up @@ -5125,11 +5031,11 @@ FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
signed char b14,
signed char b15)
{
int8_t ALIGN_STRUCT(16)
data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
(int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
(int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
(int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
int8_t ALIGN_STRUCT(16) data[16] = {
(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
(int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
(int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
(int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
return (__m128i) vld1q_s8(data);
}

Expand Down Expand Up @@ -6282,7 +6188,7 @@ FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
uint8x8_t tmp_low; \
uint8x8_t tmp_high; \
if ((imm) >= 8) { \
const int idx = (imm) -8; \
const int idx = (imm) - 8; \
tmp_low = vreinterpret_u8_m64(_a); \
tmp_high = vdup_n_u8(0); \
ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
Expand Down Expand Up @@ -6803,14 +6709,14 @@ FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
_sse2neon_define2( \
__m128i, a, b, \
const uint16_t _mask[8] = \
_sse2neon_init(((imm) & (1 << 0)) ? (uint16_t) -1 : 0x0, \
((imm) & (1 << 1)) ? (uint16_t) -1 : 0x0, \
((imm) & (1 << 2)) ? (uint16_t) -1 : 0x0, \
((imm) & (1 << 3)) ? (uint16_t) -1 : 0x0, \
((imm) & (1 << 4)) ? (uint16_t) -1 : 0x0, \
((imm) & (1 << 5)) ? (uint16_t) -1 : 0x0, \
((imm) & (1 << 6)) ? (uint16_t) -1 : 0x0, \
((imm) & (1 << 7)) ? (uint16_t) -1 : 0x0); \
_sse2neon_init(((imm) & (1 << 0)) ? (uint16_t) - 1 : 0x0, \
((imm) & (1 << 1)) ? (uint16_t) - 1 : 0x0, \
((imm) & (1 << 2)) ? (uint16_t) - 1 : 0x0, \
((imm) & (1 << 3)) ? (uint16_t) - 1 : 0x0, \
((imm) & (1 << 4)) ? (uint16_t) - 1 : 0x0, \
((imm) & (1 << 5)) ? (uint16_t) - 1 : 0x0, \
((imm) & (1 << 6)) ? (uint16_t) - 1 : 0x0, \
((imm) & (1 << 7)) ? (uint16_t) - 1 : 0x0); \
uint16x8_t _mask_vec = vld1q_u16(_mask); \
uint16x8_t __a = vreinterpretq_u16_m128i(_a); \
uint16x8_t __b = vreinterpretq_u16_m128i(_b); _sse2neon_return( \
Expand All @@ -6835,11 +6741,11 @@ FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps
FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)
{
const uint32_t ALIGN_STRUCT(16)
data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,
((imm8) & (1 << 1)) ? UINT32_MAX : 0,
((imm8) & (1 << 2)) ? UINT32_MAX : 0,
((imm8) & (1 << 3)) ? UINT32_MAX : 0};
const uint32_t
ALIGN_STRUCT(16) data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,
((imm8) & (1 << 1)) ? UINT32_MAX : 0,
((imm8) & (1 << 2)) ? UINT32_MAX : 0,
((imm8) & (1 << 3)) ? UINT32_MAX : 0};
uint32x4_t mask = vld1q_u32(data);
float32x4_t a = vreinterpretq_f32_m128(_a);
float32x4_t b = vreinterpretq_f32_m128(_b);
Expand Down Expand Up @@ -9261,26 +9167,6 @@ FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
}
}

FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode(void)
{
union {
fpcr_bitfield field;
#if defined(__aarch64__) || defined(_M_ARM64)
uint64_t value;
#else
uint32_t value;
#endif
} r;

#if defined(__aarch64__) || defined(_M_ARM64)
r.value = _sse2neon_get_fpcr();
#else
__asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
#endif

return r.field.bit24 ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF;
}

// Count the number of bits set to 1 in unsigned 32-bit integer a, and
// return that count in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u32
Expand Down Expand Up @@ -9340,35 +9226,6 @@ FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
#endif
}

FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(
unsigned int flag)
{
// AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
// regardless of the value of the FZ bit.
union {
fpcr_bitfield field;
#if defined(__aarch64__) || defined(_M_ARM64)
uint64_t value;
#else
uint32_t value;
#endif
} r;

#if defined(__aarch64__) || defined(_M_ARM64)
r.value = _sse2neon_get_fpcr();
#else
__asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
#endif

r.field.bit24 = (flag & _MM_DENORMALS_ZERO_MASK) == _MM_DENORMALS_ZERO_ON;

#if defined(__aarch64__) || defined(_M_ARM64)
_sse2neon_set_fpcr(r.value);
#else
__asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
#endif
}

// Return the current 64-bit value of the processor's time-stamp counter.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc
FORCE_INLINE uint64_t _rdtsc(void)
Expand Down
6 changes: 4 additions & 2 deletions tests/impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4793,7 +4793,8 @@ result_t test_mm_cvttpd_epi32(const SSE2NEONTestImpl &impl, uint32_t iter)
return validateInt32(ret, d0, d1, 0, 0);
}

OPTNONE result_t test_mm_cvttpd_pi32(const SSE2NEONTestImpl &impl, uint32_t iter)
OPTNONE result_t test_mm_cvttpd_pi32(const SSE2NEONTestImpl &impl,
uint32_t iter)
{
const double *_a = (const double *) impl.mTestFloatPointer1;

Expand Down Expand Up @@ -5877,7 +5878,7 @@ result_t test_mm_shuffle_epi32(const SSE2NEONTestImpl &impl, uint32_t iter)
int32_t _d[4];

#define TEST_IMPL(IDX) \
_d[0] = _a[((IDX) &0x3)]; \
_d[0] = _a[((IDX) & 0x3)]; \
_d[1] = _a[((IDX >> 2) & 0x3)]; \
_d[2] = _a[((IDX >> 4) & 0x3)]; \
_d[3] = _a[((IDX >> 6) & 0x3)]; \
Expand Down Expand Up @@ -8957,6 +8958,7 @@ OPTNONE result_t test_mm_round_pd(const SSE2NEONTestImpl &impl, uint32_t iter)
__m128d ret;

__m128d a = load_m128d(_a);

switch (iter & 0x7) {
case 0:
d[0] = bankersRounding(_a[0]);
Expand Down

0 comments on commit 74da60f

Please sign in to comment.