Skip to content

Commit

Permalink
Merge pull request #642 from howjmay/test-opt
Browse files Browse the repository at this point in the history
Allow optimization and use fesetround(), fegetround()
  • Loading branch information
jserv authored Oct 8, 2024
2 parents 706d3b5 + 03f1e3c commit d1562a7
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 76 deletions.
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,15 @@ $ make \

Check the details via [Test Suite for SSE2NEON](tests/README.md).

### Optimization

The SSE2NEON project is designed with performance-sensitive scenarios in mind, so it does not disable compiler optimizations around individual intrinsics. As a result, building with optimization levels such as `-O1` or `-O2` can lead to misbehavior under specific circumstances. For example, frequent changes to the rounding mode or repeated calls to `_MM_SET_DENORMALS_ZERO_MODE()` may introduce unintended behavior.

Disabling optimizations for specific intrinsics could resolve these edge cases, but it may negatively impact general performance. Therefore, we have decided to prioritize performance and shift the responsibility for handling such edge cases to developers.

It is important to be aware of these potential pitfalls when enabling optimizations and ensure that your code accounts for these scenarios if necessary.


## Adoptions
Here is a partial list of open source projects that have adopted `sse2neon` for Arm/Aarch64 support.
* [Aaru Data Preservation Suite](https://www.aaru.app/) is a fully-featured software package to preserve all storage media from the very old to the cutting edge, as well as to give detailed information about any supported image file (whether from Aaru or not) and to extract the files from those images.
Expand Down
114 changes: 42 additions & 72 deletions sse2neon.h
Original file line number Diff line number Diff line change
Expand Up @@ -106,28 +106,23 @@
#pragma message("Macro name collisions may happen with unsupported compilers.")
#endif


#if defined(__GNUC__) && !defined(__clang__)
#pragma push_macro("FORCE_INLINE_OPTNONE")
#define FORCE_INLINE_OPTNONE static inline __attribute__((optimize("O0")))
#elif defined(__clang__)
#pragma push_macro("FORCE_INLINE_OPTNONE")
#define FORCE_INLINE_OPTNONE static inline __attribute__((optnone))
#else
#define FORCE_INLINE_OPTNONE FORCE_INLINE
#endif

#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 10
#warning "GCC versions earlier than 10 are not supported."
#endif

#ifdef __OPTIMIZE__
#warning \
"Report any potential compiler optimization issues when using SSE2NEON. See the 'Optimization' section at https://github.com/DLTcollab/sse2neon."
#endif

/* C does not allow initializing a variable with static storage duration with a
 * function call, so `static` is added only when compiling as C++. */
#ifdef __cplusplus
#define _sse2neon_const static const
#else
#define _sse2neon_const const
#endif

#include <fenv.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
Expand Down Expand Up @@ -604,8 +599,8 @@ FORCE_INLINE __m128d _mm_ceil_pd(__m128d);
FORCE_INLINE __m128 _mm_ceil_ps(__m128);
FORCE_INLINE __m128d _mm_floor_pd(__m128d);
FORCE_INLINE __m128 _mm_floor_ps(__m128);
FORCE_INLINE_OPTNONE __m128d _mm_round_pd(__m128d, int);
FORCE_INLINE_OPTNONE __m128 _mm_round_ps(__m128, int);
FORCE_INLINE __m128d _mm_round_pd(__m128d, int);
FORCE_INLINE __m128 _mm_round_ps(__m128, int);
// SSE4.2
FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t);

Expand Down Expand Up @@ -1846,25 +1841,20 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode(void)
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE
FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void)
{
union {
fpcr_bitfield field;
#if defined(__aarch64__) || defined(_M_ARM64)
uint64_t value;
#else
uint32_t value;
#endif
} r;

#if defined(__aarch64__) || defined(_M_ARM64)
r.value = _sse2neon_get_fpcr();
#else
__asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
#endif

if (r.field.bit22) {
return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP;
} else {
return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST;
switch (fegetround()) {
case FE_TONEAREST:
return _MM_ROUND_NEAREST;
case FE_DOWNWARD:
return _MM_ROUND_DOWN;
case FE_UPWARD:
return _MM_ROUND_UP;
case FE_TOWARDZERO:
return _MM_ROUND_TOWARD_ZERO;
default:
// fegetround() returns FE_TONEAREST, FE_DOWNWARD, FE_UPWARD or
// FE_TOWARDZERO on success. Any other (error) value is treated as
// FE_TOWARDZERO (truncate).
return _MM_ROUND_TOWARD_ZERO;
}
}

Expand Down Expand Up @@ -2458,46 +2448,28 @@ FORCE_INLINE __m128 _mm_set_ps1(float _w)
// the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
// _MM_ROUND_TOWARD_ZERO
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE
FORCE_INLINE_OPTNONE void _MM_SET_ROUNDING_MODE(int rounding)
FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
{
union {
fpcr_bitfield field;
#if defined(__aarch64__) || defined(_M_ARM64)
uint64_t value;
#else
uint32_t value;
#endif
} r;

#if defined(__aarch64__) || defined(_M_ARM64)
r.value = _sse2neon_get_fpcr();
#else
__asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
#endif

switch (rounding) {
case _MM_ROUND_TOWARD_ZERO:
r.field.bit22 = 1;
r.field.bit23 = 1;
case _MM_ROUND_NEAREST:
rounding = FE_TONEAREST;
break;
case _MM_ROUND_DOWN:
r.field.bit22 = 0;
r.field.bit23 = 1;
rounding = FE_DOWNWARD;
break;
case _MM_ROUND_UP:
r.field.bit22 = 1;
r.field.bit23 = 0;
rounding = FE_UPWARD;
break;
default: //_MM_ROUND_NEAREST
r.field.bit22 = 0;
r.field.bit23 = 0;
case _MM_ROUND_TOWARD_ZERO:
rounding = FE_TOWARDZERO;
break;
default:
// rounding must be one of _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP
// or _MM_ROUND_TOWARD_ZERO. Any other (invalid) value is treated as
// FE_TOWARDZERO (truncate).
rounding = FE_TOWARDZERO;
}

#if defined(__aarch64__) || defined(_M_ARM64)
_sse2neon_set_fpcr(r.value);
#else
__asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
#endif
fesetround(rounding);
}

// Copy single-precision (32-bit) floating-point element a to the lower element
Expand Down Expand Up @@ -3899,7 +3871,7 @@ FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
// Convert packed double-precision (64-bit) floating-point elements in a to
// packed 32-bit integers, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32
FORCE_INLINE_OPTNONE __m128i _mm_cvtpd_epi32(__m128d a)
FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)
{
// vrnd32xq_f64 not supported on clang
#if defined(__ARM_FEATURE_FRINT) && !defined(__clang__)
Expand All @@ -3921,7 +3893,7 @@ FORCE_INLINE_OPTNONE __m128i _mm_cvtpd_epi32(__m128d a)
// Convert packed double-precision (64-bit) floating-point elements in a to
// packed 32-bit integers, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_pi32
FORCE_INLINE_OPTNONE __m64 _mm_cvtpd_pi32(__m128d a)
FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)
{
__m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
double d0, d1;
Expand Down Expand Up @@ -4217,7 +4189,7 @@ FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
// Convert packed double-precision (64-bit) floating-point elements in a to
// packed 32-bit integers with truncation, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_pi32
FORCE_INLINE_OPTNONE __m64 _mm_cvttpd_pi32(__m128d a)
FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a)
{
double a0, a1;
a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
Expand Down Expand Up @@ -7559,7 +7531,7 @@ FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
// the rounding parameter, and store the results as packed double-precision
// floating-point elements in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd
FORCE_INLINE_OPTNONE __m128d _mm_round_pd(__m128d a, int rounding)
FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding)
{
#if defined(__aarch64__) || defined(_M_ARM64)
switch (rounding) {
Expand Down Expand Up @@ -7628,7 +7600,7 @@ FORCE_INLINE_OPTNONE __m128d _mm_round_pd(__m128d a, int rounding)
// the rounding parameter, and store the results as packed single-precision
// floating-point elements in dst.
// software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps
FORCE_INLINE_OPTNONE __m128 _mm_round_ps(__m128 a, int rounding)
FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
{
#if (defined(__aarch64__) || defined(_M_ARM64)) || \
defined(__ARM_FEATURE_DIRECTED_ROUNDING)
Expand Down Expand Up @@ -9346,8 +9318,7 @@ FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
#endif
}

FORCE_INLINE_OPTNONE void _sse2neon_mm_set_denormals_zero_mode(
unsigned int flag)
FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
{
// AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
// regardless of the value of the FZ bit.
Expand Down Expand Up @@ -9419,7 +9390,6 @@ FORCE_INLINE uint64_t _rdtsc(void)
#if defined(__GNUC__) || defined(__clang__)
#pragma pop_macro("ALIGN_STRUCT")
#pragma pop_macro("FORCE_INLINE")
#pragma pop_macro("FORCE_INLINE_OPTNONE")
#endif

#if defined(__GNUC__) && !defined(__clang__)
Expand Down
9 changes: 5 additions & 4 deletions tests/impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4485,7 +4485,7 @@ OPTNONE result_t test_mm_cvtpd_epi32(const SSE2NEONTestImpl &impl,
return validateInt32(ret, d[0], d[1], 0, 0);
}

result_t test_mm_cvtpd_pi32(const SSE2NEONTestImpl &impl, uint32_t iter)
OPTNONE result_t test_mm_cvtpd_pi32(const SSE2NEONTestImpl &impl, uint32_t iter)
{
const double *_a = (const double *) impl.mTestFloatPointer1;
int32_t d[2] = {};
Expand Down Expand Up @@ -4793,7 +4793,8 @@ result_t test_mm_cvttpd_epi32(const SSE2NEONTestImpl &impl, uint32_t iter)
return validateInt32(ret, d0, d1, 0, 0);
}

result_t test_mm_cvttpd_pi32(const SSE2NEONTestImpl &impl, uint32_t iter)
OPTNONE result_t test_mm_cvttpd_pi32(const SSE2NEONTestImpl &impl,
uint32_t iter)
{
const double *_a = (const double *) impl.mTestFloatPointer1;

Expand Down Expand Up @@ -8950,7 +8951,7 @@ result_t test_mm_packus_epi32(const SSE2NEONTestImpl &impl, uint32_t iter)
return VALIDATE_UINT16_M128(c, d);
}

result_t test_mm_round_pd(const SSE2NEONTestImpl &impl, uint32_t iter)
OPTNONE result_t test_mm_round_pd(const SSE2NEONTestImpl &impl, uint32_t iter)
{
const double *_a = (double *) impl.mTestFloatPointer1;
double d[2] = {};
Expand Down Expand Up @@ -9015,7 +9016,7 @@ result_t test_mm_round_pd(const SSE2NEONTestImpl &impl, uint32_t iter)
return validateDouble(ret, d[0], d[1]);
}

result_t test_mm_round_ps(const SSE2NEONTestImpl &impl, uint32_t iter)
OPTNONE result_t test_mm_round_ps(const SSE2NEONTestImpl &impl, uint32_t iter)
{
const float *_a = impl.mTestFloatPointer1;
float f[4] = {};
Expand Down

0 comments on commit d1562a7

Please sign in to comment.