From c7af0095b759b0ed3d96e3bbe5de38510414fde1 Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Sat, 5 Jun 2021 03:27:19 +0800 Subject: [PATCH] Drop FMA intrinsic Danila Kutenin pointed out: > Technically speaking, _mm_fmadd_ps is not an SSE extension, this was > introduced with fma extension which took place even after AVX. To clarify the purpose of SSE2NEON, this patch would drop the existing FMA implementation. Related: #82 --- sse2neon.h | 28 +++++++--------------------- tests/impl.cpp | 6 ------ tests/impl.h | 2 -- 3 files changed, 7 insertions(+), 29 deletions(-) diff --git a/sse2neon.h b/sse2neon.h index 3d7f92b2..ff9d7b55 100644 --- a/sse2neon.h +++ b/sse2neon.h @@ -359,8 +359,6 @@ FORCE_INLINE __m128d _mm_round_pd(__m128d, int); FORCE_INLINE __m128 _mm_round_ps(__m128, int); // SSE4.2 FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t); -// FMA -FORCE_INLINE __m128 _mm_fmadd_ps(__m128, __m128, __m128); /* Backwards compatibility for compilers with lack of specific type support */ @@ -6025,7 +6023,13 @@ FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b) FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b) { __m128 mask = {-1.0f, 1.0f, -1.0f, 1.0f}; - return _mm_fmadd_ps(b, mask, a); +#if defined(__aarch64__) + return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(a), + vreinterpretq_f32_m128(mask), + vreinterpretq_f32_m128(b))); +#else + return _mm_add_ps(_mm_mul_ps(b, mask), a); +#endif } // Horizontally add adjacent pairs of double-precision (64-bit) floating-point @@ -8003,24 +8007,6 @@ FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) } #endif -/* FMA */ - -// Computes the fused multiple add product of 32-bit floating point numbers. -// -// Return Value -// Multiplies A and B, and adds C to the temporary result before returning it. 
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd -FORCE_INLINE __m128 _mm_fmadd_ps(__m128 a, __m128 b, __m128 c) -{ -#if defined(__aarch64__) - return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(c), - vreinterpretq_f32_m128(b), - vreinterpretq_f32_m128(a))); -#else - return _mm_add_ps(_mm_mul_ps(a, b), c); -#endif -} - /* Others */ // Perform a carry-less multiplication of two 64-bit integers, selected from a diff --git a/tests/impl.cpp b/tests/impl.cpp index 844ef14c..940b46d7 100644 --- a/tests/impl.cpp +++ b/tests/impl.cpp @@ -8773,12 +8773,6 @@ result_t test_mm_aeskeygenassist_si128(const SSE2NEONTestImpl &impl, uint32_t i) return validate128(resultReference, resultIntrinsic); } -/* FMA */ -result_t test_mm_fmadd_ps(const SSE2NEONTestImpl &impl, uint32_t i) -{ - return TEST_UNIMPL; -} - /* Others */ result_t test_mm_clmulepi64_si128(const SSE2NEONTestImpl &impl, uint32_t i) { diff --git a/tests/impl.h b/tests/impl.h index 80b60367..f52dc06c 100644 --- a/tests/impl.h +++ b/tests/impl.h @@ -526,8 +526,6 @@ TYPE(mm_aesenc_si128) \ TYPE(mm_aesenclast_si128) \ TYPE(mm_aeskeygenassist_si128) \ - /* FMA */ \ - TYPE(mm_fmadd_ps) \ /* Others */ \ TYPE(mm_clmulepi64_si128) \ TYPE(mm_popcnt_u32) \