From c7af0095b759b0ed3d96e3bbe5de38510414fde1 Mon Sep 17 00:00:00 2001
From: Jim Huang <jserv@biilabs.io>
Date: Sat, 5 Jun 2021 03:27:19 +0800
Subject: [PATCH] Drop FMA intrinsic

Danila Kutenin pointed out:
> Technically speaking, _mm_fmadd_ps is not an SSE extension, this was
> introduced with fma extension which took place even after AVX.

To clarify the purpose of SSE2NEON, this patch drops the existing
FMA implementation.

Related: #82
---
 sse2neon.h     | 28 +++++++---------------------
 tests/impl.cpp |  6 ------
 tests/impl.h   |  2 --
 3 files changed, 7 insertions(+), 29 deletions(-)

diff --git a/sse2neon.h b/sse2neon.h
index 3d7f92b2..ff9d7b55 100644
--- a/sse2neon.h
+++ b/sse2neon.h
@@ -359,8 +359,6 @@ FORCE_INLINE __m128d _mm_round_pd(__m128d, int);
 FORCE_INLINE __m128 _mm_round_ps(__m128, int);
 // SSE4.2
 FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t);
-// FMA
-FORCE_INLINE __m128 _mm_fmadd_ps(__m128, __m128, __m128);
 
 /* Backwards compatibility for compilers with lack of specific type support */
 
@@ -6025,7 +6023,13 @@ FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b)
 FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
 {
     __m128 mask = {-1.0f, 1.0f, -1.0f, 1.0f};
-    return _mm_fmadd_ps(b, mask, a);
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(a),
+                                            vreinterpretq_f32_m128(mask),
+                                            vreinterpretq_f32_m128(b)));
+#else
+    return _mm_add_ps(_mm_mul_ps(b, mask), a);
+#endif
 }
 
 // Horizontally add adjacent pairs of double-precision (64-bit) floating-point
@@ -8003,24 +8007,6 @@ FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
 }
 #endif
 
-/* FMA */
-
-// Computes the fused multiple add product of 32-bit floating point numbers.
-//
-// Return Value
-// Multiplies A and B, and adds C to the temporary result before returning it.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd
-FORCE_INLINE __m128 _mm_fmadd_ps(__m128 a, __m128 b, __m128 c)
-{
-#if defined(__aarch64__)
-    return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(c),
-                                            vreinterpretq_f32_m128(b),
-                                            vreinterpretq_f32_m128(a)));
-#else
-    return _mm_add_ps(_mm_mul_ps(a, b), c);
-#endif
-}
-
 /* Others */
 
 // Perform a carry-less multiplication of two 64-bit integers, selected from a
diff --git a/tests/impl.cpp b/tests/impl.cpp
index 844ef14c..940b46d7 100644
--- a/tests/impl.cpp
+++ b/tests/impl.cpp
@@ -8773,12 +8773,6 @@ result_t test_mm_aeskeygenassist_si128(const SSE2NEONTestImpl &impl, uint32_t i)
     return validate128(resultReference, resultIntrinsic);
 }
 
-/* FMA */
-result_t test_mm_fmadd_ps(const SSE2NEONTestImpl &impl, uint32_t i)
-{
-    return TEST_UNIMPL;
-}
-
 /* Others */
 result_t test_mm_clmulepi64_si128(const SSE2NEONTestImpl &impl, uint32_t i)
 {
diff --git a/tests/impl.h b/tests/impl.h
index 80b60367..f52dc06c 100644
--- a/tests/impl.h
+++ b/tests/impl.h
@@ -526,8 +526,6 @@
     TYPE(mm_aesenc_si128)          \
     TYPE(mm_aesenclast_si128)      \
     TYPE(mm_aeskeygenassist_si128) \
-    /* FMA */                      \
-    TYPE(mm_fmadd_ps)              \
     /* Others */                   \
     TYPE(mm_clmulepi64_si128)      \
     TYPE(mm_popcnt_u32)            \