From e4dfd7c35d24bdb6e3c5e41c281326cfc01ca3c6 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Wed, 30 Oct 2024 21:49:26 +0000
Subject: [PATCH] Improve: Special cases for Weighted Sums

The `wsum` kernel is often used to add two vectors, with both Alpha and
Beta equal to one, or to scale just one vector, with Alpha or Beta equal
to zero. Those special cases are now dispatched to dedicated `add` and
`scale` kernels and are even faster on x86 machines.

---
 include/simsimd/elementwise.h | 848 ++++++++++++++++++++++++++++++++--
 1 file changed, 811 insertions(+), 37 deletions(-)

diff --git a/include/simsimd/elementwise.h b/include/simsimd/elementwise.h
index 8ece3b6c..46a5b71c 100644
--- a/include/simsimd/elementwise.h
+++ b/include/simsimd/elementwise.h
@@ -270,11 +270,60 @@ SIMSIMD_PUBLIC void simsimd_fma_u8_sapphire(
 #pragma GCC target("avx2", "f16c", "fma")
 #pragma clang attribute push(__attribute__((target("avx2,f16c,fma"))), apply_to = function)
 
+SIMSIMD_PUBLIC void simsimd_add_f32_haswell(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n,
+                                            simsimd_f32_t *result) {
+    // The main loop:
+    simsimd_size_t i = 0;
+    for (; i + 8 <= n; i += 8) {
+        __m256 a_vec = _mm256_loadu_ps(a + i);
+        __m256 b_vec = _mm256_loadu_ps(b + i);
+        __m256 sum_vec = _mm256_add_ps(a_vec, b_vec);
+        _mm256_storeu_ps(result + i, sum_vec);
+    }
+
+    // The tail:
+    for (; i < n; ++i) result[i] = a[i] + b[i];
+}
+
+SIMSIMD_PUBLIC void simsimd_scale_f32_haswell(simsimd_f32_t const *a, simsimd_size_t n, simsimd_distance_t alpha,
+                                              simsimd_f32_t *result) {
+    simsimd_f32_t alpha_f32 = (simsimd_f32_t)alpha;
+    __m256 alpha_vec = _mm256_set1_ps(alpha_f32);
+
+    // The main loop:
+    simsimd_size_t i = 0;
+    for (; i + 8 <= n; i += 8) {
+        __m256 a_vec = _mm256_loadu_ps(a + i);
+        __m256 sum_vec = _mm256_mul_ps(a_vec, alpha_vec);
+        _mm256_storeu_ps(result + i, sum_vec);
+    }
+
+    // The tail:
+    for (; i < n; ++i) result[i] = alpha_f32 * a[i];
+}
+
 SIMSIMD_PUBLIC void simsimd_wsum_f32_haswell(                          //
     simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n,  //
     simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_f32_t *result) {
     simsimd_f32_t alpha_f32 = (simsimd_f32_t)alpha;
     simsimd_f32_t beta_f32 = (simsimd_f32_t)beta;
+
+    // There are several special cases we may want to implement:
+    // 1. Simple addition, when both weights are equal to 1.0.
+    if (alpha == 1 && beta == 1) {
+        // In this case we can avoid expensive multiplications.
+        simsimd_add_f32_haswell(a, b, n, result);
+        return;
+    }
+    // 2. Just scaling, when one of the weights is equal to zero.
+    else if (alpha == 0 || beta == 0) {
+        // In this case we can avoid half of the load instructions.
+        if (beta == 0) { simsimd_scale_f32_haswell(a, n, alpha, result); }
+        else { simsimd_scale_f32_haswell(b, n, beta, result); }
+        return;
+    }
+
+    // The general case.
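+    // Process 8 `f32` values per cycle: result[i] = alpha * a[i] + beta * b[i].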
     __m256 alpha_vec = _mm256_set1_ps(alpha_f32);
     __m256 beta_vec = _mm256_set1_ps(beta_f32);
 
@@ -283,9 +332,9 @@ SIMSIMD_PUBLIC void simsimd_wsum_f32_haswell( //
     for (; i + 8 <= n; i += 8) {
         __m256 a_vec = _mm256_loadu_ps(a + i);
         __m256 b_vec = _mm256_loadu_ps(b + i);
-        __m256 a_scaled = _mm256_mul_ps(a_vec, alpha_vec);
-        __m256 b_scaled = _mm256_mul_ps(b_vec, beta_vec);
-        __m256 sum_vec = _mm256_add_ps(a_scaled, b_scaled);
+        __m256 a_scaled_vec = _mm256_mul_ps(a_vec, alpha_vec);
+        __m256 b_scaled_vec = _mm256_mul_ps(b_vec, beta_vec);
+        __m256 sum_vec = _mm256_add_ps(a_scaled_vec, b_scaled_vec);
         _mm256_storeu_ps(result + i, sum_vec);
     }
 
@@ -293,9 +342,57 @@ SIMSIMD_PUBLIC void simsimd_wsum_f32_haswell( //
     for (; i < n; ++i) result[i] = alpha_f32 * a[i] + beta_f32 * b[i];
 }
 
+SIMSIMD_PUBLIC void simsimd_add_f64_haswell(simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_size_t n,
+                                            simsimd_f64_t *result) {
+    // The main loop:
+    simsimd_size_t i = 0;
+    for (; i + 4 <= n; i += 4) {
+        __m256d a_vec = _mm256_loadu_pd(a + i);
+        __m256d b_vec = _mm256_loadu_pd(b + i);
+        __m256d sum_vec = _mm256_add_pd(a_vec, b_vec);
+        _mm256_storeu_pd(result + i, sum_vec);
+    }
+
+    // The tail:
+    for (; i < n; ++i) result[i] = a[i] + b[i];
+}
+
+SIMSIMD_PUBLIC void simsimd_scale_f64_haswell(simsimd_f64_t const *a, simsimd_size_t n, simsimd_distance_t alpha,
+                                              simsimd_f64_t *result) {
+    __m256d alpha_vec = _mm256_set1_pd(alpha);
+
+    // The main loop:
+    simsimd_size_t i = 0;
+    for (; i + 4 <= n; i += 4) {
+        __m256d a_vec = _mm256_loadu_pd(a + i);
+        __m256d sum_vec = _mm256_mul_pd(a_vec, alpha_vec);
+        _mm256_storeu_pd(result + i, sum_vec);
+    }
+
+    // The tail:
+    for (; i < n; ++i) result[i] = alpha * a[i];
+}
+
 SIMSIMD_PUBLIC void simsimd_wsum_f64_haswell(                          //
     simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_size_t n,  //
     simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_f64_t *result) {
+
+    // There are several special cases we may want to implement:
+    // 1. Simple addition, when both weights are equal to 1.0.
+    if (alpha == 1 && beta == 1) {
+        // In this case we can avoid expensive multiplications.
+        simsimd_add_f64_haswell(a, b, n, result);
+        return;
+    }
+    // 2. Just scaling, when one of the weights is equal to zero.
+    else if (alpha == 0 || beta == 0) {
+        // In this case we can avoid half of the load instructions.
+        if (beta == 0) { simsimd_scale_f64_haswell(a, n, alpha, result); }
+        else { simsimd_scale_f64_haswell(b, n, beta, result); }
+        return;
+    }
+
+    // The general case.
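+    // Process 4 `f64` values per cycle: result[i] = alpha * a[i] + beta * b[i].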
     __m256d alpha_vec = _mm256_set1_pd(alpha);
     __m256d beta_vec = _mm256_set1_pd(beta);
 
@@ -304,9 +401,9 @@ SIMSIMD_PUBLIC void simsimd_wsum_f64_haswell( //
     for (; i + 4 <= n; i += 4) {
         __m256d a_vec = _mm256_loadu_pd(a + i);
         __m256d b_vec = _mm256_loadu_pd(b + i);
-        __m256d a_scaled = _mm256_mul_pd(a_vec, alpha_vec);
-        __m256d b_scaled = _mm256_mul_pd(b_vec, beta_vec);
-        __m256d sum_vec = _mm256_add_pd(a_scaled, b_scaled);
+        __m256d a_scaled_vec = _mm256_mul_pd(a_vec, alpha_vec);
+        __m256d b_scaled_vec = _mm256_mul_pd(b_vec, beta_vec);
+        __m256d sum_vec = _mm256_add_pd(a_scaled_vec, b_scaled_vec);
         _mm256_storeu_pd(result + i, sum_vec);
     }
 
@@ -314,9 +411,73 @@ SIMSIMD_PUBLIC void simsimd_wsum_f64_haswell( //
     for (; i < n; ++i) result[i] = alpha * a[i] + beta * b[i];
 }
 
+SIMSIMD_PUBLIC void simsimd_add_f16_haswell(simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n,
+                                            simsimd_f16_t *result) {
+
+    // The main loop:
+    simsimd_size_t i = 0;
+    for (; i + 8 <= n; i += 8) {
+        __m128i a_f16 = _mm_lddqu_si128((__m128i const *)(a + i));
+        __m128i b_f16 = _mm_lddqu_si128((__m128i const *)(b + i));
+        __m256 a_vec = _mm256_cvtph_ps(a_f16);
+        __m256 b_vec = _mm256_cvtph_ps(b_f16);
+        __m256 sum_vec = _mm256_add_ps(a_vec, b_vec);
+        __m128i sum_f16 = _mm256_cvtps_ph(sum_vec, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+        _mm_storeu_si128((__m128i *)(result + i), sum_f16);
+    }
+
+    // The tail:
+    for (; i < n; ++i) {
+        simsimd_f32_t ai = SIMSIMD_F16_TO_F32(a + i);
+        simsimd_f32_t bi = SIMSIMD_F16_TO_F32(b + i);
+        simsimd_f32_t sum = ai + bi;
+        SIMSIMD_F32_TO_F16(sum, result + i);
+    }
+}
+
+SIMSIMD_PUBLIC void simsimd_scale_f16_haswell(simsimd_f16_t const *a, simsimd_size_t n, simsimd_distance_t alpha,
+                                              simsimd_f16_t *result) {
+    simsimd_f32_t alpha_f32 = (simsimd_f32_t)alpha;
+    __m256 alpha_vec = _mm256_set1_ps(alpha_f32);
+
+    // The main loop:
+    simsimd_size_t i = 0;
+    for (; i + 8 <= n; i += 8) {
+        __m128i a_f16 = _mm_lddqu_si128((__m128i const *)(a + i));
+        __m256 a_vec = _mm256_cvtph_ps(a_f16);
+        __m256 sum_vec = _mm256_mul_ps(a_vec, alpha_vec);
+        __m128i sum_f16 = _mm256_cvtps_ph(sum_vec, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+        _mm_storeu_si128((__m128i *)(result + i), sum_f16);
+    }
+
+    // The tail:
+    for (; i < n; ++i) {
+        simsimd_f32_t ai = SIMSIMD_F16_TO_F32(a + i);
+        simsimd_f32_t sum = alpha_f32 * ai;
+        SIMSIMD_F32_TO_F16(sum, result + i);
+    }
+}
+
 SIMSIMD_PUBLIC void simsimd_wsum_f16_haswell(                          //
     simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n,  //
     simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_f16_t *result) {
+
+    // There are several special cases we may want to implement:
+    // 1. Simple addition, when both weights are equal to 1.0.
+    if (alpha == 1 && beta == 1) {
+        // In this case we can avoid expensive multiplications.
+        simsimd_add_f16_haswell(a, b, n, result);
+        return;
+    }
+    // 2. Just scaling, when one of the weights is equal to zero.
+    else if (alpha == 0 || beta == 0) {
+        // In this case we can avoid half of the load instructions.
+        if (beta == 0) { simsimd_scale_f16_haswell(a, n, alpha, result); }
+        else { simsimd_scale_f16_haswell(b, n, beta, result); }
+        return;
+    }
+
+    // The general case.
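+    // Upcast 8 `f16` values to `f32` with F16C, blend them, and round back to `f16`.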
     simsimd_f32_t alpha_f32 = (simsimd_f32_t)alpha;
     simsimd_f32_t beta_f32 = (simsimd_f32_t)beta;
     __m256 alpha_vec = _mm256_set1_ps(alpha_f32);
@@ -325,13 +486,13 @@ SIMSIMD_PUBLIC void simsimd_wsum_f16_haswell( //
     // The main loop:
     simsimd_size_t i = 0;
     for (; i + 8 <= n; i += 8) {
-        __m128i a_f16 = _mm_loadu_si128((__m128i const *)(a + i));
-        __m128i b_f16 = _mm_loadu_si128((__m128i const *)(b + i));
+        __m128i a_f16 = _mm_lddqu_si128((__m128i const *)(a + i));
+        __m128i b_f16 = _mm_lddqu_si128((__m128i const *)(b + i));
         __m256 a_vec = _mm256_cvtph_ps(a_f16);
         __m256 b_vec = _mm256_cvtph_ps(b_f16);
-        __m256 a_scaled = _mm256_mul_ps(a_vec, alpha_vec);
-        __m256 b_scaled = _mm256_mul_ps(b_vec, beta_vec);
-        __m256 sum_vec = _mm256_add_ps(a_scaled, b_scaled);
+        __m256 a_scaled_vec = _mm256_mul_ps(a_vec, alpha_vec);
+        __m256 b_scaled_vec = _mm256_mul_ps(b_vec, beta_vec);
+        __m256 sum_vec = _mm256_add_ps(a_scaled_vec, b_scaled_vec);
         __m128i sum_f16 = _mm256_cvtps_ph(sum_vec, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
         _mm_storeu_si128((__m128i *)(result + i), sum_f16);
     }
@@ -345,9 +506,72 @@ SIMSIMD_PUBLIC void simsimd_wsum_f16_haswell( //
     }
 }
 
+SIMSIMD_PUBLIC void simsimd_add_bf16_haswell(simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_size_t n,
+                                             simsimd_bf16_t *result) {
+    // The main loop:
+    simsimd_size_t i = 0;
+    for (; i + 8 <= n; i += 8) {
+        __m128i a_bf16 = _mm_lddqu_si128((__m128i const *)(a + i));
+        __m128i b_bf16 = _mm_lddqu_si128((__m128i const *)(b + i));
+        __m256 a_vec = _simsimd_bf16x8_to_f32x8_haswell(a_bf16);
+        __m256 b_vec = _simsimd_bf16x8_to_f32x8_haswell(b_bf16);
+        __m256 sum_vec = _mm256_add_ps(a_vec, b_vec);
+        __m128i sum_bf16 = _simsimd_f32x8_to_bf16x8_haswell(sum_vec);
+        _mm_storeu_si128((__m128i *)(result + i), sum_bf16);
+    }
+
+    // The tail:
+    for (; i < n; ++i) {
+        simsimd_f32_t ai = SIMSIMD_BF16_TO_F32(a + i);
+        simsimd_f32_t bi = SIMSIMD_BF16_TO_F32(b + i);
+        simsimd_f32_t sum = ai + bi;
+        SIMSIMD_F32_TO_BF16(sum, result + i);
+    }
+}
+
+SIMSIMD_PUBLIC void simsimd_scale_bf16_haswell(simsimd_bf16_t const *a, simsimd_size_t n, simsimd_distance_t alpha,
+                                               simsimd_bf16_t *result) {
+    simsimd_f32_t alpha_f32 = (simsimd_f32_t)alpha;
+    __m256 alpha_vec = _mm256_set1_ps(alpha_f32);
+
+    // The main loop:
+    simsimd_size_t i = 0;
+    for (; i + 8 <= n; i += 8) {
+        __m128i a_bf16 = _mm_lddqu_si128((__m128i const *)(a + i));
+        __m256 a_vec = _simsimd_bf16x8_to_f32x8_haswell(a_bf16);
+        __m256 sum_vec = _mm256_mul_ps(a_vec, alpha_vec);
+        __m128i sum_bf16 = _simsimd_f32x8_to_bf16x8_haswell(sum_vec);
+        _mm_storeu_si128((__m128i *)(result + i), sum_bf16);
+    }
+
+    // The tail:
+    for (; i < n; ++i) {
+        simsimd_f32_t ai = SIMSIMD_BF16_TO_F32(a + i);
+        simsimd_f32_t sum = alpha_f32 * ai;
+        SIMSIMD_F32_TO_BF16(sum, result + i);
+    }
+}
+
 SIMSIMD_PUBLIC void simsimd_wsum_bf16_haswell(                           //
     simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_size_t n,  //
     simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_bf16_t *result) {
+
+    // There are several special cases we may want to implement:
+    // 1. Simple addition, when both weights are equal to 1.0.
+    if (alpha == 1 && beta == 1) {
+        // In this case we can avoid expensive multiplications.
+        simsimd_add_bf16_haswell(a, b, n, result);
+        return;
+    }
+    // 2. Just scaling, when one of the weights is equal to zero.
+    else if (alpha == 0 || beta == 0) {
+        // In this case we can avoid half of the load instructions.
+        if (beta == 0) { simsimd_scale_bf16_haswell(a, n, alpha, result); }
+        else { simsimd_scale_bf16_haswell(b, n, beta, result); }
+        return;
+    }
+
+    // The general case.
     simsimd_f32_t alpha_f32 = (simsimd_f32_t)alpha;
     simsimd_f32_t beta_f32 = (simsimd_f32_t)beta;
     __m256 alpha_vec = _mm256_set1_ps(alpha_f32);
@@ -356,13 +580,13 @@ SIMSIMD_PUBLIC void simsimd_wsum_bf16_haswell( //
     // The main loop:
     simsimd_size_t i = 0;
     for (; i + 8 <= n; i += 8) {
-        __m128i a_bf16 = _mm_loadu_si128((__m128i const *)(a + i));
-        __m128i b_bf16 = _mm_loadu_si128((__m128i const *)(b + i));
+        __m128i a_bf16 = _mm_lddqu_si128((__m128i const *)(a + i));
+        __m128i b_bf16 = _mm_lddqu_si128((__m128i const *)(b + i));
         __m256 a_vec = _simsimd_bf16x8_to_f32x8_haswell(a_bf16);
         __m256 b_vec = _simsimd_bf16x8_to_f32x8_haswell(b_bf16);
-        __m256 a_scaled = _mm256_mul_ps(a_vec, alpha_vec);
-        __m256 b_scaled = _mm256_mul_ps(b_vec, beta_vec);
-        __m256 sum_vec = _mm256_add_ps(a_scaled, b_scaled);
+        __m256 a_scaled_vec = _mm256_mul_ps(a_vec, alpha_vec);
+        __m256 b_scaled_vec = _mm256_mul_ps(b_vec, beta_vec);
+        __m256 sum_vec = _mm256_add_ps(a_scaled_vec, b_scaled_vec);
         __m128i sum_bf16 = _simsimd_f32x8_to_bf16x8_haswell(sum_vec);
         _mm_storeu_si128((__m128i *)(result + i), sum_bf16);
     }
@@ -435,9 +659,9 @@ SIMSIMD_PUBLIC void simsimd_fma_f16_haswell( //
     // The main loop:
     simsimd_size_t i = 0;
     for (; i + 8 <= n; i += 8) {
-        __m128i a_f16 = _mm_loadu_si128((__m128i const *)(a + i));
-        __m128i b_f16 = _mm_loadu_si128((__m128i const *)(b + i));
-        __m128i c_f16 = _mm_loadu_si128((__m128i const *)(c + i));
+        __m128i a_f16 = _mm_lddqu_si128((__m128i const *)(a + i));
+        __m128i b_f16 = _mm_lddqu_si128((__m128i const *)(b + i));
+        __m128i c_f16 = _mm_lddqu_si128((__m128i const *)(c + i));
         __m256 a_vec = _mm256_cvtph_ps(a_f16);
         __m256 b_vec = _mm256_cvtph_ps(b_f16);
         __m256 c_vec = _mm256_cvtph_ps(c_f16);
@@ -470,9 +694,9 @@ SIMSIMD_PUBLIC void simsimd_fma_bf16_haswell( /
     // The main loop:
     simsimd_size_t i = 0;
     for (; i + 8 <= n; i += 8) {
-        __m128i a_bf16 = _mm_loadu_si128((__m128i const *)(a + i));
-        __m128i b_bf16 = _mm_loadu_si128((__m128i const *)(b + i));
-        __m128i c_bf16 = _mm_loadu_si128((__m128i const *)(c + i));
+        __m128i a_bf16 = _mm_lddqu_si128((__m128i const *)(a + i));
+        __m128i b_bf16 = _mm_lddqu_si128((__m128i const *)(b + i));
+        __m128i c_bf16 = _mm_lddqu_si128((__m128i const *)(c + i));
         __m256 a_vec = _simsimd_bf16x8_to_f32x8_haswell(a_bf16);
         __m256 b_vec = _simsimd_bf16x8_to_f32x8_haswell(b_bf16);
         __m256 c_vec = _simsimd_bf16x8_to_f32x8_haswell(c_bf16);
@@ -494,10 +718,88 @@ SIMSIMD_PUBLIC void simsimd_fma_bf16_haswell( /
     }
 }
 
+SIMSIMD_PUBLIC void simsimd_add_i8_haswell(simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_size_t n,
+                                           simsimd_i8_t *result) {
+    // The main loop:
+    simsimd_size_t i = 0;
+    for (; i + 32 <= n; i += 32) {
+        __m256i a_vec = _mm256_lddqu_si256((__m256i *)(a + i));
+        __m256i b_vec = _mm256_lddqu_si256((__m256i *)(b + i));
+        __m256i sum_vec = _mm256_adds_epi8(a_vec, b_vec);
+        _mm256_storeu_si256((__m256i *)(result + i), sum_vec);
+    }
+
+    // The tail:
+    for (; i < n; ++i) {
+        simsimd_f32_t ai = a[i], bi = b[i];
+        simsimd_f32_t sum = ai + bi;
+        SIMSIMD_F32_TO_I8(sum, result + i);
+    }
+}
+
+SIMSIMD_PUBLIC void simsimd_scale_i8_haswell(simsimd_i8_t const *a, simsimd_size_t n, simsimd_distance_t alpha,
+                                             simsimd_i8_t *result) {
+
+    simsimd_f32_t alpha_f32 = (simsimd_f32_t)alpha;
+    __m256 alpha_vec = _mm256_set1_ps(alpha_f32);
+    int sum_i32s[8], a_i32s[8];
+
+    // The main loop:
+    simsimd_size_t i = 0;
+    for (; i + 8 <= n; i += 8) {
+        //? Handling loads and stores with SIMD is tricky. Not because of upcasting, but the
+        //? downcasting at the end of the loop. In AVX2 it's a drag! Keep it for another day.
+        a_i32s[0] = a[i + 0], a_i32s[1] = a[i + 1], a_i32s[2] = a[i + 2], a_i32s[3] = a[i + 3], //
+        a_i32s[4] = a[i + 4], a_i32s[5] = a[i + 5], a_i32s[6] = a[i + 6], a_i32s[7] = a[i + 7];
+        //! This can be done at least 50% faster if we convert 8-bit integers to floats instead
+        //! of relying on the slow `_mm256_cvtepi32_ps` instruction.
+        __m256 a_vec = _mm256_cvtepi32_ps(_mm256_lddqu_si256((__m256i *)a_i32s));
+        // The normal part.
+        __m256 sum_vec = _mm256_mul_ps(a_vec, alpha_vec);
+        // Instead of serial calls to expensive `SIMSIMD_F32_TO_I8`, convert and clip with SIMD.
+        __m256i sum_i32_vec = _mm256_cvtps_epi32(sum_vec);
+        sum_i32_vec = _mm256_max_epi32(sum_i32_vec, _mm256_set1_epi32(-128));
+        sum_i32_vec = _mm256_min_epi32(sum_i32_vec, _mm256_set1_epi32(127));
+        // Export into a serial buffer.
+        _mm256_storeu_si256((__m256i *)sum_i32s, sum_i32_vec);
+        result[i + 0] = (simsimd_i8_t)sum_i32s[0];
+        result[i + 1] = (simsimd_i8_t)sum_i32s[1];
+        result[i + 2] = (simsimd_i8_t)sum_i32s[2];
+        result[i + 3] = (simsimd_i8_t)sum_i32s[3];
+        result[i + 4] = (simsimd_i8_t)sum_i32s[4];
+        result[i + 5] = (simsimd_i8_t)sum_i32s[5];
+        result[i + 6] = (simsimd_i8_t)sum_i32s[6];
+        result[i + 7] = (simsimd_i8_t)sum_i32s[7];
+    }
+
+    // The tail:
+    for (; i < n; ++i) {
+        simsimd_f32_t ai = a[i];
+        simsimd_f32_t sum = alpha_f32 * ai;
+        SIMSIMD_F32_TO_I8(sum, result + i);
+    }
+}
+
 SIMSIMD_PUBLIC void simsimd_wsum_i8_haswell(                         //
     simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_size_t n,  //
     simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_i8_t *result) {
+    // There are several special cases we may want to implement:
+    // 1. Simple addition, when both weights are equal to 1.0.
+    if (alpha == 1 && beta == 1) {
+        // In this case we can avoid expensive multiplications.
+        simsimd_add_i8_haswell(a, b, n, result);
+        return;
+    }
+    // 2. Just scaling, when one of the weights is equal to zero.
+    else if (alpha == 0 || beta == 0) {
+        // In this case we can avoid half of the load instructions.
+        if (beta == 0) { simsimd_scale_i8_haswell(a, n, alpha, result); }
+        else { simsimd_scale_i8_haswell(b, n, beta, result); }
+        return;
+    }
+
+    // The general case.
     simsimd_f32_t alpha_f32 = (simsimd_f32_t)alpha;
     simsimd_f32_t beta_f32 = (simsimd_f32_t)beta;
     __m256 alpha_vec = _mm256_set1_ps(alpha_f32);
@@ -518,9 +820,9 @@ SIMSIMD_PUBLIC void simsimd_wsum_i8_haswell( //
         __m256 a_vec = _mm256_cvtepi32_ps(_mm256_lddqu_si256((__m256i *)a_i32s));
         __m256 b_vec = _mm256_cvtepi32_ps(_mm256_lddqu_si256((__m256i *)b_i32s));
         // The normal part.
-        __m256 a_scaled = _mm256_mul_ps(a_vec, alpha_vec);
-        __m256 b_scaled = _mm256_mul_ps(b_vec, beta_vec);
-        __m256 sum_vec = _mm256_add_ps(a_scaled, b_scaled);
+        __m256 a_scaled_vec = _mm256_mul_ps(a_vec, alpha_vec);
+        __m256 b_scaled_vec = _mm256_mul_ps(b_vec, beta_vec);
+        __m256 sum_vec = _mm256_add_ps(a_scaled_vec, b_scaled_vec);
         // Instead of serial calls to expensive `SIMSIMD_F32_TO_U8`, convert and clip with SIMD.
         __m256i sum_i32_vec = _mm256_cvtps_epi32(sum_vec);
         sum_i32_vec = _mm256_max_epi32(sum_i32_vec, _mm256_set1_epi32(-128));
@@ -545,10 +847,88 @@ SIMSIMD_PUBLIC void simsimd_wsum_i8_haswell( //
     }
 }
 
+SIMSIMD_PUBLIC void simsimd_add_u8_haswell(simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_size_t n,
+                                           simsimd_u8_t *result) {
+    // The main loop:
+    simsimd_size_t i = 0;
+    for (; i + 32 <= n; i += 32) {
+        __m256i a_vec = _mm256_lddqu_si256((__m256i *)(a + i));
+        __m256i b_vec = _mm256_lddqu_si256((__m256i *)(b + i));
+        __m256i sum_vec = _mm256_adds_epu8(a_vec, b_vec);
+        _mm256_storeu_si256((__m256i *)(result + i), sum_vec);
+    }
+
+    // The tail:
+    for (; i < n; ++i) {
+        simsimd_f32_t ai = a[i], bi = b[i];
+        simsimd_f32_t sum = ai + bi;
+        SIMSIMD_F32_TO_U8(sum, result + i);
+    }
+}
+
+SIMSIMD_PUBLIC void simsimd_scale_u8_haswell(simsimd_u8_t const *a, simsimd_size_t n, simsimd_distance_t alpha,
+                                             simsimd_u8_t *result) {
+
+    simsimd_f32_t alpha_f32 = (simsimd_f32_t)alpha;
+    __m256 alpha_vec = _mm256_set1_ps(alpha_f32);
+    int sum_i32s[8], a_i32s[8];
+
+    // The main loop:
+    simsimd_size_t i = 0;
+    for (; i + 8 <= n; i += 8) {
+        //? Handling loads and stores with SIMD is tricky. Not because of upcasting, but the
+        //? downcasting at the end of the loop. In AVX2 it's a drag! Keep it for another day.
+        a_i32s[0] = a[i + 0], a_i32s[1] = a[i + 1], a_i32s[2] = a[i + 2], a_i32s[3] = a[i + 3], //
+        a_i32s[4] = a[i + 4], a_i32s[5] = a[i + 5], a_i32s[6] = a[i + 6], a_i32s[7] = a[i + 7];
+        //! This can be done at least 50% faster if we convert 8-bit integers to floats instead
+        //! of relying on the slow `_mm256_cvtepi32_ps` instruction.
+        __m256 a_vec = _mm256_cvtepi32_ps(_mm256_lddqu_si256((__m256i *)a_i32s));
+        // The normal part.
+        __m256 sum_vec = _mm256_mul_ps(a_vec, alpha_vec);
+        // Instead of serial calls to expensive `SIMSIMD_F32_TO_U8`, convert and clip with SIMD.
+        __m256i sum_i32_vec = _mm256_cvtps_epi32(sum_vec);
+        sum_i32_vec = _mm256_max_epi32(sum_i32_vec, _mm256_set1_epi32(0));
+        sum_i32_vec = _mm256_min_epi32(sum_i32_vec, _mm256_set1_epi32(255));
+        // Export into a serial buffer.
+        _mm256_storeu_si256((__m256i *)sum_i32s, sum_i32_vec);
+        result[i + 0] = (simsimd_u8_t)sum_i32s[0];
+        result[i + 1] = (simsimd_u8_t)sum_i32s[1];
+        result[i + 2] = (simsimd_u8_t)sum_i32s[2];
+        result[i + 3] = (simsimd_u8_t)sum_i32s[3];
+        result[i + 4] = (simsimd_u8_t)sum_i32s[4];
+        result[i + 5] = (simsimd_u8_t)sum_i32s[5];
+        result[i + 6] = (simsimd_u8_t)sum_i32s[6];
+        result[i + 7] = (simsimd_u8_t)sum_i32s[7];
+    }
+
+    // The tail:
+    for (; i < n; ++i) {
+        simsimd_f32_t ai = a[i];
+        simsimd_f32_t sum = alpha_f32 * ai;
+        SIMSIMD_F32_TO_U8(sum, result + i);
+    }
+}
+
 SIMSIMD_PUBLIC void simsimd_wsum_u8_haswell(                         //
     simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_size_t n,  //
     simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_u8_t *result) {
+    // There are several special cases we may want to implement:
+    // 1. Simple addition, when both weights are equal to 1.0.
+    if (alpha == 1 && beta == 1) {
+        // In this case we can avoid expensive multiplications.
+        simsimd_add_u8_haswell(a, b, n, result);
+        return;
+    }
+    // 2. Just scaling, when one of the weights is equal to zero.
+    else if (alpha == 0 || beta == 0) {
+        // In this case we can avoid half of the load instructions.
+        if (beta == 0) { simsimd_scale_u8_haswell(a, n, alpha, result); }
+        else { simsimd_scale_u8_haswell(b, n, beta, result); }
+        return;
+    }
+
+    // The general case.
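+    // Upcast 8 `u8` values to `f32`, blend them, and clip the sums back to [0, 255].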
     simsimd_f32_t alpha_f32 = (simsimd_f32_t)alpha;
     simsimd_f32_t beta_f32 = (simsimd_f32_t)beta;
     __m256 alpha_vec = _mm256_set1_ps(alpha_f32);
@@ -569,9 +949,9 @@ SIMSIMD_PUBLIC void simsimd_wsum_u8_haswell( //
         __m256 a_vec = _mm256_cvtepi32_ps(_mm256_lddqu_si256((__m256i *)a_i32s));
         __m256 b_vec = _mm256_cvtepi32_ps(_mm256_lddqu_si256((__m256i *)b_i32s));
         // The normal part.
-        __m256 a_scaled = _mm256_mul_ps(a_vec, alpha_vec);
-        __m256 b_scaled = _mm256_mul_ps(b_vec, beta_vec);
-        __m256 sum_vec = _mm256_add_ps(a_scaled, b_scaled);
+        __m256 a_scaled_vec = _mm256_mul_ps(a_vec, alpha_vec);
+        __m256 b_scaled_vec = _mm256_mul_ps(b_vec, beta_vec);
+        __m256 sum_vec = _mm256_add_ps(a_scaled_vec, b_scaled_vec);
         // Instead of serial calls to expensive `SIMSIMD_F32_TO_U8`, convert and clip with SIMD.
         __m256i sum_i32_vec = _mm256_cvtps_epi32(sum_vec);
         sum_i32_vec = _mm256_max_epi32(sum_i32_vec, _mm256_set1_epi32(0));
@@ -715,14 +1095,73 @@ SIMSIMD_PUBLIC void simsimd_fma_u8_haswell(
 #pragma GCC target("avx2", "avx512f", "avx512vl", "avx512bw", "bmi2")
 #pragma clang attribute push(__attribute__((target("avx2,avx512f,avx512vl,avx512bw,bmi2"))), apply_to = function)
 
+SIMSIMD_PUBLIC void simsimd_add_f64_skylake(simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_size_t n,
+                                            simsimd_f64_t *result) {
+    __m512d a_vec, b_vec, sum_vec;
+    __mmask8 mask = 0xFF;
+simsimd_add_f64_skylake_cycle:
+    if (n < 8) {
+        mask = (__mmask8)_bzhi_u32(0xFFFFFFFF, n);
+        a_vec = _mm512_maskz_loadu_pd(mask, a);
+        b_vec = _mm512_maskz_loadu_pd(mask, b);
+        n = 0;
+    }
+    else {
+        a_vec = _mm512_loadu_pd(a);
+        b_vec = _mm512_loadu_pd(b);
+        a += 8, b += 8, n -= 8;
+    }
+    sum_vec = _mm512_add_pd(a_vec, b_vec);
+    _mm512_mask_storeu_pd(result, mask, sum_vec);
+    result += 8;
+    if (n) goto simsimd_add_f64_skylake_cycle;
+}
+
+SIMSIMD_PUBLIC void simsimd_scale_f64_skylake(simsimd_f64_t const *a, simsimd_size_t n, simsimd_distance_t alpha,
+                                              simsimd_f64_t *result) {
+    __m512d alpha_vec = _mm512_set1_pd(alpha);
+    __m512d a_vec, sum_vec;
+    __mmask8 mask = 0xFF;
+simsimd_scale_f64_skylake_cycle:
+    if (n < 8) {
+        mask = (__mmask8)_bzhi_u32(0xFFFFFFFF, n);
+        a_vec = _mm512_maskz_loadu_pd(mask, a);
+        n = 0;
+    }
+    else {
+        a_vec = _mm512_loadu_pd(a);
+        a += 8, n -= 8;
+    }
+    sum_vec = _mm512_mul_pd(a_vec, alpha_vec);
+    _mm512_mask_storeu_pd(result, mask, sum_vec);
+    result += 8;
+    if (n) goto simsimd_scale_f64_skylake_cycle;
+}
+
 SIMSIMD_PUBLIC void simsimd_wsum_f64_skylake(                          //
     simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_size_t n,  //
     simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_f64_t *result) {
+
+    // There are several special cases we may want to implement:
+    // 1. Simple addition, when both weights are equal to 1.0.
+    if (alpha == 1 && beta == 1) {
+        // In this case we can avoid expensive multiplications.
+        simsimd_add_f64_skylake(a, b, n, result);
+        return;
+    }
+    // 2. Just scaling, when one of the weights is equal to zero.
+    else if (alpha == 0 || beta == 0) {
+        // In this case we can avoid half of the load instructions.
+        if (beta == 0) { simsimd_scale_f64_skylake(a, n, alpha, result); }
+        else { simsimd_scale_f64_skylake(b, n, beta, result); }
+        return;
+    }
+
+    // The general case.
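+    // Process 8 `f64` values per cycle, masking the incomplete tail loads and stores.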
     __m512d alpha_vec = _mm512_set1_pd(alpha);
     __m512d beta_vec = _mm512_set1_pd(beta);
     __m512d a_vec, b_vec, a_scaled_vec, sum_vec;
     __mmask8 mask = 0xFF;
-
 simsimd_wsum_f64_skylake_cycle:
     if (n < 8) {
         mask = (__mmask8)_bzhi_u32(0xFFFFFFFF, n);
@@ -742,14 +1181,75 @@ SIMSIMD_PUBLIC void simsimd_wsum_f64_skylake( //
     if (n) goto simsimd_wsum_f64_skylake_cycle;
 }
 
+SIMSIMD_PUBLIC void simsimd_add_f32_skylake(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n,
+                                            simsimd_f32_t *result) {
+    __m512 a_vec, b_vec, sum_vec;
+    __mmask16 mask = 0xFFFF;
+
+simsimd_add_f32_skylake_cycle:
+    if (n < 16) {
+        mask = (__mmask16)_bzhi_u32(0xFFFFFFFF, n);
+        a_vec = _mm512_maskz_loadu_ps(mask, a);
+        b_vec = _mm512_maskz_loadu_ps(mask, b);
+        n = 0;
+    }
+    else {
+        a_vec = _mm512_loadu_ps(a);
+        b_vec = _mm512_loadu_ps(b);
+        a += 16, b += 16, n -= 16;
+    }
+    sum_vec = _mm512_add_ps(a_vec, b_vec);
+    _mm512_mask_storeu_ps(result, mask, sum_vec);
+    result += 16;
+    if (n) goto simsimd_add_f32_skylake_cycle;
+}
+
+SIMSIMD_PUBLIC void simsimd_scale_f32_skylake(simsimd_f32_t const *a, simsimd_size_t n, simsimd_distance_t alpha,
+                                              simsimd_f32_t *result) {
+    __m512 alpha_vec = _mm512_set1_ps(alpha);
+    __m512 a_vec, sum_vec;
+    __mmask16 mask = 0xFFFF;
+
+simsimd_scale_f32_skylake_cycle:
+    if (n < 16) {
+        mask = (__mmask16)_bzhi_u32(0xFFFFFFFF, n);
+        a_vec = _mm512_maskz_loadu_ps(mask, a);
+        n = 0;
+    }
+    else {
+        a_vec = _mm512_loadu_ps(a);
+        a += 16, n -= 16;
+    }
+    sum_vec = _mm512_mul_ps(a_vec, alpha_vec);
+    _mm512_mask_storeu_ps(result, mask, sum_vec);
+    result += 16;
+    if (n) goto simsimd_scale_f32_skylake_cycle;
+}
+
 SIMSIMD_PUBLIC void simsimd_wsum_f32_skylake(                          //
     simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n,  //
     simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_f32_t *result) {
+
+    // There are several special cases we may want to implement:
+    // 1. Simple addition, when both weights are equal to 1.0.
+    if (alpha == 1 && beta == 1) {
+        // In this case we can avoid expensive multiplications.
+        simsimd_add_f32_skylake(a, b, n, result);
+        return;
+    }
+    // 2. Just scaling, when one of the weights is equal to zero.
+    else if (alpha == 0 || beta == 0) {
+        // In this case we can avoid half of the load instructions.
+        if (beta == 0) { simsimd_scale_f32_skylake(a, n, alpha, result); }
+        else { simsimd_scale_f32_skylake(b, n, beta, result); }
+        return;
+    }
+
+    // The general case.
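+    // Process 16 `f32` values per cycle, masking the incomplete tail loads and stores.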
     __m512 alpha_vec = _mm512_set1_ps(alpha);
     __m512 beta_vec = _mm512_set1_ps(beta);
     __m512 a_vec, b_vec, a_scaled_vec, sum_vec;
     __mmask16 mask = 0xFFFF;
-
 simsimd_wsum_f32_skylake_cycle:
     if (n < 16) {
         mask = (__mmask16)_bzhi_u32(0xFFFFFFFF, n);
@@ -769,15 +1269,81 @@ SIMSIMD_PUBLIC void simsimd_wsum_f32_skylake( //
     if (n) goto simsimd_wsum_f32_skylake_cycle;
 }
 
+SIMSIMD_PUBLIC void simsimd_add_bf16_skylake(simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_size_t n,
+                                             simsimd_bf16_t *result) {
+    __m256i a_bf16_vec, b_bf16_vec, sum_bf16_vec;
+    __m512 a_vec, b_vec, sum_vec;
+    __mmask16 mask = 0xFFFF;
+simsimd_add_bf16_skylake_cycle:
+    if (n < 16) {
+        mask = (__mmask16)_bzhi_u32(0xFFFFFFFF, n);
+        a_bf16_vec = _mm256_maskz_loadu_epi16(mask, a);
+        b_bf16_vec = _mm256_maskz_loadu_epi16(mask, b);
+        n = 0;
+    }
+    else {
+        a_bf16_vec = _mm256_loadu_epi16(a);
+        b_bf16_vec = _mm256_loadu_epi16(b);
+        a += 16, b += 16, n -= 16;
+    }
+    a_vec = _simsimd_bf16x16_to_f32x16_skylake(a_bf16_vec);
+    b_vec = _simsimd_bf16x16_to_f32x16_skylake(b_bf16_vec);
+    sum_vec = _mm512_add_ps(a_vec, b_vec);
+    sum_bf16_vec = _simsimd_f32x16_to_bf16x16_skylake(sum_vec);
+    _mm256_mask_storeu_epi16(result, mask, sum_bf16_vec);
+    result += 16;
+    if (n) goto simsimd_add_bf16_skylake_cycle;
+}
+
+SIMSIMD_PUBLIC void simsimd_scale_bf16_skylake(simsimd_bf16_t const *a, simsimd_size_t n, simsimd_distance_t alpha,
+                                               simsimd_bf16_t *result) {
+    __m512 alpha_vec = _mm512_set1_ps(alpha);
+    __m256i a_bf16_vec, sum_bf16_vec;
+    __m512 a_vec, sum_vec;
+    __mmask16 mask = 0xFFFF;
+simsimd_scale_bf16_skylake_cycle:
+    if (n < 16) {
+        mask = (__mmask16)_bzhi_u32(0xFFFFFFFF, n);
+        a_bf16_vec = _mm256_maskz_loadu_epi16(mask, a);
+        n = 0;
+    }
+    else {
+        a_bf16_vec = _mm256_loadu_epi16(a);
+        a += 16, n -= 16;
+    }
+    a_vec = _simsimd_bf16x16_to_f32x16_skylake(a_bf16_vec);
+    sum_vec = _mm512_mul_ps(a_vec, alpha_vec);
+    sum_bf16_vec = _simsimd_f32x16_to_bf16x16_skylake(sum_vec);
+    _mm256_mask_storeu_epi16(result, mask, sum_bf16_vec);
+    result += 16;
+    if (n) goto simsimd_scale_bf16_skylake_cycle;
+}
+
 SIMSIMD_PUBLIC void simsimd_wsum_bf16_skylake(                           //
     simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_size_t n,  //
     simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_bf16_t *result) {
+
+    // There are several special cases we may want to implement:
+    // 1. Simple addition, when both weights are equal to 1.0.
+    if (alpha == 1 && beta == 1) {
+        // In this case we can avoid expensive multiplications.
+        simsimd_add_bf16_skylake(a, b, n, result);
+        return;
+    }
+    // 2. Just scaling, when one of the weights is equal to zero.
+    else if (alpha == 0 || beta == 0) {
+        // In this case we can avoid half of the load instructions.
+        if (beta == 0) { simsimd_scale_bf16_skylake(a, n, alpha, result); }
+        else { simsimd_scale_bf16_skylake(b, n, beta, result); }
+        return;
+    }
+
+    // The general case.
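+    // Upcast 16 `bf16` values to `f32`, blend them, and round back, masking the tail.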
     __m512 alpha_vec = _mm512_set1_ps(alpha);
     __m512 beta_vec = _mm512_set1_ps(beta);
     __m256i a_bf16_vec, b_bf16_vec, sum_bf16_vec;
     __m512 a_vec, b_vec, a_scaled_vec, sum_vec;
     __mmask16 mask = 0xFFFF;
-
 simsimd_wsum_bf16_skylake_cycle:
     if (n < 16) {
         mask = (__mmask16)_bzhi_u32(0xFFFFFFFF, n);
@@ -905,16 +1471,77 @@ SIMSIMD_PUBLIC void simsimd_fma_bf16_skylake(
 #pragma clang attribute push(__attribute__((target("avx2,avx512f,avx512vl,bmi2,avx512bw,avx512fp16"))), \
                              apply_to = function)
 
+SIMSIMD_PUBLIC void simsimd_add_f16_sapphire(simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n,
+                                             simsimd_f16_t *result) {
+    __mmask32 mask = 0xFFFFFFFF;
+    __m512h a_f16_vec, b_f16_vec;
+    __m512h sum_f16_vec;
+simsimd_add_f16_sapphire_cycle:
+    if (n < 32) {
+        mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, n);
+        a_f16_vec = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, a));
+        b_f16_vec = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, b));
+        n = 0;
+    }
+    else {
+        a_f16_vec = _mm512_loadu_ph(a);
+        b_f16_vec = _mm512_loadu_ph(b);
+        a += 32, b += 32, n -= 32;
+    }
+    sum_f16_vec = _mm512_add_ph(a_f16_vec, b_f16_vec);
+    _mm512_mask_storeu_epi16(result, mask, _mm512_castph_si512(sum_f16_vec));
+    result += 32;
+    if (n) goto simsimd_add_f16_sapphire_cycle;
+}
+
+SIMSIMD_PUBLIC void simsimd_scale_f16_sapphire(simsimd_f16_t const *a, simsimd_size_t n, simsimd_distance_t alpha,
+                                               simsimd_f16_t *result) {
+
+    __mmask32 mask = 0xFFFFFFFF;
+    __m512h alpha_vec = _mm512_set1_ph((_Float16)alpha);
+    __m512h a_f16_vec, sum_f16_vec;
+simsimd_scale_f16_sapphire_cycle:
+    if (n < 32) {
+        mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, n);
+        a_f16_vec = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, a));
+        n = 0;
+    }
+    else {
+        a_f16_vec = _mm512_loadu_ph(a);
+        a += 32, n -= 32;
+    }
+    sum_f16_vec = _mm512_mul_ph(a_f16_vec, alpha_vec);
+    _mm512_mask_storeu_epi16(result, mask, _mm512_castph_si512(sum_f16_vec));
+    result += 32;
+    if (n) goto simsimd_scale_f16_sapphire_cycle;
+}
+
 SIMSIMD_PUBLIC void simsimd_wsum_f16_sapphire(                         //
     simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_size_t n,  //
     simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_f16_t *result) {
+    // There are several special cases we may want to implement:
+    // 1. Simple addition, when both weights are equal to 1.0.
+    if (alpha == 1 && beta == 1) {
+        // In this case we can avoid expensive multiplications.
+        simsimd_add_f16_sapphire(a, b, n, result);
+        return;
+    }
+    // 2. Just scaling, when one of the weights is equal to zero.
+    else if (alpha == 0 || beta == 0) {
+        // In this case we can avoid half of the load instructions.
+        if (beta == 0) { simsimd_scale_f16_sapphire(a, n, alpha, result); }
+        else { simsimd_scale_f16_sapphire(b, n, beta, result); }
+        return;
+    }
+
+    // The general case.
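+    // Blend 32 values per cycle natively in half-precision, masking the tail.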
     __mmask32 mask = 0xFFFFFFFF;
     __m512h alpha_vec = _mm512_set1_ph((_Float16)alpha);
     __m512h beta_vec = _mm512_set1_ph((_Float16)beta);
-    __m512h a_f16_vec, b_f16_vec, c_f16_vec;
+    __m512h a_f16_vec, b_f16_vec;
     __m512h a_scaled_f16_vec, sum_f16_vec;
-
 simsimd_wsum_f16_sapphire_cycle:
     if (n < 32) {
         mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, n);
@@ -943,7 +1570,6 @@ SIMSIMD_PUBLIC void simsimd_fma_f16_sapphire(
     __m512h beta_vec = _mm512_set1_ph((_Float16)beta);
     __m512h a_f16_vec, b_f16_vec, c_f16_vec;
     __m512h ab_f16_vec, ab_scaled_f16_vec, sum_f16_vec;
-
 simsimd_fma_f16_sapphire_cycle:
     if (n < 32) {
         mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, n);
@@ -966,18 +1592,88 @@ SIMSIMD_PUBLIC void simsimd_fma_f16_sapphire(
     if (n) goto simsimd_fma_f16_sapphire_cycle;
 }
 
+SIMSIMD_PUBLIC void simsimd_add_u8_sapphire(simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_size_t n,
+                                            simsimd_u8_t *result) {
+    __mmask64 mask = 0xFFFFFFFFFFFFFFFFull;
+    __m512i a_u8_vec, b_u8_vec, sum_u8_vec;
+simsimd_add_u8_sapphire_cycle:
+    if (n < 64) {
+        mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFFull, n);
+        a_u8_vec = _mm512_maskz_loadu_epi8(mask, a);
+        b_u8_vec = _mm512_maskz_loadu_epi8(mask, b);
+        n = 0;
+    }
+    else {
+        a_u8_vec = _mm512_loadu_epi8(a);
+        b_u8_vec = _mm512_loadu_epi8(b);
+        a += 64, b += 64, n -= 64;
+    }
+    sum_u8_vec = _mm512_adds_epu8(a_u8_vec, b_u8_vec);
+    _mm512_mask_storeu_epi8(result, mask, sum_u8_vec);
+    result += 64;
+    if (n) goto simsimd_add_u8_sapphire_cycle;
+}
+
+SIMSIMD_PUBLIC void simsimd_scale_u8_sapphire(simsimd_u8_t const *a, simsimd_size_t n, simsimd_distance_t alpha,
+                                              simsimd_u8_t *result) {
+    __mmask64 mask = 0xFFFFFFFFFFFFFFFFull;
+    __m512h alpha_vec = _mm512_set1_ph((_Float16)alpha);
+    __m512i a_u8_vec, sum_u8_vec;
+    __m512h a_f16_low_vec, a_f16_high_vec;
+    __m512h sum_f16_low_vec, sum_f16_high_vec;
+    __m512i sum_i16_low_vec, sum_i16_high_vec;
+simsimd_scale_u8_sapphire_cycle:
+    if (n < 64) {
+        mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFFull, n);
+        a_u8_vec = _mm512_maskz_loadu_epi8(mask, a);
+        n = 0;
+    }
+    else {
+        a_u8_vec = _mm512_loadu_epi8(a);
+        a += 64, n -= 64;
+    }
+    // Upcast:
+    a_f16_low_vec = _mm512_cvtepi16_ph(_mm512_unpacklo_epi8(a_u8_vec, _mm512_setzero_si512()));
+    a_f16_high_vec = _mm512_cvtepi16_ph(_mm512_unpackhi_epi8(a_u8_vec, _mm512_setzero_si512()));
+    // Scale:
+    sum_f16_low_vec = _mm512_mul_ph(a_f16_low_vec, alpha_vec);
+    sum_f16_high_vec = _mm512_mul_ph(a_f16_high_vec, alpha_vec);
+    // Downcast:
+    sum_i16_low_vec = _mm512_cvtph_epi16(sum_f16_low_vec);
+    sum_i16_high_vec = _mm512_cvtph_epi16(sum_f16_high_vec);
+    sum_u8_vec = _mm512_packus_epi16(sum_i16_low_vec, sum_i16_high_vec);
+    _mm512_mask_storeu_epi8(result, mask, sum_u8_vec);
+    result += 64;
+    if (n) goto simsimd_scale_u8_sapphire_cycle;
+}
+
 SIMSIMD_PUBLIC void simsimd_wsum_u8_sapphire(                        //
     simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_size_t n,  //
     simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_u8_t *result) {
+    // There are several special cases we may want to implement:
+    // 1. Simple addition, when both weights are equal to 1.0.
+    if (alpha == 1 && beta == 1) {
+        // In this case we can avoid expensive multiplications.
+        simsimd_add_u8_sapphire(a, b, n, result);
+        return;
+    }
+    // 2. Just scaling, when one of the weights is equal to zero.
+    else if (alpha == 0 || beta == 0) {
+        // In this case we can avoid half of the load instructions.
+        if (beta == 0) { simsimd_scale_u8_sapphire(a, n, alpha, result); }
+        else { simsimd_scale_u8_sapphire(b, n, beta, result); }
+        return;
+    }
+
+    // The general case.
     __mmask64 mask = 0xFFFFFFFFFFFFFFFFull;
     __m512h alpha_vec = _mm512_set1_ph((_Float16)alpha);
     __m512h beta_vec = _mm512_set1_ph((_Float16)beta);
-    __m512i a_u8_vec, b_u8_vec, c_u8_vec, sum_u8_vec;
+    __m512i a_u8_vec, b_u8_vec, sum_u8_vec;
     __m512h a_f16_low_vec, a_f16_high_vec, b_f16_low_vec, b_f16_high_vec;
     __m512h a_scaled_f16_low_vec, a_scaled_f16_high_vec, sum_f16_low_vec, sum_f16_high_vec;
     __m512i sum_i16_low_vec, sum_i16_high_vec;
-
 simsimd_wsum_u8_sapphire_cycle:
    if (n < 64) {
        mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFFull, n);
@@ -1010,14 +1706,92 @@ SIMSIMD_PUBLIC void simsimd_wsum_u8_sapphire( //
     if (n) goto simsimd_wsum_u8_sapphire_cycle;
 }
 
+SIMSIMD_PUBLIC void simsimd_add_i8_sapphire(simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_size_t n,
+                                            simsimd_i8_t *result) {
+
+    __mmask64 mask = 0xFFFFFFFFFFFFFFFFull;
+    __m512i a_i8_vec, b_i8_vec, sum_i8_vec;
+
+simsimd_add_i8_sapphire_cycle:
+    if (n < 64) {
+        mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFFull, n);
+        a_i8_vec = _mm512_maskz_loadu_epi8(mask, a);
+        b_i8_vec = _mm512_maskz_loadu_epi8(mask, b);
+        n = 0;
+    }
+    else {
+        a_i8_vec = _mm512_loadu_epi8(a);
+        b_i8_vec = _mm512_loadu_epi8(b);
+        a += 64, b += 64, n -= 64;
+    }
+    sum_i8_vec = _mm512_adds_epi8(a_i8_vec, b_i8_vec);
+    _mm512_mask_storeu_epi8(result, mask, sum_i8_vec);
+    result += 64;
+    if (n) goto simsimd_add_i8_sapphire_cycle;
+}
+
+SIMSIMD_PUBLIC void simsimd_scale_i8_sapphire(simsimd_i8_t const *a, simsimd_size_t n, simsimd_distance_t alpha,
+                                              simsimd_i8_t *result) {
+
+    __mmask64 mask = 0xFFFFFFFFFFFFFFFFull;
+    __m512h alpha_vec = _mm512_set1_ph((_Float16)alpha);
+    __m512i a_i8_vec, sum_i8_vec;
+    __m512h a_f16_low_vec, a_f16_high_vec;
+    __m512h sum_f16_low_vec, sum_f16_high_vec;
+    __m512i sum_i16_low_vec, sum_i16_high_vec;
+simsimd_scale_i8_sapphire_cycle:
+    if (n < 64) {
+        mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFFull, n);
+        a_i8_vec = _mm512_maskz_loadu_epi8(mask, a);
+        n = 0;
+    }
+    else {
+        a_i8_vec = _mm512_loadu_epi8(a);
+        a += 64, n -= 64;
+    }
+    // Upcast:
+    a_f16_low_vec = _mm512_cvtepi16_ph(_mm512_cvtepi8_epi16(_mm512_castsi512_si256(a_i8_vec)));
+    a_f16_high_vec = _mm512_cvtepi16_ph(_mm512_cvtepi8_epi16(_mm512_extracti64x4_epi64(a_i8_vec, 1)));
+    // Scale:
+    sum_f16_low_vec = _mm512_mul_ph(a_f16_low_vec, alpha_vec);
+    sum_f16_high_vec = _mm512_mul_ph(a_f16_high_vec, alpha_vec);
+    // Downcast:
+    sum_i16_low_vec = _mm512_cvtph_epi16(sum_f16_low_vec);
+    sum_i16_high_vec = _mm512_cvtph_epi16(sum_f16_high_vec);
+    sum_i8_vec = _mm512_inserti64x4(_mm512_castsi256_si512(_mm512_cvtsepi16_epi8(sum_i16_low_vec)),
+                                    _mm512_cvtsepi16_epi8(sum_i16_high_vec), 1);
+    _mm512_mask_storeu_epi8(result, mask, sum_i8_vec);
+    result += 64;
+    if (n) goto simsimd_scale_i8_sapphire_cycle;
+}
+
 SIMSIMD_PUBLIC void simsimd_wsum_i8_sapphire(                        //
     simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_size_t n,  //
     simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_i8_t *result) {
+    // There are several special cases we may want to implement:
+    // 1. Simple addition, when both weights are equal to 1.0.
+    if (alpha == 1 && beta == 1) {
+        // In this case we can avoid expensive multiplications.
+        simsimd_add_i8_sapphire(a, b, n, result);
+        return;
+    }
+    // 2. Just scaling, when one of the weights is equal to zero.
+    else if (alpha == 0 || beta == 0) {
+        // In this case we can avoid half of the load instructions.
+        if (beta == 0) { simsimd_scale_i8_sapphire(a, n, alpha, result); }
+        else { simsimd_scale_i8_sapphire(b, n, beta, result); }
+        return;
+    }
+
+    // The general case.
     __mmask64 mask = 0xFFFFFFFFFFFFFFFFull;
     __m512h alpha_vec = _mm512_set1_ph((_Float16)alpha);
     __m512h beta_vec = _mm512_set1_ph((_Float16)beta);
-    __m512i a_i8_vec, b_i8_vec, c_i8_vec, sum_i8_vec;
+    __m512i a_i8_vec, b_i8_vec, sum_i8_vec;
     __m512h a_f16_low_vec, a_f16_high_vec, b_f16_low_vec, b_f16_high_vec;
     __m512h a_scaled_f16_low_vec, a_scaled_f16_high_vec, sum_f16_low_vec, sum_f16_high_vec;
     __m512i sum_i16_low_vec, sum_i16_high_vec;