Skip to content

Commit

Permalink
Add: Preceise calculation mode
Browse files Browse the repository at this point in the history
  • Loading branch information
ashvardanian committed Oct 2, 2023
1 parent 53eecc3 commit b213ffb
Show file tree
Hide file tree
Showing 5 changed files with 119 additions and 103 deletions.
64 changes: 32 additions & 32 deletions bench.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,8 @@ static void measure(bm::State& state, metric_at metric, metric_at baseline) {
state.counters["pairs"] = bm::Counter(iterations, bm::Counter::kIsRate);

double c_baseline = baseline(pair.a, pair.b, pair.dimensions());
double delta = std::abs(c - c_baseline) / std::abs(c_baseline);
if (delta < 0.01)
double delta = std::abs(c - c_baseline);
if (delta < 0.001)
delta = 0;
state.counters["delta"] = delta;
}
Expand Down Expand Up @@ -105,53 +105,53 @@ int main(int argc, char** argv) {
return 1;

#if SIMSIMD_TARGET_ARM_NEON
register_<simsimd_f16_t>("neon_f16_ip", simsimd_neon_f16_ip, simsimd_auto_f16_ip);
register_<simsimd_f16_t>("neon_f16_cos", simsimd_neon_f16_cos, simsimd_auto_f16_cos);
register_<simsimd_f16_t>("neon_f16_l2sq", simsimd_neon_f16_l2sq, simsimd_auto_f16_l2sq);
register_<simsimd_f16_t>("neon_f16_ip", simsimd_neon_f16_ip, simsimd_accurate_f16_ip);
register_<simsimd_f16_t>("neon_f16_cos", simsimd_neon_f16_cos, simsimd_accurate_f16_cos);
register_<simsimd_f16_t>("neon_f16_l2sq", simsimd_neon_f16_l2sq, simsimd_accurate_f16_l2sq);

register_<simsimd_f32_t>("neon_f32_ip", simsimd_neon_f32_ip, simsimd_auto_f32_ip);
register_<simsimd_f32_t>("neon_f32_cos", simsimd_neon_f32_cos, simsimd_auto_f32_cos);
register_<simsimd_f32_t>("neon_f32_l2sq", simsimd_neon_f32_l2sq, simsimd_auto_f32_l2sq);
register_<simsimd_f32_t>("neon_f32_ip", simsimd_neon_f32_ip, simsimd_accurate_f32_ip);
register_<simsimd_f32_t>("neon_f32_cos", simsimd_neon_f32_cos, simsimd_accurate_f32_cos);
register_<simsimd_f32_t>("neon_f32_l2sq", simsimd_neon_f32_l2sq, simsimd_accurate_f32_l2sq);

register_<simsimd_i8_t>("neon_i8_cos", simsimd_neon_i8_cos, simsimd_auto_i8_cos);
register_<simsimd_i8_t>("neon_i8_l2sq", simsimd_neon_i8_l2sq, simsimd_auto_i8_l2sq);
register_<simsimd_i8_t>("neon_i8_cos", simsimd_neon_i8_cos, simsimd_accurate_i8_cos);
register_<simsimd_i8_t>("neon_i8_l2sq", simsimd_neon_i8_l2sq, simsimd_accurate_i8_l2sq);
#endif

#if SIMSIMD_TARGET_ARM_SVE
register_<simsimd_f16_t>("sve_f16_ip", simsimd_sve_f16_ip, simsimd_auto_f16_ip);
register_<simsimd_f16_t>("sve_f16_cos", simsimd_sve_f16_cos, simsimd_auto_f16_cos);
register_<simsimd_f16_t>("sve_f16_l2sq", simsimd_sve_f16_l2sq, simsimd_auto_f16_l2sq);
register_<simsimd_f16_t>("sve_f16_ip", simsimd_sve_f16_ip, simsimd_accurate_f16_ip);
register_<simsimd_f16_t>("sve_f16_cos", simsimd_sve_f16_cos, simsimd_accurate_f16_cos);
register_<simsimd_f16_t>("sve_f16_l2sq", simsimd_sve_f16_l2sq, simsimd_accurate_f16_l2sq);

register_<simsimd_f32_t>("sve_f32_ip", simsimd_sve_f32_ip, simsimd_auto_f32_ip);
register_<simsimd_f32_t>("sve_f32_cos", simsimd_sve_f32_cos, simsimd_auto_f32_cos);
register_<simsimd_f32_t>("sve_f32_l2sq", simsimd_sve_f32_l2sq, simsimd_auto_f32_l2sq);
register_<simsimd_f32_t>("sve_f32_ip", simsimd_sve_f32_ip, simsimd_accurate_f32_ip);
register_<simsimd_f32_t>("sve_f32_cos", simsimd_sve_f32_cos, simsimd_accurate_f32_cos);
register_<simsimd_f32_t>("sve_f32_l2sq", simsimd_sve_f32_l2sq, simsimd_accurate_f32_l2sq);
#endif

#if SIMSIMD_TARGET_X86_AVX2
register_<simsimd_f16_t>("avx2_f16_ip", simsimd_avx2_f16_ip, simsimd_auto_f16_ip);
register_<simsimd_f16_t>("avx2_f16_cos", simsimd_avx2_f16_cos, simsimd_auto_f16_cos);
register_<simsimd_f16_t>("avx2_f16_l2sq", simsimd_avx2_f16_l2sq, simsimd_auto_f16_l2sq);
register_<simsimd_f16_t>("avx2_f16_ip", simsimd_avx2_f16_ip, simsimd_accurate_f16_ip);
register_<simsimd_f16_t>("avx2_f16_cos", simsimd_avx2_f16_cos, simsimd_accurate_f16_cos);
register_<simsimd_f16_t>("avx2_f16_l2sq", simsimd_avx2_f16_l2sq, simsimd_accurate_f16_l2sq);

register_<simsimd_i8_t>("avx2_i8_cos", simsimd_avx2_i8_cos, simsimd_auto_i8_cos);
register_<simsimd_i8_t>("avx2_i8_l2sq", simsimd_avx2_i8_l2sq, simsimd_auto_i8_l2sq);
register_<simsimd_i8_t>("avx2_i8_cos", simsimd_avx2_i8_cos, simsimd_accurate_i8_cos);
register_<simsimd_i8_t>("avx2_i8_l2sq", simsimd_avx2_i8_l2sq, simsimd_accurate_i8_l2sq);
#endif

#if SIMSIMD_TARGET_X86_AVX512
register_<simsimd_f16_t>("avx512_f16_ip", simsimd_avx512_f16_ip, simsimd_auto_f16_ip);
register_<simsimd_f16_t>("avx512_f16_cos", simsimd_avx512_f16_cos, simsimd_auto_f16_cos);
register_<simsimd_f16_t>("avx512_f16_l2sq", simsimd_avx512_f16_l2sq, simsimd_auto_f16_l2sq);
register_<simsimd_f16_t>("avx512_f16_ip", simsimd_avx512_f16_ip, simsimd_accurate_f16_ip);
register_<simsimd_f16_t>("avx512_f16_cos", simsimd_avx512_f16_cos, simsimd_accurate_f16_cos);
register_<simsimd_f16_t>("avx512_f16_l2sq", simsimd_avx512_f16_l2sq, simsimd_accurate_f16_l2sq);
#endif

register_<simsimd_f16_t>("auto_f16_ip", simsimd_auto_f16_ip, simsimd_auto_f16_ip);
register_<simsimd_f16_t>("auto_f16_cos", simsimd_auto_f16_cos, simsimd_auto_f16_cos);
register_<simsimd_f16_t>("auto_f16_l2sq", simsimd_auto_f16_l2sq, simsimd_auto_f16_l2sq);
register_<simsimd_f16_t>("auto_f16_ip", simsimd_auto_f16_ip, simsimd_accurate_f16_ip);
register_<simsimd_f16_t>("auto_f16_cos", simsimd_auto_f16_cos, simsimd_accurate_f16_cos);
register_<simsimd_f16_t>("auto_f16_l2sq", simsimd_auto_f16_l2sq, simsimd_accurate_f16_l2sq);

register_<simsimd_f32_t>("auto_f32_ip", simsimd_auto_f32_ip, simsimd_auto_f32_ip);
register_<simsimd_f32_t>("auto_f32_cos", simsimd_auto_f32_cos, simsimd_auto_f32_cos);
register_<simsimd_f32_t>("auto_f32_l2sq", simsimd_auto_f32_l2sq, simsimd_auto_f32_l2sq);
register_<simsimd_f32_t>("auto_f32_ip", simsimd_auto_f32_ip, simsimd_accurate_f32_ip);
register_<simsimd_f32_t>("auto_f32_cos", simsimd_auto_f32_cos, simsimd_accurate_f32_cos);
register_<simsimd_f32_t>("auto_f32_l2sq", simsimd_auto_f32_l2sq, simsimd_accurate_f32_l2sq);

register_<simsimd_i8_t>("auto_i8_cos", simsimd_auto_i8_cos, simsimd_auto_i8_cos);
register_<simsimd_i8_t>("auto_i8_l2sq", simsimd_auto_i8_l2sq, simsimd_auto_i8_l2sq);
register_<simsimd_i8_t>("auto_i8_cos", simsimd_auto_i8_cos, simsimd_accurate_i8_cos);
register_<simsimd_i8_t>("auto_i8_l2sq", simsimd_auto_i8_l2sq, simsimd_accurate_i8_l2sq);

bm::RunSpecifiedBenchmarks();
bm::Shutdown();
Expand Down
67 changes: 41 additions & 26 deletions include/simsimd/autovec.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,59 +4,74 @@
extern "C" {
#endif

#define SIMSIMD_AUTO_L2SQ(type) \
inline static simsimd_f32_t simsimd_auto_##type##_l2sq(simsimd_##type##_t const* a, simsimd_##type##_t const* b, \
simsimd_size_t d) { \
simsimd_f32_t d2 = 0; \
#define SIMSIMD_AUTO_L2SQ(name, input_type, accumulator_type) \
inline static simsimd_f32_t simsimd_##name##_##input_type##_l2sq( \
simsimd_##input_type##_t const* a, simsimd_##input_type##_t const* b, simsimd_size_t d) { \
simsimd_##accumulator_type##_t d2 = 0; \
for (simsimd_size_t i = 0; i != d; ++i) { \
simsimd_f32_t ai = a[i]; \
simsimd_f32_t bi = b[i]; \
simsimd_##accumulator_type##_t ai = a[i]; \
simsimd_##accumulator_type##_t bi = b[i]; \
d2 += (ai - bi) * (ai - bi); \
} \
return d2; \
}

#define SIMSIMD_AUTO_IP(type) \
inline static simsimd_f32_t simsimd_auto_##type##_ip(simsimd_##type##_t const* a, simsimd_##type##_t const* b, \
simsimd_size_t d) { \
simsimd_f32_t ab = 0; \
#define SIMSIMD_AUTO_IP(name, input_type, accumulator_type) \
inline static simsimd_f32_t simsimd_##name##_##input_type##_ip( \
simsimd_##input_type##_t const* a, simsimd_##input_type##_t const* b, simsimd_size_t d) { \
simsimd_##accumulator_type##_t ab = 0; \
for (simsimd_size_t i = 0; i != d; ++i) { \
simsimd_f32_t ai = a[i]; \
simsimd_f32_t bi = b[i]; \
simsimd_##accumulator_type##_t ai = a[i]; \
simsimd_##accumulator_type##_t bi = b[i]; \
ab += ai * bi; \
} \
return 1 - ab; \
}

#define SIMSIMD_AUTO_COS(type) \
inline static simsimd_f32_t simsimd_auto_##type##_cos(simsimd_##type##_t const* a, simsimd_##type##_t const* b, \
simsimd_size_t d) { \
simsimd_f32_t ab = 0, a2 = 0, b2 = 0; \
#define SIMSIMD_AUTO_COS(name, input_type, accumulator_type) \
inline static simsimd_f32_t simsimd_##name##_##input_type##_cos( \
simsimd_##input_type##_t const* a, simsimd_##input_type##_t const* b, simsimd_size_t d) { \
simsimd_##accumulator_type##_t ab = 0, a2 = 0, b2 = 0; \
for (simsimd_size_t i = 0; i != d; ++i) { \
simsimd_f32_t ai = a[i]; \
simsimd_f32_t bi = b[i]; \
simsimd_##accumulator_type##_t ai = a[i]; \
simsimd_##accumulator_type##_t bi = b[i]; \
ab += ai * bi; \
a2 += ai * ai; \
b2 += bi * bi; \
} \
return 1 - ab * simsimd_approximate_inverse_square_root(a2 * b2); \
}

SIMSIMD_AUTO_L2SQ(f32)
SIMSIMD_AUTO_IP(f32)
SIMSIMD_AUTO_COS(f32)
SIMSIMD_AUTO_L2SQ(auto, f32, f32)
SIMSIMD_AUTO_IP(auto, f32, f32)
SIMSIMD_AUTO_COS(auto, f32, f32)

SIMSIMD_AUTO_L2SQ(f16)
SIMSIMD_AUTO_IP(f16)
SIMSIMD_AUTO_COS(f16)
SIMSIMD_AUTO_L2SQ(auto, f16, f32)
SIMSIMD_AUTO_IP(auto, f16, f32)
SIMSIMD_AUTO_COS(auto, f16, f32)

SIMSIMD_AUTO_L2SQ(i8)
SIMSIMD_AUTO_COS(i8)
SIMSIMD_AUTO_L2SQ(auto, i8, f32)
SIMSIMD_AUTO_COS(auto, i8, f32)

inline static simsimd_f32_t simsimd_auto_i8_ip(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t d) {
return simsimd_auto_i8_cos(a, b, d);
}

SIMSIMD_AUTO_L2SQ(accurate, f32, f64)
SIMSIMD_AUTO_IP(accurate, f32, f64)
SIMSIMD_AUTO_COS(accurate, f32, f64)

SIMSIMD_AUTO_L2SQ(accurate, f16, f64)
SIMSIMD_AUTO_IP(accurate, f16, f64)
SIMSIMD_AUTO_COS(accurate, f16, f64)

SIMSIMD_AUTO_L2SQ(accurate, i8, f64)
SIMSIMD_AUTO_COS(accurate, i8, f64)

inline static simsimd_f32_t simsimd_accurate_i8_ip(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t d) {
return simsimd_accurate_i8_cos(a, b, d);
}

#ifdef __cplusplus
} // extern "C"
#endif
3 changes: 2 additions & 1 deletion include/simsimd/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@ typedef union {
* @brief Computes `1/sqrt(x)` using the trick from Quake 3.
*/
inline static float simsimd_approximate_inverse_square_root(float number) {
simsimd_f32i32_t conv = {.f = number};
simsimd_f32i32_t conv;
conv.f = number;
conv.i = 0x5f3759df - (conv.i >> 1);
conv.f *= 1.5F - (number * 0.5F * conv.f * conv.f);
return conv.f;
Expand Down
82 changes: 41 additions & 41 deletions include/simsimd/x86_avx2_f16.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,81 +16,81 @@
extern "C" {
#endif

inline static simsimd_f32_t simsimd_avx2_f16_l2sq(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t d) {
__m256 sum_vec = _mm256_set1_ps(0);
inline static simsimd_f32_t simsimd_avx2_f16_l2sq(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n) {
__m256 d2_vec = _mm256_set1_ps(0);
simsimd_size_t i = 0;
for (; i + 8 <= d; i += 8) {
for (; i + 8 <= n; i += 8) {
__m256 a_vec = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)(a + i)));
__m256 b_vec = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)(b + i)));
__m256 diff_vec = _mm256_sub_ps(a_vec, b_vec);
sum_vec = _mm256_fmadd_ps(diff_vec, diff_vec, sum_vec);
__m256 d_vec = _mm256_sub_ps(a_vec, b_vec);
d2_vec = _mm256_fmadd_ps(d_vec, d_vec, d2_vec);
}
sum_vec = _mm256_add_ps(_mm256_permute2f128_ps(sum_vec, sum_vec, 0x81), sum_vec);
sum_vec = _mm256_hadd_ps(sum_vec, sum_vec);
sum_vec = _mm256_hadd_ps(sum_vec, sum_vec);

d2_vec = _mm256_add_ps(_mm256_permute2f128_ps(d2_vec, d2_vec, 1), d2_vec);
d2_vec = _mm256_hadd_ps(d2_vec, d2_vec);
d2_vec = _mm256_hadd_ps(d2_vec, d2_vec);

simsimd_f32_t result[1];
_mm_store_ss(result, _mm256_castps256_ps128(sum_vec));
_mm_store_ss(result, _mm256_castps256_ps128(d2_vec));

// Accumulate the tail:
for (; i < d; ++i) {
simsimd_f32_t diff = a[i] - b[i];
result[0] += diff * diff;
}
for (; i < n; ++i)
result[0] += (a[i] - b[i]) * (a[i] - b[i]);
return result[0];
}

inline static simsimd_f32_t simsimd_avx2_f16_ip(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t d) {
__m256 sum_vec = _mm256_set1_ps(0);
inline static simsimd_f32_t simsimd_avx2_f16_ip(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n) {
__m256 ab_vec = _mm256_set1_ps(0);
simsimd_size_t i = 0;
for (; i + 8 <= d; i += 8) {
for (; i + 8 <= n; i += 8) {
__m256 a_vec = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)(a + i)));
__m256 b_vec = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)(b + i)));
sum_vec = _mm256_fmadd_ps(a_vec, b_vec, sum_vec);
ab_vec = _mm256_fmadd_ps(a_vec, b_vec, ab_vec);
}

sum_vec = _mm256_add_ps(_mm256_permute2f128_ps(sum_vec, sum_vec, 0x81), sum_vec);
sum_vec = _mm256_hadd_ps(sum_vec, sum_vec);
sum_vec = _mm256_hadd_ps(sum_vec, sum_vec);
ab_vec = _mm256_add_ps(_mm256_permute2f128_ps(ab_vec, ab_vec, 1), ab_vec);
ab_vec = _mm256_hadd_ps(ab_vec, ab_vec);
ab_vec = _mm256_hadd_ps(ab_vec, ab_vec);

simsimd_f32_t result[1];
_mm_store_ss(result, _mm256_castps256_ps128(sum_vec));
_mm_store_ss(result, _mm256_castps256_ps128(ab_vec));

// Accumulate the tail:
for (; i < d; ++i)
for (; i < n; ++i)
result[0] += a[i] * b[i];
return result[0];
}

inline static simsimd_f32_t simsimd_avx2_f16_cos(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t d) {
__m256 sum_ab = _mm256_set1_ps(0), sum_a2 = _mm256_set1_ps(0), sum_b2 = _mm256_set1_ps(0);
inline static simsimd_f32_t simsimd_avx2_f16_cos(simsimd_f16_t const* a, simsimd_f16_t const* b, simsimd_size_t n) {
__m256 ab_vec = _mm256_set1_ps(0), a2_vec = _mm256_set1_ps(0), b2_vec = _mm256_set1_ps(0);
simsimd_size_t i = 0;
for (; i + 8 <= d; i += 8) {
for (; i + 8 <= n; i += 8) {
__m256 a_vec = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)(a + i)));
__m256 b_vec = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)(b + i)));
sum_ab = _mm256_fmadd_ps(a_vec, b_vec, sum_ab);
sum_a2 = _mm256_fmadd_ps(a_vec, a_vec, sum_a2);
sum_b2 = _mm256_fmadd_ps(b_vec, b_vec, sum_b2);
ab_vec = _mm256_fmadd_ps(a_vec, b_vec, ab_vec);
a2_vec = _mm256_fmadd_ps(a_vec, a_vec, a2_vec);
b2_vec = _mm256_fmadd_ps(b_vec, b_vec, b2_vec);
}

sum_ab = _mm256_add_ps(_mm256_permute2f128_ps(sum_ab, sum_ab, 0x81), sum_ab);
sum_ab = _mm256_hadd_ps(sum_ab, sum_ab);
sum_ab = _mm256_hadd_ps(sum_ab, sum_ab);
ab_vec = _mm256_add_ps(_mm256_permute2f128_ps(ab_vec, ab_vec, 1), ab_vec);
ab_vec = _mm256_hadd_ps(ab_vec, ab_vec);
ab_vec = _mm256_hadd_ps(ab_vec, ab_vec);

sum_a2 = _mm256_add_ps(_mm256_permute2f128_ps(sum_a2, sum_a2, 0x81), sum_a2);
sum_a2 = _mm256_hadd_ps(sum_a2, sum_a2);
sum_a2 = _mm256_hadd_ps(sum_a2, sum_a2);
a2_vec = _mm256_add_ps(_mm256_permute2f128_ps(a2_vec, a2_vec, 1), a2_vec);
a2_vec = _mm256_hadd_ps(a2_vec, a2_vec);
a2_vec = _mm256_hadd_ps(a2_vec, a2_vec);

sum_b2 = _mm256_add_ps(_mm256_permute2f128_ps(sum_b2, sum_b2, 0x81), sum_b2);
sum_b2 = _mm256_hadd_ps(sum_b2, sum_b2);
sum_b2 = _mm256_hadd_ps(sum_b2, sum_b2);
b2_vec = _mm256_add_ps(_mm256_permute2f128_ps(b2_vec, b2_vec, 1), b2_vec);
b2_vec = _mm256_hadd_ps(b2_vec, b2_vec);
b2_vec = _mm256_hadd_ps(b2_vec, b2_vec);

simsimd_f32_t result[3];
_mm_store_ss(result, _mm256_castps256_ps128(sum_ab));
_mm_store_ss(result + 1, _mm256_castps256_ps128(sum_a2));
_mm_store_ss(result + 2, _mm256_castps256_ps128(sum_b2));
_mm_store_ss(result, _mm256_castps256_ps128(ab_vec));
_mm_store_ss(result + 1, _mm256_castps256_ps128(a2_vec));
_mm_store_ss(result + 2, _mm256_castps256_ps128(b2_vec));

// Accumulate the tail:
for (; i < d; ++i)
for (; i < n; ++i)
result[0] += a[i] * b[i], result[1] += a[i] * a[i], result[2] += b[i] * b[i];
return result[0] * simsimd_approximate_inverse_square_root(result[1] * result[2]);
}
Expand Down
Loading

0 comments on commit b213ffb

Please sign in to comment.