diff --git a/include/simsimd/simsimd.h b/include/simsimd/simsimd.h index 9d3eebc3..ced61c84 100644 --- a/include/simsimd/simsimd.h +++ b/include/simsimd/simsimd.h @@ -102,14 +102,15 @@ #define SIMSIMD_DYNAMIC_DISPATCH (0) // true or false #endif -#include "binary.h" // Hamming, Jaccard -#include "curved.h" // Mahalanobis, Bilinear Forms -#include "dot.h" // Inner (dot) product, and its conjugate -#include "elementwise.h" // Weighted Sum, Fused-Multiply-Add -#include "geospatial.h" // Haversine and Vincenty -#include "probability.h" // Kullback-Leibler, Jensen–Shannon -#include "sparse.h" // Intersect -#include "spatial.h" // L2, Angular distances +#include "binary.h" // Hamming, Jaccard +#include "curved.h" // Mahalanobis, Bilinear Forms +#include "dot.h" // Inner (dot) product, and its conjugate +#include "elementwise.h" // Weighted Sum, Fused-Multiply-Add +#include "geospatial.h" // Haversine and Vincenty +#include "probability.h" // Kullback-Leibler, Jensen–Shannon +#include "sparse.h" // Intersect +#include "spatial.h" // L2, Angular distances +#include "trigonometry.h" // Sine, Cosine // On Apple Silicon, `mrs` is not allowed in user-space, so we need to use the `sysctl` API. #if defined(_SIMSIMD_DEFINED_APPLE) @@ -128,11 +129,11 @@ typedef enum { simsimd_kernel_unknown_k = 0, ///< Unknown kernel // Classics: - simsimd_dot_k = 'i', ///< Inner product - simsimd_vdot_k = 'v', ///< Complex inner product - simsimd_cos_k = 'c', ///< Cosine similarity - simsimd_l2_k = '2', ///< Euclidean distance alias - simsimd_l2sq_k = 'e', ///< Squared Euclidean distance + simsimd_dot_k = 'i', ///< Inner product + simsimd_vdot_k = 'v', ///< Complex inner product + simsimd_angular_k = 'a', ///< Angular (cosine) similarity + simsimd_l2_k = 'e', ///< Euclidean distance alias + simsimd_l2sq_k = '2', ///< Squared Euclidean distance // Binary: simsimd_hamming_k = 'h', ///< Hamming distance @@ -159,11 +160,9 @@ typedef enum { simsimd_fma_k = 'f', ///< Fused Multiply-Add // Aliases: - simsimd_cosine_k = 'c', ///< Cosine similarity alias - simsimd_angular_k = 'c', ///< Cosine similarity alias simsimd_inner_k = 'i', ///< Inner product alias - simsimd_euclidean_k = '2', ///< Euclidean distance alias - simsimd_sqeuclidean_k = 'e', ///< Squared Euclidean distance alias + simsimd_euclidean_k = 'e', ///< Euclidean distance alias + simsimd_sqeuclidean_k = '2', ///< Squared Euclidean distance alias simsimd_manhattan_k = 'h', ///< Manhattan distance is same as Hamming simsimd_tanimoto_k = 'j', ///< Tanimoto coefficient is same as Jaccard simsimd_kullback_leibler_k = 'k', ///< Kullback-Leibler divergence alias diff --git a/include/simsimd/trigonometry.h b/include/simsimd/trigonometry.h new file mode 100644 index 00000000..1c5b1260 --- /dev/null +++ b/include/simsimd/trigonometry.h @@ -0,0 +1,363 @@ +/** + * @file trigonometry.h + * @brief SIMD-accelerated trigonometric element-wise oeprations. + * @author Ash Vardanian + * @date July 1, 2023 + * + * Contains: + * - Sine and Cosine approximations: fast for `f32` vs accurate for `f64` + * + * For datatypes: + * - 32-bit IEEE-754 floating point + * - 64-bit IEEE-754 floating point + * + * For hardware architectures: + * - Arm: NEON + * - x86: Haswell, Skylake + * + * Those functions partially complement the `elementwise.h` module, and are necessary for + * the `geospatial.h` module, among others. Both Haversine and Vincenty's formulas require + * trigonometric functions, and those are the most expensive part of the computation. + * + * @section GLibC IEEE-754-compliant Math Functions + * + * The GNU C Library (GLibC) provides a set of IEEE-754-compliant math functions, like `sinf`, `cosf`, + * and double-precsion variants `sin`, `cos`. Those functions are accurate to ~0.55 ULP (units in the + * last place), but can be slow to evaluate. They use a combination of techniques, like: + * + * - Taylor series expansions for small values. + * - Table lookups combined with corrections for moderate values. + * - Accurate modulo reduction for large values. + * + * The precomputed tables may be the hardest part to accelerate with SIMD, as they contain 440x values, + * each 64-bit wide. + * + * https://github.com/lattera/glibc/blob/895ef79e04a953cac1493863bcae29ad85657ee1/sysdeps/ieee754/dbl-64/branred.c#L54 + * https://github.com/lattera/glibc/blob/895ef79e04a953cac1493863bcae29ad85657ee1/sysdeps/ieee754/dbl-64/s_sin.c#L84 + * + * @section Approximation Algorithms + * + * There are several ways to approximate trigonometric functions, and the choice depends on the + * target hardware and the desired precision. Notably: + * + * - Taylor Series approximation is a series expansion of a sum of its derivatives at a target point. + * It's easy to derive for differentiable functions, works well for functions smooth around the + * expsansion point, but can perform poorly for functions with singularities or high-frequency + * oscillations. + * + * - Pade approximations are rational functions that approximate a function by a ratio of polynomials. + * It often converges faster than Taylor for functions with singularities or steep changes, provides + * good approximations for both smooth and rational functions, but can be more computationally + * intensive to evaluate, and can have holes (undefined points). + * + * Moreover, most approximations can be combined with Horner's methods of evaluating polynomials + * to reduce the number of multiplications and additions, and to improve the numerical stability. + * In trigonometry, the Payne-Hanek Range Reduction is another technique used to reduce the argument + * to a smaller range, where the approximation is more accurate. + * + * x86 intrinsics: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/ + * Arm intrinsics: https://developer.arm.com/architectures/instruction-sets/intrinsics/ + */ +#ifndef SIMSIMD_TRIGONOMETRY_H +#define SIMSIMD_TRIGONOMETRY_H + +#include "types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* Serial backends for all numeric types. + * By default they use 32-bit arithmetic, unless the arguments themselves contain 64-bit floats. + * For double-precision computation check out the "*_accurate" variants of those "*_serial" functions. + */ +SIMSIMD_PUBLIC void simsimd_sin_f64_serial(simsimd_f64_t const *ins, simsimd_size_t n, simsimd_f64_t const *outs); +SIMSIMD_PUBLIC void simsimd_cos_f64_serial(simsimd_f64_t const *ins, simsimd_size_t n, simsimd_f64_t const *outs); +SIMSIMD_PUBLIC void simsimd_sin_f32_serial(simsimd_f32_t const *ins, simsimd_size_t n, simsimd_f32_t const *outs); +SIMSIMD_PUBLIC void simsimd_cos_f32_serial(simsimd_f32_t const *ins, simsimd_size_t n, simsimd_f32_t const *outs); + +SIMSIMD_PUBLIC void simsimd_sin_f64_neon(simsimd_f64_t const *ins, simsimd_size_t n, simsimd_f64_t const *outs); +SIMSIMD_PUBLIC void simsimd_cos_f64_neon(simsimd_f64_t const *ins, simsimd_size_t n, simsimd_f64_t const *outs); +SIMSIMD_PUBLIC void simsimd_sin_f32_neon(simsimd_f32_t const *ins, simsimd_size_t n, simsimd_f32_t const *outs); +SIMSIMD_PUBLIC void simsimd_cos_f32_neon(simsimd_f32_t const *ins, simsimd_size_t n, simsimd_f32_t const *outs); + +/* SIMD-powered backends for AVX2 CPUs of Haswell generation and newer, using 32-bit arithmetic over 256-bit words. + * First demonstrated in 2011, at least one Haswell-based processor was still being sold in 2022 — the Pentium G3420. + * Practically all modern x86 CPUs support AVX2, FMA, and F16C, making it a perfect baseline for SIMD algorithms. + * On other hand, there is no need to implement AVX2 versions of `f32` and `f64` functions, as those are + * properly vectorized by recent compilers. + */ +SIMSIMD_PUBLIC void simsimd_sin_f64_haswell(simsimd_f64_t const *ins, simsimd_size_t n, simsimd_f64_t const *outs); +SIMSIMD_PUBLIC void simsimd_cos_f64_haswell(simsimd_f64_t const *ins, simsimd_size_t n, simsimd_f64_t const *outs); +SIMSIMD_PUBLIC void simsimd_sin_f32_haswell(simsimd_f32_t const *ins, simsimd_size_t n, simsimd_f32_t const *outs); +SIMSIMD_PUBLIC void simsimd_cos_f32_haswell(simsimd_f32_t const *ins, simsimd_size_t n, simsimd_f32_t const *outs); + +/* SIMD-powered backends for various generations of AVX512 CPUs. + * Skylake is handy, as it supports masked loads and other operations, avoiding the need for the tail loop. + */ +SIMSIMD_PUBLIC void simsimd_sin_f64_skylake(simsimd_f64_t const *ins, simsimd_size_t n, simsimd_f64_t const *outs); +SIMSIMD_PUBLIC void simsimd_cos_f64_skylake(simsimd_f64_t const *ins, simsimd_size_t n, simsimd_f64_t const *outs); +SIMSIMD_PUBLIC void simsimd_sin_f32_skylake(simsimd_f32_t const *ins, simsimd_size_t n, simsimd_f32_t const *outs); +SIMSIMD_PUBLIC void simsimd_cos_f32_skylake(simsimd_f32_t const *ins, simsimd_size_t n, simsimd_f32_t const *outs); + +/** + * @brief Computes an approximate sine of the given angle in radians with 3 ULP error bound. + * @see xfastsinf_u3500 in SLEEF library + * @param angle The input angle in radians. + * @return The approximate sine of the input angle. + */ +SIMSIMD_PUBLIC simsimd_f32_t simsimd_f32_sin(simsimd_f32_t angle) { + // Variables + int multiple_of_pi; // The integer multiple of π in the input angle + simsimd_f32_t result; // The final result of the sine computation + simsimd_f32_t angle_squared; // Square of the reduced angle + + // Constants for argument reduction + simsimd_f32_t const pi_reciprocal = 0.31830988618379067154f; // 1/π + simsimd_f32_t const pi = 3.14159265358979323846f; // π + + // Compute multiple_of_pi = round(angle / π) + simsimd_f32_t quotient = angle * pi_reciprocal; + if (quotient >= 0.0f) { multiple_of_pi = (int)(quotient + 0.5f); } + else { multiple_of_pi = (int)(quotient - 0.5f); } + + // Reduce the angle: angle = angle - (multiple_of_pi * π) + angle = angle - multiple_of_pi * pi; + + // Compute the square of the reduced angle + angle_squared = angle * angle; + + // Polynomial coefficients for sine approximation (minimax polynomial) + simsimd_f32_t const coeff_5 = -0.0001881748176f; // Coefficient for x^5 term + simsimd_f32_t const coeff_3 = 0.008323502727f; // Coefficient for x^3 term + simsimd_f32_t const coeff_1 = -0.1666651368f; // Coefficient for x term + + // Compute the polynomial approximation + simsimd_f32_t polynomial = coeff_5; + polynomial = polynomial * angle_squared + coeff_3; // polynomial = (coeff_5 * x^2) + coeff_3 + polynomial = polynomial * angle_squared + coeff_1; // polynomial = polynomial * x^2 + coeff_1 + + // Compute the final result: sine approximation + result = ((angle_squared * angle) * polynomial) + angle; // result = (x^3 * polynomial) + x + + // If multiple_of_pi is odd, flip the sign of the result + if ((multiple_of_pi & 1) != 0) { result = -result; } + + return result; +} + +/** + * @brief Computes an approximate cosine of the given angle in radians with 3 ULP error bound. + * @see xfastcosf_u3500 in SLEEF library + * @param angle The input angle in radians. + * @return The approximate cosine of the input angle. + */ +SIMSIMD_PUBLIC simsimd_f32_t simsimd_f32_cos(simsimd_f32_t angle) { + // Variables + int multiple_of_pi; // The integer multiple of π in the input angle + simsimd_f32_t result; // The final result of the cosine computation + simsimd_f32_t angle_squared; // Square of the reduced angle + simsimd_f32_t reduced_angle; // The angle reduced to the primary interval + + // Constants for argument reduction + simsimd_f32_t const pi_reciprocal = 0.31830988618379067154f; // 1/π + simsimd_f32_t const pi = 3.14159265358979323846f; // π + + // Compute multiple_of_pi = round(angle * (1/π) - 0.5) + simsimd_f32_t quotient = angle * pi_reciprocal - 0.5f; + if (quotient >= 0.0f) { multiple_of_pi = (int)(quotient + 0.5f); } + else { multiple_of_pi = (int)(quotient - 0.5f); } + + // Reduce the angle: angle = angle - (multiple_of_pi + 0.5) * π + reduced_angle = angle - (multiple_of_pi + 0.5f) * pi; + + // Compute the square of the reduced angle + angle_squared = reduced_angle * reduced_angle; + + // Polynomial coefficients for cosine approximation (minimax polynomial) + simsimd_f32_t const coeff_5 = -0.0001881748176f; // Coefficient for x^5 term + simsimd_f32_t const coeff_3 = 0.008323502727f; // Coefficient for x^3 term + simsimd_f32_t const coeff_1 = -0.1666651368f; // Coefficient for x^1 term + + // Compute the polynomial approximation + simsimd_f32_t polynomial = coeff_5; + polynomial = polynomial * angle_squared + coeff_3; // polynomial = (coeff_5 * x^2) + coeff_3 + polynomial = polynomial * angle_squared + coeff_1; // polynomial = polynomial * x^2 + coeff_1 + + // Compute the final result: cosine approximation + result = ((angle_squared * reduced_angle) * polynomial) + reduced_angle; // result = (x^3 * polynomial) + x + + // If multiple_of_pi is even, flip the sign of the result + if ((multiple_of_pi & 1) == 0) { result = -result; } + + return result; +} + +/** + * @brief Computes an approximate cosine of the given angle in radians with 0 ULP error bound. + * @see `xsin` in SLEEF library + * @param angle The input angle in radians. + * @return The approximate cosine of the input angle. + */ +SIMSIMD_PUBLIC simsimd_f64_t simsimd_f64_sin(simsimd_f64_t angle) { + // Constants for bit manipulation + simsimd_i64_t const negative_zero = 0x8000000000000000LL; // Hexadecimal value of -0.0 in IEEE 754 + + // Union for bit manipulation between simsimd_f64_t and simsimd_i64_t + union { + simsimd_f64_t f64; + simsimd_i64_t i64; + } converter; + + // Preserve the original angle for special case handling (negative zero) + simsimd_f64_t original_angle = angle; + + // Constants for argument reduction + simsimd_f64_t const pi_reciprocal = 0.318309886183790671537767526745028724; // 1/π + simsimd_f64_t const pi_high = 3.141592653589793116; // High-precision part of π + simsimd_f64_t const pi_low = 1.2246467991473532072e-16; // Low-precision part of π + + // Compute multiple_of_pi = round(angle / π) + simsimd_f64_t quotient = angle * pi_reciprocal; + int multiple_of_pi = (int)(quotient < 0 ? quotient - 0.5 : quotient + 0.5); + + // Reduce the angle: angle = angle - (multiple_of_pi * π) + angle = (multiple_of_pi * -pi_high) + angle; + angle = (multiple_of_pi * -pi_low) + angle; + + // Compute the square of the reduced argument + simsimd_f64_t argument_square = angle * angle; + + // Adjust the sign of the angle if multiple_of_pi is odd + if ((multiple_of_pi & 1) != 0) { angle = -angle; } + + // Compute higher powers of the argument + simsimd_f64_t argument_power_4 = argument_square * argument_square; // angle^4 + simsimd_f64_t argument_power_8 = argument_power_4 * argument_power_4; // angle^8 + + // Polynomial coefficients for sine approximation (minimax polynomial) + simsimd_f64_t const coeff_0 = 0.00833333333333332974823815; + simsimd_f64_t const coeff_1 = -0.000198412698412696162806809; + simsimd_f64_t const coeff_2 = 2.75573192239198747630416e-06; + simsimd_f64_t const coeff_3 = -2.50521083763502045810755e-08; + simsimd_f64_t const coeff_4 = 1.60590430605664501629054e-10; + simsimd_f64_t const coeff_5 = -7.64712219118158833288484e-13; + simsimd_f64_t const coeff_6 = 2.81009972710863200091251e-15; + simsimd_f64_t const coeff_7 = -7.97255955009037868891952e-18; + simsimd_f64_t const coeff_8 = -0.166666666666666657414808; + + // Compute higher-degree polynomial terms + simsimd_f64_t temp1 = (argument_square * coeff_7) + coeff_6; // temp1 = s * c7 + c6 + simsimd_f64_t temp2 = (argument_square * coeff_5) + coeff_4; // temp2 = s * c5 + c4 + simsimd_f64_t poly_high = (argument_power_4 * temp1) + temp2; // poly_high = s^4 * temp1 + temp2 + + // Compute lower-degree polynomial terms + simsimd_f64_t temp3 = (argument_square * coeff_3) + coeff_2; // temp3 = s * c3 + c2 + simsimd_f64_t temp4 = (argument_square * coeff_1) + coeff_0; // temp4 = s * c1 + c0 + simsimd_f64_t poly_low = (argument_power_4 * temp3) + temp4; // poly_low = s^4 * temp3 + temp4 + + // Combine polynomial terms + simsimd_f64_t result = (argument_power_8 * poly_high) + poly_low; // result = s^8 * poly_high + poly_low + result = (result * argument_square) + coeff_8; // result = result * s + c8 + result = (argument_square * (result * angle)) + angle; // result = s * (result * angle) + angle + + // Handle the special case of negative zero input + converter.f64 = original_angle; + if (converter.i64 == negative_zero) { result = original_angle; } + + return result; +} + +/** + * @brief Computes an approximate cosine of the given angle in radians with 0 ULP error bound. + * @see `xcos` in SLEEF library + * @param angle The input angle in radians. + * @return The approximate cosine of the input angle. + */ +SIMSIMD_PUBLIC simsimd_f64_t simsimd_f64_cos(simsimd_f64_t angle) { + // Constants for bit manipulation + simsimd_i64_t const negative_zero = 0x8000000000000000LL; // Hexadecimal value of -0.0 in IEEE 754 + + // Union for bit manipulation between simsimd_f64_t and simsimd_i64_t + union { + simsimd_f64_t f64; + simsimd_i64_t i64; + } converter; + + // Variables + simsimd_f64_t result; // The final result of the cosine computation + simsimd_f64_t angle_squared; // Square of the reduced angle + simsimd_f64_t original_angle = angle; // Preserve the original angle for special case handling + int multiple_of_pi; // The integer multiple of π in the input angle + + // Constants for argument reduction + simsimd_f64_t const pi_reciprocal = 0.318309886183790671537767526745028724; // 1/π + simsimd_f64_t const pi_high = 3.141592653589793116; // High-precision part of π + simsimd_f64_t const pi_low = 1.2246467991473532072e-16; // Low-precision part of π + + // Compute multiple_of_pi = 2 * round(angle * (1/π) - 0.5) + 1 + simsimd_f64_t quotient = angle * pi_reciprocal - 0.5; + int temp = (int)(quotient < 0 ? quotient - 0.5 : quotient + 0.5); + multiple_of_pi = 2 * temp + 1; + + // Reduce the angle: angle = angle - multiple_of_pi * (π / 2) + angle = angle - multiple_of_pi * (pi_high * 0.5); + angle = angle - multiple_of_pi * (pi_low * 0.5); + + // Compute the square of the reduced angle + angle_squared = angle * angle; + + // Adjust the sign of the angle if necessary + if ((multiple_of_pi & 2) == 0) { angle = -angle; } + + // Compute higher powers of the argument + simsimd_f64_t angle_power_2 = angle_squared; + simsimd_f64_t angle_power_4 = angle_power_2 * angle_power_2; // angle^4 + simsimd_f64_t angle_power_8 = angle_power_4 * angle_power_4; // angle^8 + + // Polynomial coefficients for cosine approximation (minimax polynomial) + simsimd_f64_t const coeff_0 = 0.00833333333333332974823815; + simsimd_f64_t const coeff_1 = -0.000198412698412696162806809; + simsimd_f64_t const coeff_2 = 2.75573192239198747630416e-06; + simsimd_f64_t const coeff_3 = -2.50521083763502045810755e-08; + simsimd_f64_t const coeff_4 = 1.60590430605664501629054e-10; + simsimd_f64_t const coeff_5 = -7.64712219118158833288484e-13; + simsimd_f64_t const coeff_6 = 2.81009972710863200091251e-15; + simsimd_f64_t const coeff_7 = -7.97255955009037868891952e-18; + simsimd_f64_t const coeff_8 = -0.166666666666666657414808; + + // Compute higher-degree polynomial terms + simsimd_f64_t temp1 = (angle_squared * coeff_7) + coeff_6; // temp1 = s * c7 + c6 + simsimd_f64_t temp2 = (angle_squared * coeff_5) + coeff_4; // temp2 = s * c5 + c4 + simsimd_f64_t poly_high = (angle_power_4 * temp1) + temp2; // poly_high = s^4 * temp1 + temp2 + + // Compute lower-degree polynomial terms + simsimd_f64_t temp3 = (angle_squared * coeff_3) + coeff_2; // temp3 = s * c3 + c2 + simsimd_f64_t temp4 = (angle_squared * coeff_1) + coeff_0; // temp4 = s * c1 + c0 + simsimd_f64_t poly_low = (angle_power_4 * temp3) + temp4; // poly_low = s^4 * temp3 + temp4 + + // Combine polynomial terms + result = (angle_power_8 * poly_high) + poly_low; // result = s^8 * poly_high + poly_low + result = (result * angle_squared) + coeff_8; // result = result * s + c8 + result = (angle_squared * (result * angle)) + angle; // result = s * (result * angle) + angle + + // Return the final result + return result; +} + +#if _SIMSIMD_TARGET_X86 +#if SIMSIMD_TARGET_HASWELL +#pragma GCC push_options +#pragma GCC target("avx2", "f16c", "fma") +#pragma clang attribute push(__attribute__((target("avx2,f16c,fma"))), apply_to = function) + +#pragma clang attribute pop +#pragma GCC pop_options +#endif // SIMSIMD_TARGET_HASWELL +#endif // _SIMSIMD_TARGET_X86 + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/scripts/bench.cxx b/scripts/bench.cxx index 6907f8c6..1661bd63 100644 --- a/scripts/bench.cxx +++ b/scripts/bench.cxx @@ -511,6 +511,7 @@ void measure_elementwise(bm::State &state, kernel_at kernel, kernel_at baseline, } else if constexpr (kernel_ak == simsimd_sum_k) { baseline(a.data(), c.data(), a.dimensions(), d.data()); } else if constexpr (kernel_ak == simsimd_scale_k) { baseline(a.data(), a.dimensions(), alpha, beta, d.data()); } + else { baseline(a.data(), a.dimensions(), d.data()); } }; auto call_contender = [&](vector_t const &a, vector_t const &b, vector_t const &c, vector_t &d) { if constexpr (kernel_ak == simsimd_wsum_k) { @@ -521,6 +522,7 @@ void measure_elementwise(bm::State &state, kernel_at kernel, kernel_at baseline, } else if constexpr (kernel_ak == simsimd_sum_k) { kernel(a.data(), c.data(), a.dimensions(), d.data()); } else if constexpr (kernel_ak == simsimd_scale_k) { kernel(a.data(), a.dimensions(), alpha, beta, d.data()); } + else { kernel(a.data(), a.dimensions(), d.data()); } }; // Let's average the distance results over many quads. @@ -580,6 +582,96 @@ void measure_elementwise(bm::State &state, kernel_at kernel, kernel_at baseline, state.counters["pairs"] = bm::Counter(iterations, bm::Counter::kIsRate); } +/** + * @brief Measures the performance of a geospatial operations between 4 arrays: 2 latitudes, 2 longitudes. + * @tparam pair_at The type representing the vector pair used in the measurement. + * @tparam kernel_at The type of the kernel function (default is void). + * @param state The benchmark state object provided by Google Benchmark. + * @param kernel The kernel function to benchmark. + * @param baseline The baseline function to compare against. + * @param l2_metric The L2 metric function to compute the error + * @param dimensions The number of dimensions in the vectors. + */ +template +void measure_geospatial(bm::State &state, kernel_at kernel, kernel_at baseline, l2_metric_at l2_metric, + std::size_t dimensions) { + + using pair_t = pair_at; + using vector_t = typename pair_at::vector_t; + using scalar_t = typename vector_t::scalar_t; + struct quad_t { + vector_t lat1, lon1, lat2, lon2; + }; + + using distances_t = vector_gt; + auto call_baseline = [&](quad_t const &quad, distances_t &d) { + baseline(quad.lat1.data(), quad.lon1.data(), quad.lat2.data(), quad.lon2.data(), quad.lat1.dimensions(), + d.data()); + }; + auto call_contender = [&](quad_t const &quad, distances_t &d) { + kernel(quad.lat1.data(), quad.lon1.data(), quad.lat2.data(), quad.lon2.data(), quad.lat1.dimensions(), + d.data()); + }; + + // Let's average the distance results over many quads. + constexpr std::size_t quads_count = 128; + std::vector quads(quads_count); + + std::random_device random_device; + std::mt19937 random_generator(random_device()); + /// Latitude range (-90 to 90 degrees) in radians + std::uniform_real_distribution lat_dist(-M_PI_2, M_PI_2); + /// Longitude range (-180 to 180 degrees) in radians + std::uniform_real_distribution lon_dist(-M_PI, M_PI); + for (std::size_t i = 0; i != quads.size(); ++i) { + auto &quad = quads[i]; + quad.lat1 = quad.lat2 = quad.lon1 = quad.lon2 = vector_t(dimensions); + std::generate(quad.lat1.data(), quad.lat1.data() + dimensions, [&]() { return lat_dist(random_generator); }); + std::generate(quad.lat2.data(), quad.lat2.data() + dimensions, [&]() { return lat_dist(random_generator); }); + std::generate(quad.lon1.data(), quad.lon1.data() + dimensions, [&]() { return lon_dist(random_generator); }); + std::generate(quad.lon2.data(), quad.lon2.data() + dimensions, [&]() { return lon_dist(random_generator); }); + } + + // Initialize the output buffers for distance calculations. + distances_t baseline_d(dimensions), contender_d(dimensions), zeros(dimensions); + std::vector l2_metric_from_baseline(quads.size()); + std::vector l2_baseline_result_norm(quads.size()); + std::vector l2_contender_result_norm(quads.size()); + zeros.set(0); + double mean_delta = 0, mean_relative_error = 0; + for (std::size_t i = 0; i != quads.size(); ++i) { + quad_t &quad = quads[i]; + call_baseline(quad, baseline_d); + call_contender(quad, contender_d); + l2_metric(baseline_d.data(), contender_d.data(), dimensions, &l2_metric_from_baseline[i]); + l2_metric(baseline_d.data(), zeros.data(), dimensions, &l2_baseline_result_norm[i]); + l2_metric(contender_d.data(), zeros.data(), dimensions, &l2_contender_result_norm[i]); + + mean_delta += std::abs(l2_metric_from_baseline[i]); + mean_relative_error += + std::abs(l2_metric_from_baseline[i]) / (std::max)(l2_baseline_result_norm[i], l2_contender_result_norm[i]); + } + mean_delta /= quads_count; + mean_relative_error /= quads_count; + + // The actual benchmarking loop. + std::size_t iterations = 0; + for (auto _ : state) { + quad_t &quad = quads[iterations & (quads_count - 1)]; + call_contender(quad, contender_d); + iterations++; + } + + std::size_t const bytes_per_call = // + quads[0].lat1.size_bytes() + quads[0].lat2.size_bytes() + // + quads[0].lon1.size_bytes() + quads[0].lon2.size_bytes(); + // Measure the mean absolute delta and relative error. + state.counters["abs_delta"] = mean_delta; + state.counters["relative_error"] = mean_relative_error; + state.counters["bytes"] = bm::Counter(iterations * bytes_per_call, bm::Counter::kIsRate); + state.counters["pairs"] = bm::Counter(iterations * dimensions, bm::Counter::kIsRate); +} + template void dense_(std::string name, metric_at *distance_func, metric_at *baseline_func) { using pair_t = vectors_pair_gt; @@ -590,8 +682,8 @@ void dense_(std::string name, metric_at *distance_func, metric_at *baseline_func ->Threads(default_threads); } -template +template void elementwise_(std::string name, kernel_at *kernel_func, kernel_at *baseline_func, l2_metric_at *l2_metric_func) { using pair_t = vectors_pair_gt; std::string bench_name = name + "<" + std::to_string(dense_dimensions) + "d>"; @@ -601,6 +693,16 @@ void elementwise_(std::string name, kernel_at *kernel_func, kernel_at *baseline_ ->Threads(default_threads); } +template +void geospatial_(std::string name, kernel_at *kernel_func, kernel_at *baseline_func, l2_metric_at *l2_metric_func) { + using pair_t = vectors_pair_gt; + std::string bench_name = name + "<" + std::to_string(dense_dimensions) + "d>"; + bm::RegisterBenchmark(bench_name.c_str(), measure_geospatial, kernel_func, + baseline_func, l2_metric_func, dense_dimensions) + ->MinTime(default_seconds) + ->Threads(default_threads); +} + template void sparse_(std::string name, metric_at *distance_func, metric_at *baseline_func) { @@ -647,6 +749,57 @@ void l2_with_stl(scalar_at const *a, scalar_at const *b, simsimd_size_t n, simsi *result = std::sqrt(sum); } +template +simsimd_distance_t haversine_one_with_stl(scalar_at lat1, scalar_at lon1, scalar_at lat2, scalar_at lon2) { + // Convert angle to radians: + // lat1 *= M_PI / 180, lon1 *= M_PI / 180; + // lat2 *= M_PI / 180, lon2 *= M_PI / 180; + accumulator_at dlat = lat2 - lat1; + accumulator_at dlon = lon2 - lon1; + accumulator_at a = // + std::sin(dlat / 2) * std::sin(dlat / 2) + + std::cos(lat1) * std::cos(lat2) * std::sin(dlon / 2) * std::sin(dlon / 2); + accumulator_at c = 2 * std::atan2(std::sqrt(a), std::sqrt(1 - a)); + return c; +} + +template +void haversine_with_stl( // + scalar_at const *a_lats, scalar_at const *a_lons, // + scalar_at const *b_lats, scalar_at const *b_lons, // + simsimd_size_t n, simsimd_distance_t *results) { + for (simsimd_size_t i = 0; i != n; ++i) { + scalar_at lat1 = a_lats[i], lon1 = a_lons[i]; + scalar_at lat2 = b_lats[i], lon2 = b_lons[i]; + results[i] = haversine_one_with_stl(lat1, lon1, lat2, lon2); + } +} + +template +struct sin_with_stl { + scalar_at operator()(scalar_at const &a) const { return std::sin(a); } +}; +template +struct cos_with_stl { + scalar_at operator()(scalar_at const &a) const { return std::cos(a); } +}; + +namespace av::simsimd { +struct sin { + simsimd_f32_t operator()(simsimd_f32_t const &a) const { return simsimd_f32_sin(a); } + simsimd_f64_t operator()(simsimd_f64_t const &a) const { return simsimd_f64_sin(a); } +}; +struct cos { + simsimd_f32_t operator()(simsimd_f32_t const &a) const { return simsimd_f32_cos(a); } + simsimd_f64_t operator()(simsimd_f64_t const &a) const { return simsimd_f64_cos(a); } +}; +} // namespace av::simsimd + +template +void elementwise_with_stl(scalar_at const *ins, simsimd_size_t n, scalar_at *outs) { + for (simsimd_size_t i = 0; i != n; ++i) outs[i] = kernel_at {}(ins[i]); +} + #if SIMSIMD_BUILD_BENCHMARKS_WITH_CBLAS void dot_f32_blas(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_size_t n, simsimd_distance_t *result) { @@ -768,6 +921,24 @@ int main(int argc, char **argv) { constexpr simsimd_datatype_t f16c_k = simsimd_f16c_k; constexpr simsimd_datatype_t bf16c_k = simsimd_bf16c_k; + elementwise_("f32_sin_stl", elementwise_with_stl>, + elementwise_with_stl>, l2_with_stl); + elementwise_("f32_cos_stl", elementwise_with_stl>, + elementwise_with_stl>, l2_with_stl); + elementwise_("f32_sin", elementwise_with_stl, + elementwise_with_stl>, l2_with_stl); + elementwise_("f32_cos", elementwise_with_stl, + elementwise_with_stl>, l2_with_stl); + + elementwise_("f64_sin_stl", elementwise_with_stl>, + elementwise_with_stl>, l2_with_stl); + elementwise_("f64_cos_stl", elementwise_with_stl>, + elementwise_with_stl>, l2_with_stl); + elementwise_("f64_sin", elementwise_with_stl, + elementwise_with_stl>, l2_with_stl); + elementwise_("f64_cos", elementwise_with_stl, + elementwise_with_stl>, l2_with_stl); + #if SIMSIMD_BUILD_BENCHMARKS_WITH_CBLAS dense_("dot_f32_blas", dot_f32_blas, simsimd_dot_f32_accurate); @@ -1179,6 +1350,11 @@ int main(int argc, char **argv) { elementwise_("wsum_i8_serial", simsimd_wsum_i8_serial, simsimd_wsum_i8_accurate, simsimd_l2_i8_serial); + geospatial_("haversine_f32_serial", haversine_with_stl, + haversine_with_stl, l2_with_stl); + geospatial_("haversine_f64_serial", haversine_with_stl, haversine_with_stl, + l2_with_stl); + bm::RunSpecifiedBenchmarks(); bm::Shutdown(); return 0; diff --git a/scripts/test.c b/scripts/test.c index f032bb34..6c21f830 100644 --- a/scripts/test.c +++ b/scripts/test.c @@ -283,7 +283,80 @@ void test_xd_index(void) { } } -void test_saturating_arithmeic(void) {} +/** + * @brief Goes through all possible `f32` values in a relevant range, computing + */ +void test_approximate_math(void) { + union { + simsimd_f32_t f32; + simsimd_u32_t u32; + } x; + + struct error_aggregator { + simsimd_f64_t absolute_error; + simsimd_f64_t relative_error; + simsimd_f64_t max_error; + } f32_cos_errors = {0, 0, 0}, f32_sin_errors = {0, 0, 0}, f64_cos_errors = {0, 0, 0}, f64_sin_errors = {0, 0, 0}; + + // Test all possible values of f32 within ranges: [-π, -1], [-1, -0], [0, 1], [1, π]. + simsimd_size_t count_tests = 0; + for (x.u32 = 0; x.u32 < 0xFFFFFFFF; ++x.u32) { + if (x.f32 >= -3.14159265358979323846f && x.f32 <= 3.14159265358979323846f) { + + simsimd_f64_t f64_sin_baseline = sin(x.f32); + simsimd_f64_t f64_sin_approx = simsimd_f64_sin(x.f32); + simsimd_f64_t f64_sin_diff = fabs(f64_sin_baseline - f64_sin_approx); + simsimd_f64_t f64_sin_max = fmax(fabs(f64_sin_baseline), fabs(f64_sin_approx)); + f64_sin_errors.absolute_error += f64_sin_diff; + f64_sin_errors.relative_error += f64_sin_max != 0 ? f64_sin_diff / f64_sin_max : 0; + f64_sin_errors.max_error = fmax(f64_sin_errors.max_error, f64_sin_diff); + + simsimd_f32_t f32_sin_baseline = sinf(x.f32); + simsimd_f32_t f32_sin_approx = simsimd_f32_sin(x.f32); + simsimd_f32_t f32_sin_diff = fabsf(f32_sin_baseline - f32_sin_approx); + simsimd_f32_t f32_sin_max = fmaxf(fabsf(f32_sin_baseline), fabsf(f32_sin_approx)); + f32_sin_errors.absolute_error += f32_sin_diff; + f32_sin_errors.relative_error += f32_sin_max != 0 ? f32_sin_diff / f32_sin_max : 0; + f32_sin_errors.max_error = fmax(f32_sin_errors.max_error, f32_sin_diff); + + simsimd_f64_t f64_cos_baseline = cos(x.f32); + simsimd_f64_t f64_cos_approx = simsimd_f64_cos(x.f32); + simsimd_f64_t f64_cos_diff = fabs(f64_cos_baseline - f64_cos_approx); + simsimd_f64_t f64_cos_max = fmax(fabs(f64_cos_baseline), fabs(f64_cos_approx)); + f64_cos_errors.absolute_error += f64_cos_diff; + f64_cos_errors.relative_error += f64_cos_max != 0 ? f64_cos_diff / f64_cos_max : 0; + f64_cos_errors.max_error = fmax(f64_cos_errors.max_error, f64_cos_diff); + + simsimd_f32_t f32_cos_baseline = cosf(x.f32); + simsimd_f32_t f32_cos_approx = simsimd_f32_cos(x.f32); + simsimd_f32_t f32_cos_diff = fabsf(f32_cos_baseline - f32_cos_approx); + simsimd_f32_t f32_cos_max = fmaxf(fabsf(f32_cos_baseline), fabsf(f32_cos_approx)); + f32_cos_errors.absolute_error += f32_cos_diff; + f32_cos_errors.relative_error += f32_cos_max != 0 ? f32_cos_diff / f32_cos_max : 0; + f32_cos_errors.max_error = fmax(f32_cos_errors.max_error, f32_cos_diff); + + ++count_tests; + } + } + + f32_sin_errors.absolute_error /= count_tests; + f32_sin_errors.relative_error *= 100 / count_tests; + f32_cos_errors.absolute_error /= count_tests; + f32_cos_errors.relative_error *= 100 / count_tests; + f64_sin_errors.absolute_error /= count_tests; + f64_sin_errors.relative_error *= 100 / count_tests; + f64_cos_errors.absolute_error /= count_tests; + f64_cos_errors.relative_error *= 100 / count_tests; + + printf("f32 sin errors: %f or %.6f%%, peaking at %f\n", f32_sin_errors.absolute_error, + f32_sin_errors.relative_error, f32_sin_errors.max_error); + printf("f32 cos errors: %f or %.6f%%, peaking at %f\n", f32_cos_errors.absolute_error, + f32_cos_errors.relative_error, f32_cos_errors.max_error); + printf("f64 sin errors: %f or %.6f%%, peaking at %f\n", f64_sin_errors.absolute_error, + f64_sin_errors.relative_error, f64_sin_errors.max_error); + printf("f64 cos errors: %f or %.6f%%, peaking at %f\n", f64_cos_errors.absolute_error, + f64_cos_errors.relative_error, f64_cos_errors.max_error); +} /** * @brief A trivial test that calls every implemented distance function and their dispatch versions @@ -359,6 +432,7 @@ int main(int argc, char **argv) { print_capabilities(); test_utilities(); test_saturating_arithmetic(); + test_approximate_math(); test_xd_index(); test_distance_from_itself(); printf("All tests passed.\n");