Faster Loads & Broader Tests #210

Merged · 9 commits · Oct 15, 2024 · Changes from all commits
3 changes: 2 additions & 1 deletion MANIFEST.in
@@ -1,2 +1,3 @@
-include include/simsimd/*.h
+include include/*
 include include/*
+VERSION
8 changes: 6 additions & 2 deletions README.md
@@ -93,7 +93,7 @@ You can learn more about the technical implementation details in the following b
For reference, we use 1536-dimensional vectors, like the embeddings produced by the OpenAI Ada API.
Comparing the serial code throughput produced by GCC 12 to hand-optimized kernels in SimSIMD, we see the following single-core improvements:

-| Type | Apple M2 Pro | Intel Sapphire Rapids | AWS Graviton 4 |
+| Type | Apple M2 Pro | AMD Genoa | AWS Graviton 4 |
| :----- | ---------------------------------: | ---------------------------------: | ---------------------------------: |
| `f64` | 18.5 → 28.8 GB/s <br/> + 56 % | 21.9 → 41.4 GB/s <br/> + 89 % | 20.7 → 41.3 GB/s <br/> + 99 % |
| `f32` | 9.2 → 29.6 GB/s <br/> + 221 % | 10.9 → 95.8 GB/s <br/> + 779 % | 4.9 → 41.9 GB/s <br/> + 755 % |
@@ -748,12 +748,16 @@ In general there are a few principles that SimSIMD follows:
- Avoid loop unrolling.
- Never allocate memory.
- Never throw exceptions or set `errno`.
-- Detect overflows and report the distance with a "signaling" `NaN`.
- Keep all function arguments the size of the pointer.
- Avoid returning from public interfaces, use out-arguments instead.
- Don't over-optimize for old CPUs and single- and double-precision floating-point numbers.
- Prioritize mixed-precision and integer operations, and new ISA extensions.

+Possibly, in the future:
+
+- Best effort computation silencing `NaN` components in low-precision inputs.
+- Detect overflows and report the distance with a "signaling" `NaN`.
+
Last, but not the least - don't build unless there is a demand for it.
So if you have a specific use-case, please open an issue or a pull request, and ideally, bring in more users with similar needs.
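As a usage sketch of the out-argument convention above: SimSIMD kernels write the distance through a pointer instead of returning it. Below is a minimal, hedged example calling the `simsimd_hamming_b8_ice` kernel from this PR directly, assuming an Ice Lake-class CPU and the `include/` directory on the include path; real applications would go through the library's dispatch layer rather than an `_ice`-suffixed kernel:

```c
#include <simsimd/binary.h>

int main(void) {
    simsimd_b8_t a[32] = {0xFF}, b[32] = {0x0F}; // two 256-bit sets, packed 8 bits per byte
    simsimd_distance_t distance;                 // out-argument; the kernel returns nothing
    simsimd_hamming_b8_ice(a, b, 32, &distance); // never throws, never allocates, never sets errno
    // `distance` now holds the number of differing bits between `a` and `b`.
    return 0;
}
```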

200 changes: 164 additions & 36 deletions include/simsimd/binary.h
@@ -165,54 +165,182 @@ SIMSIMD_PUBLIC void simsimd_jaccard_b8_sve(simsimd_b8_t const* a, simsimd_b8_t c

SIMSIMD_PUBLIC void simsimd_hamming_b8_ice(simsimd_b8_t const* a, simsimd_b8_t const* b, simsimd_size_t n_words,
                                           simsimd_distance_t* result) {
-    __m512i differences_vec = _mm512_setzero_si512();
-    __m512i a_vec, b_vec;
-
-simsimd_hamming_b8_ice_cycle:
-    if (n_words < 64) {
+    simsimd_size_t xor_count;
+    // It's harder to squeeze out performance from tiny representations, so we unroll the loops for binary metrics.
+    if (n_words <= 64) { // Up to 512 bits.
        __mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_words);
-        a_vec = _mm512_maskz_loadu_epi8(mask, a);
-        b_vec = _mm512_maskz_loadu_epi8(mask, b);
-        n_words = 0;
+        __m512i a_vec = _mm512_maskz_loadu_epi8(mask, a);
+        __m512i b_vec = _mm512_maskz_loadu_epi8(mask, b);
+        __m512i xor_count_vec = _mm512_popcnt_epi64(_mm512_xor_si512(a_vec, b_vec));
+        xor_count = _mm512_reduce_add_epi64(xor_count_vec);
+    } else if (n_words <= 128) { // Up to 1024 bits.
+        __mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_words - 64);
+        __m512i a1_vec = _mm512_loadu_epi8(a);
+        __m512i b1_vec = _mm512_loadu_epi8(b);
+        __m512i a2_vec = _mm512_maskz_loadu_epi8(mask, a + 64);
+        __m512i b2_vec = _mm512_maskz_loadu_epi8(mask, b + 64);
+        __m512i xor1_count_vec = _mm512_popcnt_epi64(_mm512_xor_si512(a1_vec, b1_vec));
+        __m512i xor2_count_vec = _mm512_popcnt_epi64(_mm512_xor_si512(a2_vec, b2_vec));
+        xor_count = _mm512_reduce_add_epi64(_mm512_add_epi64(xor2_count_vec, xor1_count_vec));
+    } else if (n_words <= 196) { // Up to 1568 bits.
+        __mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_words - 128);
+        __m512i a1_vec = _mm512_loadu_epi8(a);
+        __m512i b1_vec = _mm512_loadu_epi8(b);
+        __m512i a2_vec = _mm512_loadu_epi8(a + 64);
+        __m512i b2_vec = _mm512_loadu_epi8(b + 64);
+        __m512i a3_vec = _mm512_maskz_loadu_epi8(mask, a + 128);
+        __m512i b3_vec = _mm512_maskz_loadu_epi8(mask, b + 128);
+        __m512i xor1_count_vec = _mm512_popcnt_epi64(_mm512_xor_si512(a1_vec, b1_vec));
+        __m512i xor2_count_vec = _mm512_popcnt_epi64(_mm512_xor_si512(a2_vec, b2_vec));
+        __m512i xor3_count_vec = _mm512_popcnt_epi64(_mm512_xor_si512(a3_vec, b3_vec));
+        xor_count =
+            _mm512_reduce_add_epi64(_mm512_add_epi64(xor3_count_vec, _mm512_add_epi64(xor2_count_vec, xor1_count_vec)));
+    } else if (n_words <= 256) { // Up to 2048 bits.
+        __mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_words - 192);
+        __m512i a1_vec = _mm512_loadu_epi8(a);
+        __m512i b1_vec = _mm512_loadu_epi8(b);
+        __m512i a2_vec = _mm512_loadu_epi8(a + 64);
+        __m512i b2_vec = _mm512_loadu_epi8(b + 64);
+        __m512i a3_vec = _mm512_loadu_epi8(a + 128);
+        __m512i b3_vec = _mm512_loadu_epi8(b + 128);
+        __m512i a4_vec = _mm512_maskz_loadu_epi8(mask, a + 192);
+        __m512i b4_vec = _mm512_maskz_loadu_epi8(mask, b + 192);
+        __m512i xor1_count_vec = _mm512_popcnt_epi64(_mm512_xor_si512(a1_vec, b1_vec));
+        __m512i xor2_count_vec = _mm512_popcnt_epi64(_mm512_xor_si512(a2_vec, b2_vec));
+        __m512i xor3_count_vec = _mm512_popcnt_epi64(_mm512_xor_si512(a3_vec, b3_vec));
+        __m512i xor4_count_vec = _mm512_popcnt_epi64(_mm512_xor_si512(a4_vec, b4_vec));
+        xor_count = _mm512_reduce_add_epi64(_mm512_add_epi64(_mm512_add_epi64(xor4_count_vec, xor3_count_vec),
+                                                             _mm512_add_epi64(xor2_count_vec, xor1_count_vec)));
    } else {
-        a_vec = _mm512_loadu_epi8(a);
-        b_vec = _mm512_loadu_epi8(b);
-        a += 64, b += 64, n_words -= 64;
+        __m512i xor_count_vec = _mm512_setzero_si512();
+        __m512i a_vec, b_vec;
+
+    simsimd_hamming_b8_ice_cycle:
+        if (n_words < 64) {
+            __mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_words);
+            a_vec = _mm512_maskz_loadu_epi8(mask, a);
+            b_vec = _mm512_maskz_loadu_epi8(mask, b);
+            n_words = 0;
+        } else {
+            a_vec = _mm512_loadu_epi8(a);
+            b_vec = _mm512_loadu_epi8(b);
+            a += 64, b += 64, n_words -= 64;
+        }
+        __m512i xor_vec = _mm512_xor_si512(a_vec, b_vec);
+        xor_count_vec = _mm512_add_epi64(xor_count_vec, _mm512_popcnt_epi64(xor_vec));
+        if (n_words)
+            goto simsimd_hamming_b8_ice_cycle;
+
+        xor_count = _mm512_reduce_add_epi64(xor_count_vec);
    }
-    __m512i xor_vec = _mm512_xor_si512(a_vec, b_vec);
-    differences_vec = _mm512_add_epi64(differences_vec, _mm512_popcnt_epi64(xor_vec));
-    if (n_words)
-        goto simsimd_hamming_b8_ice_cycle;
-
-    simsimd_size_t differences = _mm512_reduce_add_epi64(differences_vec);
-    *result = differences;
+    *result = xor_count;
}
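For cross-checking the kernel above, a portable scalar reference may help: the Hamming distance between two bit-sets packed into bytes is exactly the population count of their bytewise XOR, the same identity the AVX-512 code vectorizes. A minimal sketch, assuming the usual `simsimd_b8_t` packed-bits layout:

```c
#include <stddef.h>
#include <stdint.h>

// Scalar reference: count differing bits across `n_words` packed bytes.
static size_t hamming_b8_serial(uint8_t const *a, uint8_t const *b, size_t n_words) {
    size_t differences = 0;
    for (size_t i = 0; i != n_words; ++i) {
        uint8_t xor_ = (uint8_t)(a[i] ^ b[i]);
        for (; xor_; xor_ &= (uint8_t)(xor_ - 1)) ++differences; // Kernighan's trick: clear the lowest set bit
    }
    return differences;
}
```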

SIMSIMD_PUBLIC void simsimd_jaccard_b8_ice(simsimd_b8_t const* a, simsimd_b8_t const* b, simsimd_size_t n_words,
                                           simsimd_distance_t* result) {
-    __m512i intersection_vec = _mm512_setzero_si512(), union_vec = _mm512_setzero_si512();
-    __m512i a_vec, b_vec;
-
-simsimd_jaccard_b8_ice_cycle:
-    if (n_words < 64) {
+    simsimd_size_t intersection = 0, union_ = 0;
+    // It's harder to squeeze out performance from tiny representations, so we unroll the loops for binary metrics.
+    if (n_words <= 64) { // Up to 512 bits.
        __mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_words);
-        a_vec = _mm512_maskz_loadu_epi8(mask, a);
-        b_vec = _mm512_maskz_loadu_epi8(mask, b);
-        n_words = 0;
+        __m512i a_vec = _mm512_maskz_loadu_epi8(mask, a);
+        __m512i b_vec = _mm512_maskz_loadu_epi8(mask, b);
+        __m512i and_count_vec = _mm512_popcnt_epi64(_mm512_and_si512(a_vec, b_vec));
+        __m512i or_count_vec = _mm512_popcnt_epi64(_mm512_or_si512(a_vec, b_vec));
+        intersection = _mm512_reduce_add_epi64(and_count_vec);
+        union_ = _mm512_reduce_add_epi64(or_count_vec);
+    } else if (n_words <= 128) { // Up to 1024 bits.
+        __mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_words - 64);
+        __m512i a1_vec = _mm512_loadu_epi8(a);
+        __m512i b1_vec = _mm512_loadu_epi8(b);
+        __m512i a2_vec = _mm512_maskz_loadu_epi8(mask, a + 64);
+        __m512i b2_vec = _mm512_maskz_loadu_epi8(mask, b + 64);
+        __m512i and1_count_vec = _mm512_popcnt_epi64(_mm512_and_si512(a1_vec, b1_vec));
+        __m512i or1_count_vec = _mm512_popcnt_epi64(_mm512_or_si512(a1_vec, b1_vec));
+        __m512i and2_count_vec = _mm512_popcnt_epi64(_mm512_and_si512(a2_vec, b2_vec));
+        __m512i or2_count_vec = _mm512_popcnt_epi64(_mm512_or_si512(a2_vec, b2_vec));
+        intersection = _mm512_reduce_add_epi64(_mm512_add_epi64(and2_count_vec, and1_count_vec));
+        union_ = _mm512_reduce_add_epi64(_mm512_add_epi64(or2_count_vec, or1_count_vec));
+    } else if (n_words <= 196) { // Up to 1568 bits.
+        // TODO: On such vectors we can clearly see that the CPU struggles to perform this many parallel
+        // population counts, because the throughput of Jaccard and Hamming in this case starts to differ.
+        // One optimization, aside from Harley-Seal transforms can be using "shuffles" for nibble-popcount
+        // lookups, to utilize other ports on the CPU.
+        // https://github.com/ashvardanian/SimSIMD/pull/138
+        //
+        // On Ice Lake:
+        // - `VPOPCNTQ (ZMM, K, ZMM)` can only execute on port 5, which is a bottleneck.
+        // - `VPSHUFB (ZMM, ZMM, ZMM)` can only run on the same port 5 as well!
+        // On Zen4:
+        // - `VPOPCNTQ (ZMM, K, ZMM)` can run on ports: 0, 1.
+        // - `VPSHUFB (ZMM, ZMM, ZMM)` can run on ports: 1, 2.
+        // https://uops.info/table.html?search=VPOPCNTQ%20(ZMM%2C%20K%2C%20ZMM)&cb_lat=on&cb_tp=on&cb_uops=on&cb_ports=on&cb_SKX=on&cb_ICL=on&cb_TGL=on&cb_measurements=on&cb_doc=on&cb_avx512=on
+        // https://uops.info/table.html?search=VPSHUFB%20(ZMM%2C%20ZMM%2C%20ZMM)&cb_lat=on&cb_tp=on&cb_uops=on&cb_ports=on&cb_SKX=on&cb_ICL=on&cb_TGL=on&cb_measurements=on&cb_doc=on&cb_avx512=on
+        __mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_words - 128);
+        __m512i a1_vec = _mm512_loadu_epi8(a);
+        __m512i b1_vec = _mm512_loadu_epi8(b);
+        __m512i a2_vec = _mm512_loadu_epi8(a + 64);
+        __m512i b2_vec = _mm512_loadu_epi8(b + 64);
+        __m512i a3_vec = _mm512_maskz_loadu_epi8(mask, a + 128);
+        __m512i b3_vec = _mm512_maskz_loadu_epi8(mask, b + 128);
+        __m512i and1_count_vec = _mm512_popcnt_epi64(_mm512_and_si512(a1_vec, b1_vec));
+        __m512i or1_count_vec = _mm512_popcnt_epi64(_mm512_or_si512(a1_vec, b1_vec));
+        __m512i and2_count_vec = _mm512_popcnt_epi64(_mm512_and_si512(a2_vec, b2_vec));
+        __m512i or2_count_vec = _mm512_popcnt_epi64(_mm512_or_si512(a2_vec, b2_vec));
+        __m512i and3_count_vec = _mm512_popcnt_epi64(_mm512_and_si512(a3_vec, b3_vec));
+        __m512i or3_count_vec = _mm512_popcnt_epi64(_mm512_or_si512(a3_vec, b3_vec));
+        intersection = _mm512_reduce_add_epi64( //
+            _mm512_add_epi64(and3_count_vec, _mm512_add_epi64(and2_count_vec, and1_count_vec)));
+        union_ = _mm512_reduce_add_epi64( //
+            _mm512_add_epi64(or3_count_vec, _mm512_add_epi64(or2_count_vec, or1_count_vec)));
+    } else if (n_words <= 256) { // Up to 2048 bits.
+        __mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_words - 192);
+        __m512i a1_vec = _mm512_loadu_epi8(a);
+        __m512i b1_vec = _mm512_loadu_epi8(b);
+        __m512i a2_vec = _mm512_loadu_epi8(a + 64);
+        __m512i b2_vec = _mm512_loadu_epi8(b + 64);
+        __m512i a3_vec = _mm512_loadu_epi8(a + 128);
+        __m512i b3_vec = _mm512_loadu_epi8(b + 128);
+        __m512i a4_vec = _mm512_maskz_loadu_epi8(mask, a + 192);
+        __m512i b4_vec = _mm512_maskz_loadu_epi8(mask, b + 192);
+        __m512i and1_count_vec = _mm512_popcnt_epi64(_mm512_and_si512(a1_vec, b1_vec));
+        __m512i or1_count_vec = _mm512_popcnt_epi64(_mm512_or_si512(a1_vec, b1_vec));
+        __m512i and2_count_vec = _mm512_popcnt_epi64(_mm512_and_si512(a2_vec, b2_vec));
+        __m512i or2_count_vec = _mm512_popcnt_epi64(_mm512_or_si512(a2_vec, b2_vec));
+        __m512i and3_count_vec = _mm512_popcnt_epi64(_mm512_and_si512(a3_vec, b3_vec));
+        __m512i or3_count_vec = _mm512_popcnt_epi64(_mm512_or_si512(a3_vec, b3_vec));
+        __m512i and4_count_vec = _mm512_popcnt_epi64(_mm512_and_si512(a4_vec, b4_vec));
+        __m512i or4_count_vec = _mm512_popcnt_epi64(_mm512_or_si512(a4_vec, b4_vec));
+        intersection = _mm512_reduce_add_epi64(_mm512_add_epi64(_mm512_add_epi64(and4_count_vec, and3_count_vec),
+                                                                _mm512_add_epi64(and2_count_vec, and1_count_vec)));
+        union_ = _mm512_reduce_add_epi64(_mm512_add_epi64(_mm512_add_epi64(or4_count_vec, or3_count_vec),
+                                                          _mm512_add_epi64(or2_count_vec, or1_count_vec)));
    } else {
-        a_vec = _mm512_loadu_epi8(a);
-        b_vec = _mm512_loadu_epi8(b);
-        a += 64, b += 64, n_words -= 64;
+        __m512i and_count_vec = _mm512_setzero_si512(), or_count_vec = _mm512_setzero_si512();
+        __m512i a_vec, b_vec;
+
+    simsimd_jaccard_b8_ice_cycle:
+        if (n_words < 64) {
+            __mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_words);
+            a_vec = _mm512_maskz_loadu_epi8(mask, a);
+            b_vec = _mm512_maskz_loadu_epi8(mask, b);
+            n_words = 0;
+        } else {
+            a_vec = _mm512_loadu_epi8(a);
+            b_vec = _mm512_loadu_epi8(b);
+            a += 64, b += 64, n_words -= 64;
+        }
+        __m512i and_vec = _mm512_and_si512(a_vec, b_vec);
+        __m512i or_vec = _mm512_or_si512(a_vec, b_vec);
+        and_count_vec = _mm512_add_epi64(and_count_vec, _mm512_popcnt_epi64(and_vec));
+        or_count_vec = _mm512_add_epi64(or_count_vec, _mm512_popcnt_epi64(or_vec));
+        if (n_words)
+            goto simsimd_jaccard_b8_ice_cycle;
+
+        intersection = _mm512_reduce_add_epi64(and_count_vec);
+        union_ = _mm512_reduce_add_epi64(or_count_vec);
    }
-    __m512i and_vec = _mm512_and_si512(a_vec, b_vec);
-    __m512i or_vec = _mm512_or_si512(a_vec, b_vec);
-    intersection_vec = _mm512_add_epi64(intersection_vec, _mm512_popcnt_epi64(and_vec));
-    union_vec = _mm512_add_epi64(union_vec, _mm512_popcnt_epi64(or_vec));
-    if (n_words)
-        goto simsimd_jaccard_b8_ice_cycle;
-
-    simsimd_size_t intersection = _mm512_reduce_add_epi64(intersection_vec),
-                   union_ = _mm512_reduce_add_epi64(union_vec);
    *result = (union_ != 0) ? 1 - (simsimd_f64_t)intersection / (simsimd_f64_t)union_ : 1;
}
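And the matching scalar reference for Jaccard, preserving the kernel's convention that two empty sets are at distance 1 (a minimal sketch, not library code):

```c
#include <stddef.h>
#include <stdint.h>

// Scalar reference: Jaccard distance is 1 - |intersection| / |union| over set bits.
static double jaccard_b8_serial(uint8_t const *a, uint8_t const *b, size_t n_words) {
    size_t intersection = 0, union_ = 0;
    for (size_t i = 0; i != n_words; ++i) {
        uint8_t and_ = (uint8_t)(a[i] & b[i]);
        uint8_t or_ = (uint8_t)(a[i] | b[i]);
        for (; and_; and_ &= (uint8_t)(and_ - 1)) ++intersection;
        for (; or_; or_ &= (uint8_t)(or_ - 1)) ++union_;
    }
    return union_ != 0 ? 1.0 - (double)intersection / (double)union_ : 1.0;
}
```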

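The TODO inside `simsimd_jaccard_b8_ice` mentions nibble-popcount lookups through `VPSHUFB` as a way to offload work from the `VPOPCNTQ` port. For reference, here is a hedged sketch of that classic technique (Muła-style per-byte popcounts from a 16-entry lookup table, widened with `VPSADBW`); this is only the idea the comment points to, not code this PR ships:

```c
#include <immintrin.h>

// Per-64-bit-lane popcount using AVX-512BW shuffles instead of AVX-512VPOPCNTDQ.
static inline __m512i popcount_epi64_via_shuffle(__m512i x) {
    // Popcounts of the nibbles 0x0..0xF, replicated into every 128-bit lane.
    __m512i lut = _mm512_set_epi8(                      //
        4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0, //
        4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0, //
        4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0, //
        4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0);
    __m512i low_nibbles = _mm512_set1_epi8(0x0F);
    __m512i lo = _mm512_and_si512(x, low_nibbles);
    __m512i hi = _mm512_and_si512(_mm512_srli_epi16(x, 4), low_nibbles);
    // Look up per-byte popcounts for both nibbles and add them: each byte holds 0..8.
    __m512i counts = _mm512_add_epi8(_mm512_shuffle_epi8(lut, lo), _mm512_shuffle_epi8(lut, hi));
    // Sum each group of 8 byte-counts into its 64-bit lane.
    return _mm512_sad_epu8(counts, _mm512_setzero_si512());
}
```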
20 changes: 10 additions & 10 deletions include/simsimd/curved.h
@@ -420,8 +420,8 @@ SIMSIMD_PUBLIC void simsimd_bilinear_f16_haswell(simsimd_f16_t const* a, simsimd
        __m256 a_vec = _mm256_cvtph_ps(_mm_set1_epi16(*(short const*)(a + i)));
        __m256 partial_sum_vec = _mm256_setzero_ps();
        for (simsimd_size_t j = 0; j + 8 <= n; j += 8) {
-            __m256 b_vec = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)(b + j)));
-            __m256 c_vec = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)(c + i * n + j)));
+            __m256 b_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const*)(b + j)));
+            __m256 c_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const*)(c + i * n + j)));
            partial_sum_vec = _mm256_fmadd_ps(b_vec, c_vec, partial_sum_vec);
        }
        sum_vec = _mm256_fmadd_ps(a_vec, partial_sum_vec, sum_vec);
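A hedged aside on the swap this hunk makes, which appears to be the "faster loads" of the PR title: both intrinsics perform unaligned 128-bit loads, and `_mm_lddqu_si128` is the SSE3 variant that was historically friendlier to loads straddling cache lines. On current microarchitectures the two typically perform identically, so this reads as a low-risk substitution:

```c
#include <immintrin.h>

// Both load 16 unaligned bytes; only the instruction chosen differs.
static inline __m128i load_u128_movdqu(void const *ptr) { return _mm_loadu_si128((__m128i const *)ptr); }
static inline __m128i load_u128_lddqu(void const *ptr) { return _mm_lddqu_si128((__m128i const *)ptr); }
```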
@@ -455,9 +455,9 @@ SIMSIMD_PUBLIC void simsimd_mahalanobis_f16_haswell(simsimd_f16_t const* a, sims
        __m256 partial_sum_vec = _mm256_setzero_ps();
        for (simsimd_size_t j = 0; j + 8 <= n; j += 8) {
            __m256 diff_j_vec = _mm256_sub_ps( //
-                _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)(a + j))),
-                _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)(b + j))));
-            __m256 c_vec = _mm256_cvtph_ps(_mm_loadu_si128((__m128i const*)(c + i * n + j)));
+                _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const*)(a + j))),
+                _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const*)(b + j))));
+            __m256 c_vec = _mm256_cvtph_ps(_mm_lddqu_si128((__m128i const*)(c + i * n + j)));
            partial_sum_vec = _mm256_fmadd_ps(diff_j_vec, c_vec, partial_sum_vec);
        }
        sum_vec = _mm256_fmadd_ps(diff_i_vec, partial_sum_vec, sum_vec);
@@ -493,8 +493,8 @@ SIMSIMD_PUBLIC void simsimd_bilinear_bf16_haswell(simsimd_bf16_t const* a, simsi
        __m256 a_vec = _mm256_set1_ps(simsimd_bf16_to_f32(a + i));
        __m256 partial_sum_vec = _mm256_setzero_ps();
        for (simsimd_size_t j = 0; j + 8 <= n; j += 8) {
-            __m256 b_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)(b + j)));
-            __m256 c_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)(c + i * n + j)));
+            __m256 b_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_lddqu_si128((__m128i const*)(b + j)));
+            __m256 c_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_lddqu_si128((__m128i const*)(c + i * n + j)));
            partial_sum_vec = _mm256_fmadd_ps(b_vec, c_vec, partial_sum_vec);
        }
        sum_vec = _mm256_fmadd_ps(a_vec, partial_sum_vec, sum_vec);
@@ -530,9 +530,9 @@ SIMSIMD_PUBLIC void simsimd_mahalanobis_bf16_haswell(simsimd_bf16_t const* a, si
        __m256 partial_sum_vec = _mm256_setzero_ps();
        for (simsimd_size_t j = 0; j + 8 <= n; j += 8) {
            __m256 diff_j_vec = _mm256_sub_ps( //
-                _simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)(a + j))), //
-                _simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)(b + j))));
-            __m256 c_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_loadu_si128((__m128i const*)(c + i * n + j)));
+                _simsimd_bf16x8_to_f32x8_haswell(_mm_lddqu_si128((__m128i const*)(a + j))), //
+                _simsimd_bf16x8_to_f32x8_haswell(_mm_lddqu_si128((__m128i const*)(b + j))));
+            __m256 c_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_lddqu_si128((__m128i const*)(c + i * n + j)));
            partial_sum_vec = _mm256_fmadd_ps(diff_j_vec, c_vec, partial_sum_vec);
        }
        sum_vec = _mm256_fmadd_ps(diff_i_vec, partial_sum_vec, sum_vec);
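To make the structure of these curved-space kernels concrete, here is a minimal scalar sketch of the bilinear form they vectorize: result = sum over i of a[i] * (sum over j of C[i*n + j] * b[j]), with C stored row-major. Plain `float` stands in for the `f16`/`bf16` inputs, which the kernels upcast before accumulating:

```c
#include <stddef.h>

// Scalar reference for the bilinear form a^T * C * b, with C row-major.
static float bilinear_serial(float const *a, float const *b, float const *c, size_t n) {
    float sum = 0.0f;
    for (size_t i = 0; i != n; ++i) {
        float partial_sum = 0.0f; // mirrors `partial_sum_vec` in the kernels above
        for (size_t j = 0; j != n; ++j) partial_sum += b[j] * c[i * n + j];
        sum += a[i] * partial_sum;
    }
    return sum;
}
```

The Mahalanobis kernels above follow the same shape, substituting the element-wise difference `a[i] - b[i]` for both vectors.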