Skip to content

Commit

Permalink
Improve: Unsigned type literals for masks
Browse files Browse the repository at this point in the history
  • Loading branch information
ashvardanian committed Nov 9, 2024
1 parent 463e8f3 commit e089626
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 15 deletions.
20 changes: 10 additions & 10 deletions include/simsimd/binary.h
Original file line number Diff line number Diff line change
Expand Up @@ -245,14 +245,14 @@ SIMSIMD_PUBLIC void simsimd_hamming_b8_ice(simsimd_b8_t const *a, simsimd_b8_t c
simsimd_size_t xor_count;
// It's harder to squeeze out performance from tiny representations, so we unroll the loops for binary metrics.
if (n_words <= 64) { // Up to 512 bits.
__mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_words);
__mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFFull, n_words);
__m512i a_vec = _mm512_maskz_loadu_epi8(mask, a);
__m512i b_vec = _mm512_maskz_loadu_epi8(mask, b);
__m512i xor_count_vec = _mm512_popcnt_epi64(_mm512_xor_si512(a_vec, b_vec));
xor_count = _mm512_reduce_add_epi64(xor_count_vec);
}
else if (n_words <= 128) { // Up to 1024 bits.
__mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_words - 64);
__mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFFull, n_words - 64);
__m512i a1_vec = _mm512_loadu_epi8(a);
__m512i b1_vec = _mm512_loadu_epi8(b);
__m512i a2_vec = _mm512_maskz_loadu_epi8(mask, a + 64);
Expand All @@ -262,7 +262,7 @@ SIMSIMD_PUBLIC void simsimd_hamming_b8_ice(simsimd_b8_t const *a, simsimd_b8_t c
xor_count = _mm512_reduce_add_epi64(_mm512_add_epi64(xor2_count_vec, xor1_count_vec));
}
else if (n_words <= 196) { // Up to 1568 bits.
__mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_words - 128);
__mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFFull, n_words - 128);
__m512i a1_vec = _mm512_loadu_epi8(a);
__m512i b1_vec = _mm512_loadu_epi8(b);
__m512i a2_vec = _mm512_loadu_epi8(a + 64);
Expand All @@ -276,7 +276,7 @@ SIMSIMD_PUBLIC void simsimd_hamming_b8_ice(simsimd_b8_t const *a, simsimd_b8_t c
_mm512_reduce_add_epi64(_mm512_add_epi64(xor3_count_vec, _mm512_add_epi64(xor2_count_vec, xor1_count_vec)));
}
else if (n_words <= 256) { // Up to 2048 bits.
__mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_words - 192);
__mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFFull, n_words - 192);
__m512i a1_vec = _mm512_loadu_epi8(a);
__m512i b1_vec = _mm512_loadu_epi8(b);
__m512i a2_vec = _mm512_loadu_epi8(a + 64);
Expand All @@ -298,7 +298,7 @@ SIMSIMD_PUBLIC void simsimd_hamming_b8_ice(simsimd_b8_t const *a, simsimd_b8_t c

simsimd_hamming_b8_ice_cycle:
if (n_words < 64) {
__mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_words);
__mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFFull, n_words);
a_vec = _mm512_maskz_loadu_epi8(mask, a);
b_vec = _mm512_maskz_loadu_epi8(mask, b);
n_words = 0;
Expand Down Expand Up @@ -336,7 +336,7 @@ SIMSIMD_PUBLIC void simsimd_jaccard_b8_ice(simsimd_b8_t const *a, simsimd_b8_t c
//
// It's harder to squeeze out performance from tiny representations, so we unroll the loops for binary metrics.
if (n_words <= 64) { // Up to 512 bits.
__mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_words);
__mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFFull, n_words);
__m512i a_vec = _mm512_maskz_loadu_epi8(mask, a);
__m512i b_vec = _mm512_maskz_loadu_epi8(mask, b);
__m512i and_count_vec = _mm512_popcnt_epi64(_mm512_and_si512(a_vec, b_vec));
Expand All @@ -345,7 +345,7 @@ SIMSIMD_PUBLIC void simsimd_jaccard_b8_ice(simsimd_b8_t const *a, simsimd_b8_t c
union_ = _mm512_reduce_add_epi64(or_count_vec);
}
else if (n_words <= 128) { // Up to 1024 bits.
__mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_words - 64);
__mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFFull, n_words - 64);
__m512i a1_vec = _mm512_loadu_epi8(a);
__m512i b1_vec = _mm512_loadu_epi8(b);
__m512i a2_vec = _mm512_maskz_loadu_epi8(mask, a + 64);
Expand All @@ -358,7 +358,7 @@ SIMSIMD_PUBLIC void simsimd_jaccard_b8_ice(simsimd_b8_t const *a, simsimd_b8_t c
union_ = _mm512_reduce_add_epi64(_mm512_add_epi64(or2_count_vec, or1_count_vec));
}
else if (n_words <= 196) { // Up to 1568 bits.
__mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_words - 128);
__mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFFull, n_words - 128);
__m512i a1_vec = _mm512_loadu_epi8(a);
__m512i b1_vec = _mm512_loadu_epi8(b);
__m512i a2_vec = _mm512_loadu_epi8(a + 64);
Expand All @@ -377,7 +377,7 @@ SIMSIMD_PUBLIC void simsimd_jaccard_b8_ice(simsimd_b8_t const *a, simsimd_b8_t c
_mm512_add_epi64(or3_count_vec, _mm512_add_epi64(or2_count_vec, or1_count_vec)));
}
else if (n_words <= 256) { // Up to 2048 bits.
__mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_words - 192);
__mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFFull, n_words - 192);
__m512i a1_vec = _mm512_loadu_epi8(a);
__m512i b1_vec = _mm512_loadu_epi8(b);
__m512i a2_vec = _mm512_loadu_epi8(a + 64);
Expand Down Expand Up @@ -405,7 +405,7 @@ SIMSIMD_PUBLIC void simsimd_jaccard_b8_ice(simsimd_b8_t const *a, simsimd_b8_t c

simsimd_jaccard_b8_ice_cycle:
if (n_words < 64) {
__mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_words);
__mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFFull, n_words);
a_vec = _mm512_maskz_loadu_epi8(mask, a);
b_vec = _mm512_maskz_loadu_epi8(mask, b);
n_words = 0;
Expand Down
2 changes: 1 addition & 1 deletion include/simsimd/dot.h
Original file line number Diff line number Diff line change
Expand Up @@ -1792,7 +1792,7 @@ SIMSIMD_PUBLIC void simsimd_dot_u8_ice(simsimd_u8_t const *a, simsimd_u8_t const

simsimd_dot_u8_ice_cycle:
if (n < 64) {
__mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n);
__mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFFull, n);
a_u8_vec = _mm512_maskz_loadu_epi8(mask, a);
b_u8_vec = _mm512_maskz_loadu_epi8(mask, b);
n = 0;
Expand Down
8 changes: 4 additions & 4 deletions include/simsimd/spatial.h
Original file line number Diff line number Diff line change
Expand Up @@ -1975,7 +1975,7 @@ SIMSIMD_PUBLIC void simsimd_l2sq_u8_ice(simsimd_u8_t const *a, simsimd_u8_t cons

simsimd_l2sq_u8_ice_cycle:
if (n < 64) {
__mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n);
__mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFFull, n);
a_u8_vec = _mm512_maskz_loadu_epi8(mask, a);
b_u8_vec = _mm512_maskz_loadu_epi8(mask, b);
n = 0;
Expand Down Expand Up @@ -2014,7 +2014,7 @@ SIMSIMD_PUBLIC void simsimd_cos_u8_ice(simsimd_u8_t const *a, simsimd_u8_t const

simsimd_cos_u8_ice_cycle:
if (n < 64) {
__mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n);
__mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFFull, n);
a_u8_vec = _mm512_maskz_loadu_epi8(mask, a);
b_u8_vec = _mm512_maskz_loadu_epi8(mask, b);
n = 0;
Expand Down Expand Up @@ -2085,7 +2085,7 @@ SIMSIMD_PUBLIC void simsimd_l2sq_i4x2_ice(simsimd_i4x2_t const *a, simsimd_i4x2_

simsimd_l2sq_i4x2_ice_cycle:
if (n_words < 64) {
__mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_words);
__mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFFull, n_words);
a_i4x2_vec = _mm512_maskz_loadu_epi8(mask, a);
b_i4x2_vec = _mm512_maskz_loadu_epi8(mask, b);
n_words = 0;
Expand Down Expand Up @@ -2199,7 +2199,7 @@ SIMSIMD_PUBLIC void simsimd_cos_i4x2_ice(simsimd_i4x2_t const *a, simsimd_i4x2_t

simsimd_cos_i4x2_ice_cycle:
if (n_words < 64) {
__mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_words);
__mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFFull, n_words);
a_i4x2_vec = _mm512_maskz_loadu_epi8(mask, a);
b_i4x2_vec = _mm512_maskz_loadu_epi8(mask, b);
n_words = 0;
Expand Down

0 comments on commit e089626

Please sign in to comment.