Skip to content

Commit

Permalink
Fix: Correct and branchless masking in AVX-512
Browse files: browse the repository at this point in the history
  • Loading branch information
ashvardanian committed Oct 3, 2023
1 parent 2e43ab7 commit 0a32408
Showing 1 changed file with 3 additions and 6 deletions.
9 changes: 3 additions & 6 deletions include/simsimd/x86_avx512_f16.h
Original file line number | Diff line number | Diff line change
Expand Up @@ -21,8 +21,7 @@ static simsimd_f32_t simsimd_avx512_f16_l2sq(simsimd_f16_t const* a, simsimd_f16
simsimd_size_t i = 0;

do {
simsimd_size_t elements_to_process = d - i > 32 ? 32 : d - i;
__mmask32 mask = (1 << elements_to_process) - 1;
__mmask32 mask = (1u << (d - i)) - 1u;
__m512i a_vec = _mm512_maskz_loadu_epi16(mask, a + i);
__m512i b_vec = _mm512_maskz_loadu_epi16(mask, b + i);
__m512h sub_vec = _mm512_sub_ph(_mm512_castsi512_ph(a_vec), _mm512_castsi512_ph(b_vec));
Expand All @@ -39,8 +38,7 @@ static simsimd_f32_t simsimd_avx512_f16_ip(simsimd_f16_t const* a, simsimd_f16_t
simsimd_size_t i = 0;

do {
simsimd_size_t elements_to_process = d - i > 32 ? 32 : d - i;
__mmask32 mask = (1 << elements_to_process) - 1;
__mmask32 mask = (1u << (d - i)) - 1u;
__m512i a_vec = _mm512_maskz_loadu_epi16(mask, a + i);
__m512i b_vec = _mm512_maskz_loadu_epi16(mask, b + i);
ab_vec = _mm512_fmadd_ph(_mm512_castsi512_ph(a_vec), _mm512_castsi512_ph(b_vec), ab_vec);
Expand All @@ -58,8 +56,7 @@ static simsimd_f32_t simsimd_avx512_f16_cos(simsimd_f16_t const* a, simsimd_f16_
simsimd_size_t i = 0;

do {
simsimd_size_t elements_to_process = d - i > 32 ? 32 : d - i;
__mmask32 mask = (1 << elements_to_process) - 1;
__mmask32 mask = (1u << (d - i)) - 1u;
__m512i a_vec = _mm512_maskz_loadu_epi16(mask, a + i);
__m512i b_vec = _mm512_maskz_loadu_epi16(mask, b + i);
ab_vec = _mm512_fmadd_ph(_mm512_castsi512_ph(a_vec), _mm512_castsi512_ph(b_vec), ab_vec);
Expand Down

0 comments on commit 0a32408

Please sign in to comment.