Skip to content

Commit

Permalink
Fix _mm_test_mix_ones_zeros and _mm_testnzc_si128 (#621)
Browse files Browse the repository at this point in the history
One error is the use of a logical NOT (`!a[0]`) was instead of a bitwise
NOT (`~_a[0]`).
Bug: `_mm_test_mix_ones_zeros` always returned true.
The function wasn't reducing `zf` and `cf` to a bool before combining them.

The fix proposed here isn't the most efficient, but at least it is correct. 

NOTE:                                                                                                                                                         
* The arguments are named incorrectly in the `_mm_test_mix_ones_zeros`
  documentation[0].
* The second argument is the mask, as per the behavior of `_mm_test_mix_ones_zeros`
  with gcc and clang.
* This naming error seems to have propagated through both gcc[1] and
  llvm[2] headers but not to rust[3] headers.

[0] https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=ptest&techs=SSE_ALL&ig_expand=6902
[1] https://github.com/gcc-mirror/gcc/blob/27ce74fa23c93c1189c301993cd19ea766e6bdb5/gcc/config/i386/smmintrin.h#L94
[2] https://github.com/llvm/llvm-project/blob/70535f5e609f747c28cfef699eefb84581b0aac0/clang/lib/Headers/smmintrin.h#L1130
[3] https://github.com/rust-lang/stdarch/blob/f4528dd6e85d97bb802240d7cd048b6e1bf72540/crates/core_arch/src/x86/sse41.rs#L1149
  • Loading branch information
aqrit authored Dec 10, 2023
1 parent 243e90f commit e5c5eff
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 12 deletions.
20 changes: 14 additions & 6 deletions sse2neon.h
Original file line number Diff line number Diff line change
Expand Up @@ -7592,14 +7592,22 @@ FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
// zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
// otherwise return 0.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_test_mix_ones_zero
// Note: Argument names may be wrong in the Intel intrinsics guide.
FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask)
{
uint64x2_t zf =
vandq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a));
uint64x2_t cf =
vbicq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a));
uint64x2_t result = vandq_u64(zf, cf);
return !(vgetq_lane_u64(result, 0) | vgetq_lane_u64(result, 1));
uint64x2_t v = vreinterpretq_u64_m128i(a);
uint64x2_t m = vreinterpretq_u64_m128i(mask);

// find ones (set-bits) and zeros (clear-bits) under clip mask
uint64x2_t ones = vandq_u64(m, v);
uint64x2_t zeros = vbicq_u64(m, v);

// If both 128-bit variables are populated (non-zero) then return 1.
// For comparision purposes, first compact each var down to 32-bits.
uint32x2_t reduced = vpmax_u32(vqmovn_u64(ones), vqmovn_u64(zeros));

// if folding minimum is non-zero then both vars must be non-zero
return (vget_lane_u32(vpmin_u32(reduced, reduced), 0) != 0);
}

// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
Expand Down
13 changes: 7 additions & 6 deletions tests/impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9228,14 +9228,15 @@ result_t test_mm_test_mix_ones_zeros(const SSE2NEONTestImpl &impl,
__m128i a = load_m128i(_a);
__m128i mask = load_m128i(_mask);

int32_t d0 = !((_a[0]) & _mask[0]) & !((!_a[0]) & _mask[0]);
int32_t d1 = !((_a[1]) & _mask[1]) & !((!_a[1]) & _mask[1]);
int32_t d2 = !((_a[2]) & _mask[2]) & !((!_a[2]) & _mask[2]);
int32_t d3 = !((_a[3]) & _mask[3]) & !((!_a[3]) & _mask[3]);
int32_t result = ((d0 & d1 & d2 & d3) == 0) ? 1 : 0;
int32_t ZF = 1;
int32_t CF = 1;
for (int i = 0; i < 4; i++) {
ZF &= ((_a[i] & _mask[i]) == 0);
CF &= ((~_a[i] & _mask[i]) == 0);
}
int32_t result = (ZF == 0 && CF == 0);

int32_t ret = _mm_test_mix_ones_zeros(a, mask);

return result == ret ? TEST_SUCCESS : TEST_FAIL;
}

Expand Down

0 comments on commit e5c5eff

Please sign in to comment.