From ac965c8c77ecb638d37c38b438df02b76fe9d93b Mon Sep 17 00:00:00 2001
From: Ben Lawrence
Date: Fri, 6 Dec 2024 12:08:52 +0000
Subject: [PATCH] Add NEON implementation of validate

Nothing fancy, just processing 4 keys per vector.

In both micro- and macro-benchmarks this performs at basically the same
speed as the loop added in the previous commit (tested with clang), both
of which are significantly faster than the original version.
---
 src/streamvbyte_arm_decode.c | 28 ++++++++++++++++++++++++++++
 src/streamvbyte_decode.c     |  4 ++++
 2 files changed, 32 insertions(+)

diff --git a/src/streamvbyte_arm_decode.c b/src/streamvbyte_arm_decode.c
index 02caaba..7480255 100644
--- a/src/streamvbyte_arm_decode.c
+++ b/src/streamvbyte_arm_decode.c
@@ -51,4 +51,32 @@ static const uint8_t *svb_decode_vector(uint32_t *out, const uint8_t *keyPtr, co
 
   return dataPtr;
 }
+
+static uint64_t svb_validate_vector(const uint8_t **keyPtrPtr,
+                                    uint32_t *countPtr) {
+  // Claim the largest multiple of 8 elements; *countPtr is left with the other 0-7
+  const uint32_t count = *countPtr & ~7U;
+  const uint8_t *keyPtr = *keyPtrPtr;
+  *countPtr &= 7;
+  *keyPtrPtr += count / 4;
+
+  // One 2-bit key per lane: negative vshlq counts shift right by 0, 2, 4 and 6 bits
+  const int32x4_t shifts = {0, -2, -4, -6};
+  const uint32x4_t mask = vdupq_n_u32(3);
+  uint32x4_t acc0 = vdupq_n_u32(0);
+  uint32x4_t acc1 = vdupq_n_u32(0);
+
+  // Two key bytes per pass: unrolling further doesn't seem to improve performance
+  for (uint32_t c = 0; c < count; c += 8) {
+    uint32x4_t shifted0 = vshlq_u32(vdupq_n_u32(*keyPtr++), shifts);
+    acc0 = vaddq_u32(acc0, vandq_u32(shifted0, mask));
+    uint32x4_t shifted1 = vshlq_u32(vdupq_n_u32(*keyPtr++), shifts);
+    acc1 = vaddq_u32(acc1, vandq_u32(shifted1, mask));
+  }
+
+  // Widen and total every lane of both sums, then add the +1 for each element (count)
+  uint64x2_t sum0 = vpaddlq_u32(acc0);
+  uint64x2_t sum1 = vpaddlq_u32(acc1);
+  return sum0[0] + sum0[1] + sum1[0] + sum1[1] + count;
+}
 #endif
diff --git a/src/streamvbyte_decode.c b/src/streamvbyte_decode.c
index 0061c49..b99c7e3 100644
--- a/src/streamvbyte_decode.c
+++ b/src/streamvbyte_decode.c
@@ -105,6 +105,10 @@ bool streamvbyte_validate_stream(const uint8_t *in, size_t inCount,
   const uint8_t *keyPtr = in;
   uint64_t encodedSize = 0;
 
+#if defined(__ARM_NEON__)
+  encodedSize = svb_validate_vector(&keyPtr, &outCount);
+#endif
+
   // Give the compiler a hint that it can avoid branches in the inner loop
   for (uint32_t c = 0; c < outCount / 4; c++) {
     uint32_t key = *keyPtr++;