From ac965c8c77ecb638d37c38b438df02b76fe9d93b Mon Sep 17 00:00:00 2001
From: Ben Lawrence <ben.lawrence@nanoporetech.com>
Date: Fri, 6 Dec 2024 12:08:52 +0000
Subject: [PATCH] Add NEON implementation of validate

Nothing fancy, just processing 4 keys per vector. In both micro- and
macro-benchmarks this performs at basically the same speed as the loop
added in the previous commit (tested with clang), both of which are
significantly faster than the original version.
---
 src/streamvbyte_arm_decode.c | 28 ++++++++++++++++++++++++++++
 src/streamvbyte_decode.c     |  4 ++++
 2 files changed, 32 insertions(+)

diff --git a/src/streamvbyte_arm_decode.c b/src/streamvbyte_arm_decode.c
index 02caaba..7480255 100644
--- a/src/streamvbyte_arm_decode.c
+++ b/src/streamvbyte_arm_decode.c
@@ -51,4 +51,32 @@ static const uint8_t *svb_decode_vector(uint32_t *out, const uint8_t *keyPtr, co
 
   return dataPtr;
 }
+
+static uint64_t svb_validate_vector(const uint8_t **keyPtrPtr,
+                                    uint32_t *countPtr) {
+  // Handle the largest multiple of 8 values here; the caller keeps the rest
+  const uint32_t count = *countPtr & ~7U;
+  const uint8_t *keyPtr = *keyPtrPtr;
+  *countPtr &= 7;  // 0-7 values are left for the caller
+  *keyPtrPtr += count / 4;  // the loop below reads one key byte per 4 values
+
+  // Lane i gets (key >> 2*i) & 3: vshlq with a negative count shifts right
+  const int32x4_t shifts = {0, -2, -4, -6};
+  const uint32x4_t mask = vdupq_n_u32(3);
+  uint32x4_t acc0 = vdupq_n_u32(0);  // lanes gain <= 3 per pass: no u32 wrap
+  uint32x4_t acc1 = vdupq_n_u32(0);
+
+  // Two key bytes per pass; unrolling further doesn't seem to improve speed
+  for (uint32_t c = 0; c < count; c += 8) {
+    uint32x4_t shifted0 = vshlq_u32(vdupq_n_u32(*keyPtr++), shifts);
+    acc0 = vaddq_u32(acc0, vandq_u32(shifted0, mask));
+    uint32x4_t shifted1 = vshlq_u32(vdupq_n_u32(*keyPtr++), shifts);
+    acc1 = vaddq_u32(acc1, vandq_u32(shifted1, mask));
+  }
+
+  // Widen and sum every lane, then add the +1 for each element (count)
+  uint64x2_t sum0 = vpaddlq_u32(acc0);
+  uint64x2_t sum1 = vpaddlq_u32(acc1);
+  return sum0[0] + sum0[1] + sum1[0] + sum1[1] + count;
+}
 #endif
diff --git a/src/streamvbyte_decode.c b/src/streamvbyte_decode.c
index 0061c49..b99c7e3 100644
--- a/src/streamvbyte_decode.c
+++ b/src/streamvbyte_decode.c
@@ -105,6 +105,10 @@ bool streamvbyte_validate_stream(const uint8_t *in, size_t inCount,
   const uint8_t *keyPtr = in;
   uint64_t encodedSize = 0;
 
+#if defined(__ARM_NEON__)
+  encodedSize = svb_validate_vector(&keyPtr, &outCount);
+#endif
+
   // Give the compiler a hint that it can avoid branches in the inner loop
   for (uint32_t c = 0; c < outCount / 4; c++) {
     uint32_t key = *keyPtr++;