From 4d8749d872ba805257e2ab22f8ea263fff211c53 Mon Sep 17 00:00:00 2001 From: Denis Yaroshevskiy Date: Thu, 9 May 2024 02:59:42 -0700 Subject: [PATCH] folly::findFixed (#2183) Summary: Pull Request resolved: https://github.com/facebook/folly/pull/2183 folly::findFixed - a utility for doing a linear search in up to 64 bytes very fast. Current version optimize 16 and 32 bytes very well - that's the ones I really need. Other are reasonably OK, also with experimentation we can maybe squeeze out more. UPD: I also extracted `folly::movemask` - this is generally useful, also even more niche. Reviewed By: dmm-fb Differential Revision: D56714024 fbshipit-source-id: 7b1962f7fc75b949125e3d6f01922f382b6431f2 --- CMakeLists.txt | 3 + folly/BUCK | 5 + folly/algorithm/simd/BUCK | 23 ++ folly/algorithm/simd/FindFixed.h | 308 ++++++++++++++++++ folly/algorithm/simd/Movemask.h | 147 +++++++++ folly/algorithm/simd/test/BUCK | 38 +++ .../simd/test/FindFixedBenchmark.cpp | 126 +++++++ folly/algorithm/simd/test/FindFixedTest.cpp | 138 ++++++++ folly/algorithm/simd/test/MovemaskTest.cpp | 107 ++++++ folly/detail/BUCK | 1 + folly/detail/SimdCharPlatform.h | 53 ++- 11 files changed, 917 insertions(+), 32 deletions(-) create mode 100644 folly/algorithm/simd/BUCK create mode 100644 folly/algorithm/simd/FindFixed.h create mode 100644 folly/algorithm/simd/Movemask.h create mode 100644 folly/algorithm/simd/test/BUCK create mode 100644 folly/algorithm/simd/test/FindFixedBenchmark.cpp create mode 100644 folly/algorithm/simd/test/FindFixedTest.cpp create mode 100644 folly/algorithm/simd/test/MovemaskTest.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 63516863780..b6f37ae7f6b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -598,6 +598,9 @@ if (BUILD_TESTS OR BUILD_BENCHMARKS) apply_folly_compile_options_to_target(folly_test_support) folly_define_tests( + DIRECTORY algorithm/simd/test/ + TEST find_fixed_test SOURCES FindFixedTest.cpp + DIRECTORY chrono/test/ TEST chrono_conv_test WINDOWS_DISABLED SOURCES ConvTest.cpp diff --git a/folly/BUCK b/folly/BUCK index d020a3f135a..63ed7d960c9 100644 --- a/folly/BUCK +++ b/folly/BUCK @@ -523,6 +523,11 @@ cpp_library( ], ) +cpp_library( + name = "findFixed", + headers = ["FindFixed.h"], +) + cpp_library( name = "fingerprint", srcs = ["Fingerprint.cpp"], diff --git a/folly/algorithm/simd/BUCK b/folly/algorithm/simd/BUCK new file mode 100644 index 00000000000..c1057ab4e84 --- /dev/null +++ b/folly/algorithm/simd/BUCK @@ -0,0 +1,23 @@ +###################################################################### +# Libraries + +load("@fbcode_macros//build_defs:cpp_library.bzl", "cpp_library") + +oncall("fbcode_entropy_wardens_folly") + +cpp_library( + name = "movemask", + headers = ["Movemask.h"], + exported_deps = [ + "//folly:portability", + ], +) + +cpp_library( + name = "findFixed", + headers = ["FindFixed.h"], + exported_deps = [ + ":movemask", + "//folly:portability", + ], +) diff --git a/folly/algorithm/simd/FindFixed.h b/folly/algorithm/simd/FindFixed.h new file mode 100644 index 00000000000..c76c8022341 --- /dev/null +++ b/folly/algorithm/simd/FindFixed.h @@ -0,0 +1,308 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#if FOLLY_X64 +#include +#endif + +#if FOLLY_AARCH64 +#include +#endif + +namespace folly { + +namespace detail { + +// Note: using std::same_as will just be slower to compile than is_same_v +template +concept SimdFriendlyType = + (std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v); + +} // namespace detail + +template +concept FollyFindFixedSupportedType = detail::SimdFriendlyType || + (std::is_enum_v && detail::SimdFriendlyType>); + +/* + * # folly::findFixed + * + * A function to linear search in number of elements, known at compiled time. + * + * Example: + * std::vector v {1, 3, 1, 2}; + * std::span vspan(v.data(), 4); + * auto m0 = folly::findFixed(vspan, 3); // m0 == 1; + * auto m1 = folly::findFixed(vspan, 5); // m0 == std::nullopt; + * + * Supported types: + * any 8,16,32,64 bit integers + * enums + * + * Max supported size of the range is 64 bytes. + */ +template < + FollyFindFixedSupportedType T, + std::convertible_to U, + std::size_t N> +constexpr std::optional findFixed(std::span where, U x) + requires(sizeof(T) * N <= 64); + +// implementation --------------------------------------------------------- + +namespace find_fixed_detail { +template +std::optional findFixedCast(std::span& where, T x) { + std::span whereU{reinterpret_cast(where.data()), N}; + return findFixed(whereU, static_cast(x)); +} + +template +constexpr std::optional findFixedConstexpr( + std::span where, T x) { + std::size_t res = 0; + for (T e : where) { + if (e == x) { + return res; + } + ++res; + } + return std::nullopt; +} + +// clang just checks all elements one by one, without any vectorization. +// even for not very friendly to SIMD cases we could do better but for +// now only special powers of 2 were interesting. +template +std::optional findFixedLetTheCompilerDoIt( + std::span where, T x) { + // this get's unrolled by both clang and gcc. + // Experimenting with more complex ways of writing this code + // didn't yield any results. + return findFixedConstexpr(std::span(where), x); +} + +#if FOLLY_X64 +#if defined(__AVX2__) +constexpr std::size_t kMaxSimdRegister = 32; +#else +constexpr std::size_t kMaxSimdRegister = 16; +#endif +#elif FOLLY_AARCH64 +constexpr std::size_t kMaxSimdRegister = 16; +#else +constexpr std::size_t kMaxSimdRegister = 1; +#endif + +template +std::optional find8bytes(const T* from, T x); +template +std::optional find16bytes(const T* from, T x); +template +std::optional find32bytes(const T* from, T x); + +template +std::optional find2Overlaping(std::span where, T x); + +template +std::optional findSplitFirstRegister( + std::span where, T x); + +template +std::optional findFixedDispatch(std::span where, T x) { + constexpr std::size_t kNumBytes = N * sizeof(T); + + if constexpr (N == 0) { + return std::nullopt; + } else if constexpr (N <= 2 || kNumBytes < 8 || kMaxSimdRegister == 1) { + return findFixedLetTheCompilerDoIt(where, x); + } else if constexpr (kNumBytes == 8) { + return find8bytes(where.data(), x); + } else if constexpr (kNumBytes == 16) { + return find16bytes(where.data(), x); + } else if constexpr (kMaxSimdRegister >= 32 && kNumBytes == 32) { + return find32bytes(where.data(), x); + } else if constexpr (kMaxSimdRegister * 2 <= kNumBytes) { + return findSplitFirstRegister(where, x); + } else { + // we can maybe do one better here probably with either out of bounds + // loads or combined two register search but it's ok for now. + return find2Overlaping(where, x); + } +} + +template +std::optional find2Overlaping(std::span where, T x) { + constexpr std::size_t kRegSize = std::bit_floor(N); + + std::span firstOverlap(where.data(), kRegSize); + if (auto res = findFixed(firstOverlap, x)) { + return res; + } + + std::span secondOverlap( + where.data() + (N - kRegSize), kRegSize); + if (auto res = findFixed(secondOverlap, x)) { + return *res + (N - kRegSize); + } + return std::nullopt; +} + +template +std::optional findSplitFirstRegister( + std::span where, T x) { + constexpr std::size_t kRegSize = kMaxSimdRegister / sizeof(T); + + std::span head(where.data(), kRegSize); + if (auto res = findFixed(head, x)) { + return res; + } + + std::span tail(where.data() + kRegSize, N - kRegSize); + if (auto res = findFixed(tail, x)) { + return *res + kRegSize; + } + return std::nullopt; +} + +template +std::optional firstTrue(Reg reg) { + auto [bits, bitsPerElement] = folly::movemask(reg); + if (bits) { + return std::countr_zero(bits) / bitsPerElement(); + } + return std::nullopt; +} + +#if FOLLY_X64 + +template +std::optional find16ByteReg(__m128i reg, T x) { + if constexpr (sizeof(T) == 1) { + return firstTrue(_mm_cmpeq_epi8(reg, _mm_set1_epi8(x))); + } else if constexpr (sizeof(T) == 2) { + return firstTrue(_mm_cmpeq_epi16(reg, _mm_set1_epi16(x))); + } else if constexpr (sizeof(T) == 4) { + return firstTrue(_mm_cmpeq_epi32(reg, _mm_set1_epi32(x))); + } +} + +template +std::optional find8bytes(const T* from, T x) { + std::uint64_t reg; + std::memcpy(®, from, 8); + return find16ByteReg(_mm_set1_epi64x(reg), x); +} + +template +std::optional find16bytes(const T* from, T x) { + __m128i reg = _mm_loadu_si128(reinterpret_cast(from)); + return find16ByteReg(reg, x); +} + +#if defined(__AVX2__) +template +std::optional find32ByteReg(__m256i reg, T x) { + if constexpr (sizeof(T) == 1) { + return firstTrue(_mm256_cmpeq_epi8(reg, _mm256_set1_epi8(x))); + } else if constexpr (sizeof(T) == 2) { + return firstTrue(_mm256_cmpeq_epi16(reg, _mm256_set1_epi16(x))); + } else if constexpr (sizeof(T) == 4) { + return firstTrue(_mm256_cmpeq_epi32(reg, _mm256_set1_epi32(x))); + } else if constexpr (sizeof(T) == 8) { + return firstTrue(_mm256_cmpeq_epi64(reg, _mm256_set1_epi64x(x))); + } +} + +template +std::optional find32bytes(const T* from, T x) { + __m256i reg = _mm256_loadu_si256(reinterpret_cast(from)); + return find32ByteReg(reg, x); +} + +#endif +#endif + +#if FOLLY_AARCH64 + +template +std::optional find8bytes(const T* from, T x) { + if constexpr (std::same_as) { + return firstTrue(vceq_u8(vld1_u8(from), vdup_n_u8(x))); + } else if constexpr (std::same_as) { + return firstTrue(vceq_u16(vld1_u16(from), vdup_n_u16(x))); + } else { + return firstTrue(vceq_u32(vld1_u32(from), vdup_n_u32(x))); + } +} + +template +std::optional find16bytes(const T* from, T x) { + if constexpr (std::same_as) { + return firstTrue(vceqq_u8(vld1q_u8(from), vdupq_n_u8(x))); + } else if constexpr (std::same_as) { + return firstTrue(vceqq_u16(vld1q_u16(from), vdupq_n_u16(x))); + } else if constexpr (std::same_as) { + return firstTrue(vceqq_u32(vld1q_u32(from), vdupq_n_u32(x))); + } else { + return firstTrue(vceqq_u64(vld1q_u64(from), vdupq_n_u64(x))); + } +} + +#endif + +} // namespace find_fixed_detail + +template < + FollyFindFixedSupportedType T, + std::convertible_to U, + std::size_t N> +constexpr std::optional findFixed(std::span where, U x) + requires(sizeof(T) * N <= 64) +{ + if constexpr (!std::is_same_v) { + return findFixed(where, static_cast(x)); + } else if (std::is_constant_evaluated()) { + return find_fixed_detail::findFixedConstexpr(std::span(where), x); + } else if constexpr (std::is_enum_v) { + return find_fixed_detail::findFixedCast>( + where, x); + } else if constexpr (std::is_signed_v) { + return find_fixed_detail::findFixedCast>(where, x); + } else { + return find_fixed_detail::findFixedDispatch(where, x); + } +} + +} // namespace folly diff --git a/folly/algorithm/simd/Movemask.h b/folly/algorithm/simd/Movemask.h new file mode 100644 index 00000000000..1487304e82c --- /dev/null +++ b/folly/algorithm/simd/Movemask.h @@ -0,0 +1,147 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include +#include + +#if FOLLY_X64 +#include +#endif + +#if FOLLY_AARCH64 +#include +#endif + +FOLLY_PUSH_WARNING +FOLLY_GCC_DISABLE_WARNING("-Wignored-attributes") + +namespace folly { + +/* + * This is a low level utility used for simd search algorithms. + * At the moment used in folly::findFixed and folly::split. + * + * Logical extension of _mm_movemask_epi8 for different types + * for both x86 and arm. + * + * Interface looks like this: + * folly::movemask<-scalar type->(nativeRegister) + * -> std::pair; + * + * Bits - unsigned integral, containing the bitmask (first is lowest bit). + * BitsPerElement - std::integral_constant with number of bits per element + * + * Example: + * + * std::optional firstTrueUint16(auto simdRegister) { + * auto [bits, bitsPerElement] = + * folly::movemask(simdRegister); + * if (!bits) { + * return std::nullopt; + * } + * return std::countl_zero(bits) / bitsPerElement(); + * } + * + * Arm implementation is based on: + * https://github.com/jfalcou/eve/blob/a2e2cf539e36e9a3326800194ad5206a8ef3f5b7/include/eve/detail/function/simd/arm/neon/movemask.hpp#L48 + * + */ + +#if FOLLY_X64 + +template +auto movemask(Reg reg) { + std::integral_constant + bitsPerElement; + auto mmask = static_cast([&] { + if constexpr (std::is_same_v) { + if constexpr (sizeof(Scalar) <= 2) { + return _mm_movemask_epi8(reg); + } else if constexpr (sizeof(Scalar) == 4) { + return _mm_movemask_ps(_mm_castsi128_ps(reg)); + } else if constexpr (sizeof(Scalar) == 8) { + return _mm_movemask_pd(_mm_castsi128_pd(reg)); + } + } +#if defined(__AVX2__) + else if constexpr (std::is_same_v) { + if constexpr (sizeof(Scalar) <= 2) { + return _mm256_movemask_epi8(reg); + } else if constexpr (sizeof(Scalar) == 4) { + return _mm256_movemask_ps(_mm256_castsi256_ps(reg)); + } else if constexpr (sizeof(Scalar) == 8) { + return _mm256_movemask_pd(_mm256_castsi256_pd(reg)); + } + } +#endif + }()); + return std::pair{mmask, bitsPerElement}; +} + +#endif + +#if FOLLY_AARCH64 + +namespace detail { + +inline auto movemaskChars16Aarch64(uint8x16_t reg) { + uint16x8_t u16s = vreinterpretq_u16_u8(reg); + u16s = vshrq_n_u16(u16s, 4); + uint8x8_t packed = vmovn_u16(u16s); + std::uint64_t bits = vget_lane_u64(vreinterpret_u64_u8(packed), 0); + return std::pair{bits, std::integral_constant{}}; +} + +template +uint64x1_t asUint64x1Aarch64(Reg reg) { + if constexpr (std::is_same_v) { + return vreinterpret_u64_u32(reg); + } else if constexpr (std::is_same_v) { + return vreinterpret_u64_u16(reg); + } else { + return vreinterpret_u64_u8(reg); + } +} + +} // namespace detail + +template +auto movemask(Reg reg) { + if constexpr (std::is_same_v) { + return movemask(vmovn_u64(reg)); + } else if constexpr (std::is_same_v) { + return movemask(vmovn_u32(reg)); + } else if constexpr (std::is_same_v) { + return movemask(vmovn_u16(reg)); + } else if constexpr (std::is_same_v) { + return detail::movemaskChars16Aarch64(reg); + } else { + std::uint64_t mmask = vget_lane_u64(detail::asUint64x1Aarch64(reg), 0); + return std::pair{ + mmask, std::integral_constant{}}; + } +} + +#endif + +} // namespace folly + +FOLLY_POP_WARNING diff --git a/folly/algorithm/simd/test/BUCK b/folly/algorithm/simd/test/BUCK new file mode 100644 index 00000000000..8df0df3e2b4 --- /dev/null +++ b/folly/algorithm/simd/test/BUCK @@ -0,0 +1,38 @@ +load("@fbcode_macros//build_defs:cpp_benchmark.bzl", "cpp_benchmark") +load("@fbcode_macros//build_defs:cpp_unittest.bzl", "cpp_unittest") + +oncall("fbcode_entropy_wardens_folly") + +cpp_unittest( + name = "momemask_test", + srcs = ["MovemaskTest.cpp"], + headers = [], + deps = [ + "//folly:portability", + "//folly/algorithm/simd:movemask", + "//folly/portability:gtest", + ], +) + +cpp_unittest( + name = "findfixed_test", + srcs = ["FindFixedTest.cpp"], + headers = [], + deps = [ + "fbsource//third-party/fmt:fmt", + "//folly:portability", + "//folly/algorithm/simd:findFixed", + "//folly/portability:gtest", + ], +) + +cpp_benchmark( + name = "findfixed_bench", + srcs = ["FindFixedBenchmark.cpp"], + deps = [ + "fbsource//third-party/fmt:fmt", + "//folly:benchmark", + "//folly/algorithm/simd:findFixed", + "//folly/init:init", + ], +) diff --git a/folly/algorithm/simd/test/FindFixedBenchmark.cpp b/folly/algorithm/simd/test/FindFixedBenchmark.cpp new file mode 100644 index 00000000000..3800795fabf --- /dev/null +++ b/folly/algorithm/simd/test/FindFixedBenchmark.cpp @@ -0,0 +1,126 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include + +#include + +#include +#include +#include +#include +#include + +namespace folly { +namespace { + +template +const auto kInput = [] { + std::vector res(N, T{0}); + std::iota(res.begin(), res.end(), T{0}); + return res; +}(); + +template +using IndexConstant = std::integral_constant; + +template +void stdFindBenchmark(unsigned n) { + const auto& in = kInput; + + while (n--) { + for (auto x : in) { + folly::doNotOptimizeAway(std::ranges::find(in, x)); + } + } +} + +template +void follyFindFixedBenchmark(unsigned n) { + const auto& in = kInput; + + while (n--) { + for (auto x : in) { + std::span s(in.data(), N); + folly::doNotOptimizeAway(folly::findFixed(s, x)); + } + } +} + +template +void registerBenchmark(T, IndexConstant) { + (void)kInput; + + folly::addBenchmark( + __FILE__, + fmt::format( + "total size:{}, sizeof(T):{} std::find", N * sizeof(T), sizeof(T)), + [](unsigned n) -> unsigned { + stdFindBenchmark(n); + return n; + }); + folly::addBenchmark( + __FILE__, + fmt::format( + "total size:{}, sizeof(T):{} folly::findFixed", + N * sizeof(T), + sizeof(T)), + [](unsigned n) -> unsigned { + follyFindFixedBenchmark(n); + return n; + }); +} + +void drawLine() { + folly::addBenchmark(__FILE__, "-", []() -> unsigned { return 0; }); +} + +void registerAllBenchmarks() { + // 8 bytes + registerBenchmark(std::int8_t{}, IndexConstant<8>{}); + registerBenchmark(std::int16_t{}, IndexConstant<4>{}); + registerBenchmark(std::int32_t{}, IndexConstant<2>{}); + drawLine(); + // 16 bytes + registerBenchmark(std::int8_t{}, IndexConstant<16>{}); + registerBenchmark(std::int16_t{}, IndexConstant<8>{}); + registerBenchmark(std::int32_t{}, IndexConstant<4>{}); + registerBenchmark(std::int64_t{}, IndexConstant<2>{}); + drawLine(); + // 32 bytes + registerBenchmark(std::int8_t{}, IndexConstant<32>{}); + registerBenchmark(std::int16_t{}, IndexConstant<16>{}); + registerBenchmark(std::int32_t{}, IndexConstant<8>{}); + registerBenchmark(std::int64_t{}, IndexConstant<4>{}); + drawLine(); + // 40 bytes + registerBenchmark(std::int8_t{}, IndexConstant<40>{}); + registerBenchmark(std::int16_t{}, IndexConstant<20>{}); + registerBenchmark(std::int32_t{}, IndexConstant<10>{}); + registerBenchmark(std::int64_t{}, IndexConstant<5>{}); +} + +} // namespace +} // namespace folly + +int main(int argc, char** argv) { + folly::Init init(&argc, &argv, true); + folly::registerAllBenchmarks(); + folly::runBenchmarks(); +} diff --git a/folly/algorithm/simd/test/FindFixedTest.cpp b/folly/algorithm/simd/test/FindFixedTest.cpp new file mode 100644 index 00000000000..7628051ef5b --- /dev/null +++ b/folly/algorithm/simd/test/FindFixedTest.cpp @@ -0,0 +1,138 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#ifdef __cpp_lib_concepts // these tests need C++ concepts + +#include + +#include +#include + +#include + +#include +#include +#include +#include + +namespace { + +template +void allTestsForN(std::span buf) { + auto errorMsg = [&] { + return fmt::format("looking in: {}, sizeof(T): {}", buf, sizeof(T)); + }; + + T foundX = static_cast(0); + T notFoundX = static_cast(1); + + std::fill_n(buf.begin(), N, foundX); + std::span cbuf = buf; + ASSERT_EQ(std::nullopt, folly::findFixed(cbuf, notFoundX)) << errorMsg(); + if (N == 0) { + return; + } + + ASSERT_EQ(0, folly::findFixed(cbuf, foundX)) << errorMsg(); + for (std::size_t found = 1; found < N; ++found) { + buf[found - 1] = notFoundX; + ASSERT_EQ(found, folly::findFixed(cbuf, foundX)) << errorMsg(); + } +} + +template +void allTestsForImpl(std::span buf, std::index_sequence) { + (allTestsForN(std::span(buf.data(), idx)), ...); +} + +template +void allTestsFor() { + constexpr std::size_t kMaxSize = 64 / sizeof(T); + std::vector buf; + buf.resize(2 * kMaxSize, static_cast(0)); + + // simd code can depend on alignment, so we better test it + for (std::size_t offset = 0; offset != kMaxSize; ++offset) { + ASSERT_NO_FATAL_FAILURE(allTestsForImpl( + std::span(buf.data() + offset, kMaxSize), + std::make_index_sequence{})) + << offset; + } +} + +} // namespace + +TEST(FindFixed, Basic) { + // Int is an important case, it should work + ASSERT_NO_FATAL_FAILURE(allTestsFor()); + + ASSERT_NO_FATAL_FAILURE(allTestsFor()); + ASSERT_NO_FATAL_FAILURE(allTestsFor()); + ASSERT_NO_FATAL_FAILURE(allTestsFor()); + ASSERT_NO_FATAL_FAILURE(allTestsFor()); + + ASSERT_NO_FATAL_FAILURE(allTestsFor()); + ASSERT_NO_FATAL_FAILURE(allTestsFor()); + ASSERT_NO_FATAL_FAILURE(allTestsFor()); + ASSERT_NO_FATAL_FAILURE(allTestsFor()); + + // just a enum test + ASSERT_NO_FATAL_FAILURE(allTestsFor()); +} + +TEST(FindFixed, Interfaces) { + // constexpr + { + constexpr std::array arr{1, 2, 3}; + static_assert(folly::findFixed(std::span(std::as_const(arr)), 1) == 0); + static_assert(folly::findFixed(std::span(std::as_const(arr)), 3) == 2); + static_assert( + folly::findFixed(std::span(std::as_const(arr)), 4) == std::nullopt); + } + + // array + { + std::array arr{1, 2, 3}; + ASSERT_EQ(std::nullopt, folly::findFixed(std::span(std::as_const(arr)), 0)); + ASSERT_EQ(1, folly::findFixed(std::span(std::as_const(arr)), 2)); + } + + // mutable span + { + std::array arr{1, 2, 3}; + std::span s(arr); + ASSERT_EQ(std::nullopt, folly::findFixed(std::span(s), 0)); + ASSERT_EQ(1, folly::findFixed(std::span(s), 2)); + } +} + +template +concept findFixedWorksFor = requires(const T& x) { + { folly::findFixed(x) }; +}; + +TEST(FollyFindFixed, SfianeFriendlyUnsupportedTypes) { + EXPECT_FALSE(findFixedWorksFor>) + << "dynamic extend is not supported"; + EXPECT_FALSE(findFixedWorksFor>) + << "vector is dynamic size by definition"; + EXPECT_FALSE((findFixedWorksFor, 3>>)) + << "find fixed works only for some trivial types."; +} + +#endif diff --git a/folly/algorithm/simd/test/MovemaskTest.cpp b/folly/algorithm/simd/test/MovemaskTest.cpp new file mode 100644 index 00000000000..b97fb13630b --- /dev/null +++ b/folly/algorithm/simd/test/MovemaskTest.cpp @@ -0,0 +1,107 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include + +#include +#include +#include + +#if FOLLY_X64 +#include +#endif + +#if FOLLY_AARCH64 +#include +#endif + +template +Reg loadReg(const std::array& arr) { + Reg res; + std::memcpy(&res, arr.data(), sizeof(T) * N); + return res; +} + +std::uint64_t safeShift(std::uint64_t what, std::uint32_t shift) { + if (!shift) { + return what; + } + what <<= shift - 1; + what <<= 1; + return what; +} + +template +void allOneTrueTests() { + constexpr auto kTrue = static_cast(-1); + constexpr auto kFalse = static_cast(0); + + std::array arr; + arr.fill(kFalse); + + ASSERT_EQ(0, folly::movemask(loadReg(arr)).first); + + for (std::size_t i = 0; i != N; ++i) { + arr[i] = kTrue; + auto [bits, bitsPerElement] = folly::movemask(loadReg(arr)); + std::uint64_t oneElement = safeShift(1, bitsPerElement()) - 1; + std::uint64_t expectedBits = safeShift(oneElement, i * bitsPerElement()); + + ASSERT_EQ(expectedBits, bits) << "sizeof(T): " << sizeof(T) << " i: " << i; + arr[i] = kFalse; + } +} + +#if FOLLY_X64 + +TEST(Movemask, Sse2) { + allOneTrueTests<__m128i, std::uint8_t, 16>(); + allOneTrueTests<__m128i, std::uint16_t, 8>(); + allOneTrueTests<__m128i, std::uint32_t, 4>(); + allOneTrueTests<__m128i, std::uint64_t, 2>(); +} + +#if defined(__AVX2__) + +TEST(Movemask, Avx2) { + allOneTrueTests<__m256i, std::uint8_t, 32>(); + allOneTrueTests<__m256i, std::uint16_t, 16>(); + allOneTrueTests<__m256i, std::uint32_t, 8>(); + allOneTrueTests<__m256i, std::uint64_t, 4>(); +} + +#endif + +#endif + +#if FOLLY_AARCH64 + +TEST(Movemask, AARCH64) { + allOneTrueTests(); + allOneTrueTests(); + allOneTrueTests(); + allOneTrueTests(); + + allOneTrueTests(); + allOneTrueTests(); + allOneTrueTests(); + allOneTrueTests(); +} + +#endif diff --git a/folly/detail/BUCK b/folly/detail/BUCK index 500461b500b..c696b52f051 100644 --- a/folly/detail/BUCK +++ b/folly/detail/BUCK @@ -246,6 +246,7 @@ cpp_library( exported_deps = [ ":simd_for_each", "//folly:portability", + "//folly/algorithm/simd:movemask", "//folly/lang:bits", ], ) diff --git a/folly/detail/SimdCharPlatform.h b/folly/detail/SimdCharPlatform.h index dbf2118e7ad..dae27fe76c0 100644 --- a/folly/detail/SimdCharPlatform.h +++ b/folly/detail/SimdCharPlatform.h @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include @@ -76,9 +77,16 @@ namespace simd_detail { * */ +#if FOLLY_X64 || FOLLY_AARCH64 + template struct SimdCharPlatformCommon : Platform { - using mmask_t = typename Platform::mmask_t; + using logical_t = typename Platform::logical_t; + using movemask_result_t = + decltype(folly::movemask(logical_t{})); + using mmask_t = typename movemask_result_t::first_type; + static constexpr std::uint32_t kMmaskBitsPerElement = + typename movemask_result_t::second_type{}(); template FOLLY_NODISCARD FOLLY_ALWAYS_INLINE static Uint setLowerNBits(int n) { @@ -91,9 +99,9 @@ struct SimdCharPlatformCommon : Platform { FOLLY_NODISCARD FOLLY_ALWAYS_INLINE static mmask_t clear( mmask_t mmask, simd_detail::ignore_extrema ignore) { mmask_t clearFirst = - ~setLowerNBits(ignore.first * Platform::kMmaskBitsPerElement); + ~setLowerNBits(ignore.first * kMmaskBitsPerElement); mmask_t clearLast = setLowerNBits( - (Platform::kCardinal - ignore.last) * Platform::kMmaskBitsPerElement); + (Platform::kCardinal - ignore.last) * kMmaskBitsPerElement); return mmask & clearFirst & clearLast; } @@ -114,12 +122,17 @@ struct SimdCharPlatformCommon : Platform { return Platform::unsafeLoadu(ptr, simd_detail::ignore_none{}); } + FOLLY_ALWAYS_INLINE + static mmask_t movemask(logical_t log) { + return folly::movemask(log).first; + } + using Platform::any; FOLLY_ALWAYS_INLINE static bool any( typename Platform::logical_t log, simd_detail::ignore_extrema ignore) { - auto mmask = Platform::movemask(log); + auto mmask = movemask(log); mmask = clear(mmask, ignore); return mmask; } @@ -131,15 +144,15 @@ struct SimdCharPlatformCommon : Platform { } }; +#endif + #if FOLLY_X64 struct SimdCharSse2PlatformSpecific { using reg_t = __m128i; using logical_t = reg_t; - using mmask_t = std::uint16_t; static constexpr int kCardinal = 16; - static constexpr int kMmaskBitsPerElement = 1; // Even for aligned loads intel people don't recommend using // aligned load instruction @@ -172,12 +185,9 @@ struct SimdCharSse2PlatformSpecific { return _mm_or_si128(x, y); } - FOLLY_ALWAYS_INLINE - static mmask_t movemask(logical_t log) { return _mm_movemask_epi8(log); } - FOLLY_ALWAYS_INLINE static bool any(logical_t log, simd_detail::ignore_none) { - return movemask(log); + return folly::movemask(log).first; } }; @@ -191,10 +201,8 @@ using SimdCharSse2Platform = struct SimdCharAvx2PlatformSpecific { using reg_t = __m256i; using logical_t = reg_t; - using mmask_t = std::uint32_t; static constexpr int kCardinal = 32; - static constexpr int kMmaskBitsPerElement = 1; // We can actually use aligned loads but our Intel people don't recommend FOLLY_ALWAYS_INLINE @@ -225,12 +233,9 @@ struct SimdCharAvx2PlatformSpecific { return _mm256_or_si256(x, y); } - FOLLY_ALWAYS_INLINE - static mmask_t movemask(logical_t log) { return _mm256_movemask_epi8(log); } - FOLLY_ALWAYS_INLINE static bool any(logical_t log, simd_detail::ignore_none) { - return movemask(log); + return folly::movemask(log).first; } }; @@ -248,10 +253,8 @@ using SimdCharPlatform = SimdCharSse2Platform; struct SimdCharAarch64PlatformSpecific { using reg_t = uint8x16_t; using logical_t = reg_t; - using mmask_t = std::uint64_t; static constexpr int kCardinal = 16; - static constexpr int kMmaskBitsPerElement = 4; FOLLY_ALWAYS_INLINE static reg_t loadu(const char* p, simd_detail::ignore_none) { @@ -279,20 +282,6 @@ struct SimdCharAarch64PlatformSpecific { return vorrq_u8(x, y); } - FOLLY_ALWAYS_INLINE - static mmask_t movemask(logical_t log) { - // note: we tried doing any before movemask and it didn't help - // if you need movemask - do movemask. - // - // based on: - // https://github.com/jfalcou/eve/blob/5264e20c51aeca17675e67abf236ce1ead781c52/include/eve/detail/function/simd/arm/neon/movemask.hpp#L119 - // pack 4 bits into uint64 - uint16x8_t u16s = vreinterpretq_u16_u8(log); - u16s = vshrq_n_u16(u16s, 4); - uint8x8_t packed = vmovn_u16(u16s); - return vget_lane_u64(vreinterpret_u64_u8(packed), 0); - } - FOLLY_ALWAYS_INLINE static bool any(logical_t log, simd_detail::ignore_none) { return vmaxvq_u8(log);