From cffb1e79fd744a16eb32279752f95d04efe3596c Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 16 Nov 2024 20:22:53 +0200 Subject: [PATCH 1/5] unique vectorization --- benchmarks/CMakeLists.txt | 1 + benchmarks/src/unique.cpp | 47 ++++ stl/inc/algorithm | 74 +++++++ stl/src/vector_algorithms.cpp | 200 ++++++++++++++++++ .../VSO_0000000_vector_algorithms/test.cpp | 72 +++++++ 5 files changed, 394 insertions(+) create mode 100644 benchmarks/src/unique.cpp diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 4e4f47894e..852f1c147e 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -126,6 +126,7 @@ add_benchmark(search src/search.cpp) add_benchmark(std_copy src/std_copy.cpp) add_benchmark(sv_equal src/sv_equal.cpp) add_benchmark(swap_ranges src/swap_ranges.cpp) +add_benchmark(unique src/unique.cpp) add_benchmark(vector_bool_copy src/std/containers/sequences/vector.bool/copy/test.cpp) add_benchmark(vector_bool_copy_n src/std/containers/sequences/vector.bool/copy_n/test.cpp) diff --git a/benchmarks/src/unique.cpp b/benchmarks/src/unique.cpp new file mode 100644 index 0000000000..f12641c439 --- /dev/null +++ b/benchmarks/src/unique.cpp @@ -0,0 +1,47 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include +#include +#include +#include + +#include "skewed_allocator.hpp" + +enum class alg_type { std_fn, rng }; + +template +void u(benchmark::State& state) { + std::mt19937_64 gen(22033); + using TD = std::conditional_t; + std::binomial_distribution dis(5); + + std::vector> src; + src.resize(2552); + std::generate(src.begin(), src.end(), [&] { return static_cast(dis(gen)); }); + + std::vector> v; + v.reserve(src.size()); + for (auto _ : state) { + v = src; + benchmark::DoNotOptimize(v); + if constexpr (Type == alg_type::std_fn) { + benchmark::DoNotOptimize(std::unique(v.begin(), v.end())); + } else { + benchmark::DoNotOptimize(std::ranges::unique(v)); + } + } +} + +BENCHMARK(u); +BENCHMARK(u); +BENCHMARK(u); +BENCHMARK(u); + +BENCHMARK(u); +BENCHMARK(u); +BENCHMARK(u); +BENCHMARK(u); + +BENCHMARK_MAIN(); diff --git a/stl/inc/algorithm b/stl/inc/algorithm index 40f3f965d2..33a8e6740d 100644 --- a/stl/inc/algorithm +++ b/stl/inc/algorithm @@ -75,6 +75,11 @@ __declspec(noalias) void __stdcall __std_replace_4( void* _First, void* _Last, uint32_t _Old_val, uint32_t _New_val) noexcept; __declspec(noalias) void __stdcall __std_replace_8( void* _First, void* _Last, uint64_t _Old_val, uint64_t _New_val) noexcept; + +void* __stdcall __std_unique_1(void* _First, void* _Last) noexcept; +void* __stdcall __std_unique_2(void* _First, void* _Last) noexcept; +void* __stdcall __std_unique_4(void* _First, void* _Last) noexcept; +void* __stdcall __std_unique_8(void* _First, void* _Last) noexcept; } // extern "C" _STD_BEGIN @@ -207,6 +212,27 @@ __declspec(noalias) void _Replace_vectorized( } } +template +_Ty* _Unique_vectorized(_Ty* const _First, _Ty* const _Last) noexcept { + if constexpr (is_pointer_v<_Ty>) { +#ifdef _WIN64 + return reinterpret_cast<_Ty*>(::__std_unique_8(_First, _Last)); +#else + return reinterpret_cast<_Ty*>(::__std_unique_4(_First, _Last)); +#endif + } else if constexpr (sizeof(_Ty) == 1) { + return reinterpret_cast<_Ty*>(::__std_unique_1(_First, _Last)); + } else if constexpr (sizeof(_Ty) == 2) { + return reinterpret_cast<_Ty*>(::__std_unique_2(_First, _Last)); + } else if constexpr (sizeof(_Ty) == 4) { + return reinterpret_cast<_Ty*>(::__std_unique_4(_First, _Last)); + } else if 
constexpr (sizeof(_Ty) == 8) {
+        return reinterpret_cast<_Ty*>(::__std_unique_8(_First, _Last));
+    } else {
+        _STL_INTERNAL_STATIC_ASSERT(false); // Unexpected size
+    }
+}
+
 // Can we activate the vector algorithms for find_first_of?
 template <class _It1, class _It2, class _Pr>
 constexpr bool _Vector_alg_in_find_first_of_is_safe = _Equal_memcmp_is_safe<_It1, _It2, _Pr>;
@@ -221,6 +247,17 @@ template <class _Iter, class _Ty1, class _Ty2>
 constexpr bool _Vector_alg_in_ranges_replace_is_safe =
     _Vector_alg_in_replace_is_safe<_Iter, _Ty1> // can search and replace
     && _Vector_alg_in_find_is_safe_elem<_Ty2, _Iter_value_t<_Iter>>; // replacement fits
+
+// Can we activate the vector algorithms for unique?
+template <class _Iter, class _Pr>
+constexpr bool _Vector_alg_in_unique_is_safe =
+    _Iterator_is_contiguous<_Iter> && !_Iterator_is_volatile<_Iter> // Contiguous nonvolatile iterator
+    && _Is_any_of_v<_Pr,
+#if _HAS_CXX20
+           _RANGES equal_to,
+#endif // _HAS_CXX20
+           _STD equal_to<>> // default comparison
+    && disjunction_v<is_integral<_Iter_value_t<_Iter>>, is_pointer<_Iter_value_t<_Iter>>>; // bitwise comparable
 _STD_END
 
 #endif // _USE_STD_VECTOR_ALGORITHMS
@@ -4853,6 +4890,25 @@ _NODISCARD_UNIQUE_ALG _CONSTEXPR20 _FwdIt unique(_FwdIt _First, _FwdIt _Last, _P
     _STD _Adl_verify_range(_First, _Last);
     auto _UFirst      = _STD _Get_unwrapped(_First);
     const auto _ULast = _STD _Get_unwrapped(_Last);
+
+#if _USE_STD_VECTOR_ALGORITHMS
+    if constexpr (_Vector_alg_in_unique_is_safe<decltype(_UFirst), _Pr>) {
+        if (!_STD _Is_constant_evaluated()) {
+            const auto _First_ptr = _STD _To_address(_UFirst);
+            const auto _Result    = _STD _Unique_vectorized(_First_ptr, _STD _To_address(_ULast));
+
+            if constexpr (is_pointer_v<decltype(_UFirst)>) {
+                _UFirst = _Result;
+            } else {
+                _UFirst += _Result - _First_ptr;
+            }
+
+            _STD _Seek_wrapped(_Last, _UFirst);
+            return _Last;
+        }
+    }
+#endif // _USE_STD_VECTOR_ALGORITHMS
+
     if (_UFirst != _ULast) {
         for (auto _UFirstb = _UFirst; ++_UFirst != _ULast; _UFirstb = _UFirst) {
             if (_Pred(*_UFirstb, *_UFirst)) { // copy down
@@ -4929,6 +4985,24 @@ namespace ranges {
         _STL_INTERNAL_STATIC_ASSERT(sentinel_for<_Se, _It>);
         _STL_INTERNAL_STATIC_ASSERT(indirect_equivalence_relation<_Pr, projected<_It, _Pj>>);
 
+#if _USE_STD_VECTOR_ALGORITHMS
+        if constexpr (is_same_v<_Pj, identity> && sized_sentinel_for<_Se, _It>
+                      && _Vector_alg_in_unique_is_safe<_It, _Pr>) {
+            if (!_STD is_constant_evaluated()) {
+                const auto _Size      = _Last - _First;
+                const auto _First_ptr = _STD to_address(_First);
+                const auto _Last_ptr  = _First_ptr + static_cast<size_t>(_Size);
+                const auto _Result    = _STD _Unique_vectorized(_First_ptr, _Last_ptr);
+
+                if constexpr (is_pointer_v<_It>) {
+                    return {_Result, _Last_ptr};
+                } else {
+                    return {_First + (_Result - _First_ptr), _First + _Size};
+                }
+            }
+        }
+#endif // _USE_STD_VECTOR_ALGORITHMS
+
         auto _Current = _First;
         if (_First == _Last) {
             return {_STD move(_Current), _STD move(_First)};
diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp
index e89a0fba91..805b95323e 100644
--- a/stl/src/vector_algorithms.cpp
+++ b/stl/src/vector_algorithms.cpp
@@ -4134,6 +4134,24 @@ namespace {
         return _Dest;
     }
 
+    template <class _Ty>
+    void* _Unique_fallback(void* const _First, void* const _Last, void* const _Dest) {
+        _Ty* _Out = reinterpret_cast<_Ty*>(_Dest);
+        _Ty* _Src = reinterpret_cast<_Ty*>(_First);
+
+        while (_Src != _Last) {
+            if (*_Src != *_Out) {
+                ++_Out;
+                *_Out = *_Src;
+            }
+
+            ++_Src;
+        }
+
+        ++_Out;
+        return _Out;
+    }
+
 #ifndef _M_ARM64EC
     template
     struct _Remove_tables {
@@ -4324,6 +4342,188 @@ void* __stdcall __std_remove_8(void* _First, void* const _Last, const uint64_t _
     return _Remove_fallback(_First, _Last, _Out, _Val);
 }
 
+void* __stdcall __std_unique_1(void* _First, void* _Last) noexcept {
+    if (_First == _Last) {
+        return _First;
+    }
+
+    void* _Dest = _First;
+    _Advance_bytes(_First, 1);
+
+#ifndef _M_ARM64EC
+    if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_sse42() && _Size_bytes >= 8) {
+        _Advance_bytes(_Dest, 1);
+
+        void* _Stop = _First;
+        _Advance_bytes(_Stop, _Size_bytes & ~size_t{7});
+        do {
+            const __m128i _Src    = _mm_loadu_si64(_First);
+            void* _First_d        = _First;
+            _Rewind_bytes(_First_d, 1);
+            const __m128i _Match  = _mm_loadu_si64(_First_d);
+            const uint32_t _Bingo = _mm_movemask_epi8(_mm_cmpeq_epi8(_Src, _Match)) & 0xFF;
+            const __m128i _Shuf   = _mm_loadu_si64(_Remove_tables_1_sse._Shuf[_Bingo]);
+            const __m128i _Out    = _mm_shuffle_epi8(_Src, _Shuf);
+            _mm_storeu_si64(_Dest, _Out);
+            _Advance_bytes(_Dest, _Remove_tables_1_sse._Size[_Bingo]);
+            _Advance_bytes(_First, 8);
+        } while (_First != _Stop);
+
+        _Rewind_bytes(_Dest, 1);
+    }
+#endif // !defined(_M_ARM64EC)
+
+    return _Unique_fallback<uint8_t>(_First, _Last, _Dest);
+}
+
+void* __stdcall __std_unique_2(void* _First, void* _Last) noexcept {
+    if (_First == _Last) {
+        return _First;
+    }
+
+    void* _Dest = _First;
+    _Advance_bytes(_First, 2);
+
+#ifndef _M_ARM64EC
+    if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_sse42() && _Size_bytes >= 16) {
+        _Advance_bytes(_Dest, 2);
+
+        void* _Stop = _First;
+        _Advance_bytes(_Stop, _Size_bytes & ~size_t{0xF});
+        do {
+            const __m128i _Src    = _mm_loadu_si128(reinterpret_cast<const __m128i*>(_First));
+            void* _First_d        = _First;
+            _Rewind_bytes(_First_d, 2);
+            const __m128i _Match  = _mm_loadu_si128(reinterpret_cast<const __m128i*>(_First_d));
+            const __m128i _Mask   = _mm_cmpeq_epi16(_Src, _Match);
+            const uint32_t _Bingo = _mm_movemask_epi8(_mm_packs_epi16(_Mask, _mm_setzero_si128()));
+            const __m128i _Shuf   = _mm_loadu_si128(reinterpret_cast<const __m128i*>(_Remove_tables_2_sse._Shuf[_Bingo]));
+            const __m128i _Out    = _mm_shuffle_epi8(_Src, _Shuf);
+            _mm_storeu_si128(reinterpret_cast<__m128i*>(_Dest), _Out);
+            _Advance_bytes(_Dest, _Remove_tables_2_sse._Size[_Bingo]);
+            _Advance_bytes(_First, 16);
+        } while (_First != _Stop);
+
+        _Rewind_bytes(_Dest, 2);
+    }
+#endif // !defined(_M_ARM64EC)
+
+    return _Unique_fallback<uint16_t>(_First, _Last, _Dest);
+}
+
+void* __stdcall __std_unique_4(void* _First, void* _Last) noexcept {
+    if (_First == _Last) {
+        return _First;
+    }
+
+    void* _Dest = _First;
+    _Advance_bytes(_First, 4);
+
+#ifndef _M_ARM64EC
+    if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_avx2() && _Size_bytes >= 32) {
+        _Advance_bytes(_Dest, 4);
+
+        void* _Stop = _First;
+        _Advance_bytes(_Stop, _Size_bytes & ~size_t{0x1F});
+
+        do {
+            const __m256i _Src    = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(_First));
+            void* _First_d        = _First;
+            _Rewind_bytes(_First_d, 4);
+            const __m256i _Match  = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(_First_d));
+            const __m256i _Mask   = _mm256_cmpeq_epi32(_Src, _Match);
+            const uint32_t _Bingo = _mm256_movemask_ps(_mm256_castsi256_ps(_Mask));
+            const __m256i _Shuf   = _mm256_cvtepu8_epi32(_mm_loadu_si64(_Remove_tables_4_avx._Shuf[_Bingo]));
+            const __m256i _Out    = _mm256_permutevar8x32_epi32(_Src, _Shuf);
+            _mm256_storeu_si256(reinterpret_cast<__m256i*>(_Dest), _Out);
+            _Advance_bytes(_Dest, _Remove_tables_4_avx._Size[_Bingo]);
+            _Advance_bytes(_First, 32);
+        } while (_First != _Stop);
+
+        _Rewind_bytes(_Dest, 4);
+        _mm256_zeroupper(); // TRANSITION, DevCom-10331414
+    } else if (_Use_sse42() && _Size_bytes >= 16) {
+        _Advance_bytes(_Dest, 4);
+
+        void* _Stop = _First;
+        _Advance_bytes(_Stop, _Size_bytes & ~size_t{0xF});
+        do {
+            const __m128i _Src    = _mm_loadu_si128(reinterpret_cast<const __m128i*>(_First));
+            void* _First_d        = _First;
+            _Rewind_bytes(_First_d, 4);
+            const __m128i _Match  = _mm_loadu_si128(reinterpret_cast<const __m128i*>(_First_d));
+            const __m128i _Mask   = _mm_cmpeq_epi32(_Src, _Match);
+            const uint32_t _Bingo = _mm_movemask_ps(_mm_castsi128_ps(_Mask));
+            const __m128i _Shuf   = _mm_loadu_si128(reinterpret_cast<const __m128i*>(_Remove_tables_4_sse._Shuf[_Bingo]));
+            const __m128i _Out    = _mm_shuffle_epi8(_Src, _Shuf);
+            _mm_storeu_si128(reinterpret_cast<__m128i*>(_Dest), _Out);
+            _Advance_bytes(_Dest, _Remove_tables_4_sse._Size[_Bingo]);
+            _Advance_bytes(_First, 16);
+        } while (_First != _Stop);
+
+        _Rewind_bytes(_Dest, 4);
+    }
+#endif // !defined(_M_ARM64EC)
+
+    return _Unique_fallback<uint32_t>(_First, _Last, _Dest);
+}
+
+void* __stdcall __std_unique_8(void* _First, void* _Last) noexcept {
+    if (_First == _Last) {
+        return _First;
+    }
+
+    void* _Dest = _First;
+    _Advance_bytes(_First, 8);
+
+#ifndef _M_ARM64EC
+    if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_avx2() && _Size_bytes >= 32) {
+        _Advance_bytes(_Dest, 8);
+
+        void* _Stop = _First;
+        _Advance_bytes(_Stop, _Size_bytes & ~size_t{0x1F});
+        do {
+            const __m256i _Src    = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(_First));
+            void* _First_d        = _First;
+            _Rewind_bytes(_First_d, 8);
+            const __m256i _Match  = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(_First_d));
+            const __m256i _Mask   = _mm256_cmpeq_epi64(_Src, _Match);
+            const uint32_t _Bingo = _mm256_movemask_pd(_mm256_castsi256_pd(_Mask));
+            const __m256i _Shuf   = _mm256_cvtepu8_epi32(_mm_loadu_si64(_Remove_tables_8_avx._Shuf[_Bingo]));
+            const __m256i _Out    = _mm256_permutevar8x32_epi32(_Src, _Shuf);
+            _mm256_storeu_si256(reinterpret_cast<__m256i*>(_Dest), _Out);
+            _Advance_bytes(_Dest, _Remove_tables_8_avx._Size[_Bingo]);
+            _Advance_bytes(_First, 32);
+        } while (_First != _Stop);
+
+        _Rewind_bytes(_Dest, 8);
+        _mm256_zeroupper(); // TRANSITION, DevCom-10331414
+    } else if (_Use_sse42() && _Size_bytes >= 16) {
+        _Advance_bytes(_Dest, 8);
+
+        void* _Stop = _First;
+        _Advance_bytes(_Stop, _Size_bytes & ~size_t{0xF});
+        do {
+            const __m128i _Src    = _mm_loadu_si128(reinterpret_cast<const __m128i*>(_First));
+            void* _First_d        = _First;
+            _Rewind_bytes(_First_d, 8);
+            const __m128i _Match  = _mm_loadu_si128(reinterpret_cast<const __m128i*>(_First_d));
+            const __m128i _Mask   = _mm_cmpeq_epi64(_Src, _Match);
+            const uint32_t _Bingo = _mm_movemask_pd(_mm_castsi128_pd(_Mask));
+            const __m128i _Shuf   = _mm_loadu_si128(reinterpret_cast<const __m128i*>(_Remove_tables_8_sse._Shuf[_Bingo]));
+            const __m128i _Out    = _mm_shuffle_epi8(_Src, _Shuf);
+            _mm_storeu_si128(reinterpret_cast<__m128i*>(_Dest), _Out);
+            _Advance_bytes(_Dest, _Remove_tables_8_sse._Size[_Bingo]);
+            _Advance_bytes(_First, 16);
+        } while (_First != _Stop);
+
+        _Rewind_bytes(_Dest, 8);
+    }
+#endif // !defined(_M_ARM64EC)
+
+    return _Unique_fallback<uint64_t>(_First, _Last, _Dest);
+}
+
 } // extern "C"
 
 namespace {
diff --git a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp
index e7aec69bea..9460b2679b 100644
--- a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp
+++ b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp
@@ -696,6 +696,68 @@ void test_remove(mt19937_64& gen) {
     }
 }
 
+template <class FwdIt>
+FwdIt last_known_good_unique(FwdIt first, FwdIt last) {
+    if (first == last) {
+        return first;
+    }
+
+    FwdIt dest = first;
+    ++first;
+
+    while (first != last) {
+        if (*first != *dest) {
+            ++dest;
+            *dest = *first;
+        }
+
+        ++first;
+    }
+
+    ++dest;
+    return dest;
+}
+
+template <class T>
+void test_case_unique(vector<T>&
in_out_expected, vector& in_out_actual, vector& in_out_actual_r) { + auto un_expected = last_known_good_unique(in_out_expected.begin(), in_out_expected.end()); + auto un_actual = unique(in_out_actual.begin(), in_out_actual.end()); + assert(equal(in_out_expected.begin(), un_expected, in_out_actual.begin(), un_actual)); + +#if _HAS_CXX20 + auto un_actual_r = ranges::unique(in_out_actual_r); + assert(equal(in_out_expected.begin(), un_expected, begin(in_out_actual_r), begin(un_actual_r))); +#else // ^^^ _HAS_CXX20 / !_HAS_CXX20 vvv + (void) in_out_actual_r; +#endif // ^^^ !_HAS_CXX20 ^^^ +} + +template +void test_unique(mt19937_64& gen) { + using TD = conditional_t; + binomial_distribution dis(5); + + vector source; + vector in_out_expected; + vector in_out_actual; + vector in_out_actual_r; + + for (const auto& v : {&source, &in_out_expected, &in_out_actual, &in_out_actual_r}) { + v->reserve(dataCount); + } + + test_case_unique(in_out_expected, in_out_actual, in_out_actual_r); + for (size_t attempts = 0; attempts < dataCount; ++attempts) { + source.push_back(static_cast(dis(gen))); + + for (const auto& v : {&in_out_expected, &in_out_actual, &in_out_actual_r}) { + *v = source; + } + + test_case_unique(in_out_expected, in_out_actual, in_out_actual_r); + } +} + template void test_swap_ranges(mt19937_64& gen) { const auto fn = [&]() { return static_cast(gen()); }; @@ -890,6 +952,16 @@ void test_vector_algorithms(mt19937_64& gen) { test_remove(gen); test_remove(gen); + test_unique(gen); + test_unique(gen); + test_unique(gen); + test_unique(gen); + test_unique(gen); + test_unique(gen); + test_unique(gen); + test_unique(gen); + test_unique(gen); + test_swap_ranges(gen); test_swap_ranges(gen); test_swap_ranges(gen); From a0b714d1bc92e2df742f0fcda4c29161fd39914b Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 16 Nov 2024 23:18:26 +0200 Subject: [PATCH 2/5] no point --- stl/inc/algorithm | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/stl/inc/algorithm b/stl/inc/algorithm index 33a8e6740d..9f38c16466 100644 --- a/stl/inc/algorithm +++ b/stl/inc/algorithm @@ -214,13 +214,7 @@ __declspec(noalias) void _Replace_vectorized( template _Ty* _Unique_vectorized(_Ty* const _First, _Ty* const _Last) noexcept { - if constexpr (is_pointer_v<_Ty>) { -#ifdef _WIN64 - return reinterpret_cast<_Ty*>(::__std_unique_8(_First, _Last)); -#else - return reinterpret_cast<_Ty*>(::__std_unique_4(_First, _Last)); -#endif - } else if constexpr (sizeof(_Ty) == 1) { + if constexpr (sizeof(_Ty) == 1) { return reinterpret_cast<_Ty*>(::__std_unique_1(_First, _Last)); } else if constexpr (sizeof(_Ty) == 2) { return reinterpret_cast<_Ty*>(::__std_unique_2(_First, _Last)); From cccf693dc2c109e1c60f47ad890ba3f58562fa60 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 16 Nov 2024 23:27:08 +0200 Subject: [PATCH 3/5] Not unique problem --- stl/inc/algorithm | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/stl/inc/algorithm b/stl/inc/algorithm index 9f38c16466..02486abced 100644 --- a/stl/inc/algorithm +++ b/stl/inc/algorithm @@ -244,14 +244,7 @@ constexpr bool _Vector_alg_in_ranges_replace_is_safe = // Can we activate the vector algorithms for unique? 
template -constexpr bool _Vector_alg_in_unique_is_safe = - _Iterator_is_contiguous<_Iter> && !_Iterator_is_volatile<_Iter> // Contiguous nonvolatile iterator - && _Is_any_of_v<_Pr, -#if _HAS_CXX20 - _RANGES equal_to, -#endif // _HAS_CXX20 - _STD equal_to<>> // default comparison - && disjunction_v>, is_pointer<_Iter_value_t<_Iter>>>; // bitwise comparable +constexpr bool _Vector_alg_in_unique_is_safe = _Equal_memcmp_is_safe<_Iter, _Iter, _Pr>; _STD_END #endif // _USE_STD_VECTOR_ALGORITHMS From 54781db973a58023d7054c63b5e4514783f9959c Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 16 Nov 2024 23:38:55 +0200 Subject: [PATCH 4/5] Pointed out coverage --- .../VSO_0000000_vector_algorithms/test.cpp | 22 ++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp index 9460b2679b..53c5d0846c 100644 --- a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp +++ b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -734,8 +735,16 @@ void test_case_unique(vector& in_out_expected, vector& in_out_actual, vect template void test_unique(mt19937_64& gen) { - using TD = conditional_t; - binomial_distribution dis(5); + shared_ptr tmp_array; + + constexpr int number_of_values = 5; + + using TD = conditional_t, int, T>; + binomial_distribution dis(number_of_values); + + if constexpr (is_pointer_v) { + tmp_array = std::make_unique[]>(5); + } vector source; vector in_out_expected; @@ -748,7 +757,12 @@ void test_unique(mt19937_64& gen) { test_case_unique(in_out_expected, in_out_actual, in_out_actual_r); for (size_t attempts = 0; attempts < dataCount; ++attempts) { - source.push_back(static_cast(dis(gen))); + if constexpr (is_pointer_v) { + const auto pos = static_cast(dis(gen)); + source.push_back(static_cast(tmp_array.get()) + pos); + } else { + source.push_back(static_cast(dis(gen))); + } for (const auto& v : {&in_out_expected, &in_out_actual, &in_out_actual_r}) { *v = source; @@ -962,6 +976,8 @@ void test_vector_algorithms(mt19937_64& gen) { test_unique(gen); test_unique(gen); + test_unique(gen); + test_swap_ranges(gen); test_swap_ranges(gen); test_swap_ranges(gen); From fa4ff204636dd5e24fdd1e30c95fcde1da43ae51 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sun, 17 Nov 2024 01:50:38 +0200 Subject: [PATCH 5/5] Deduplicate Less error prone, especially if implementing _copy someday --- stl/src/vector_algorithms.cpp | 384 +++++++++++++++++++--------------- 1 file changed, 213 insertions(+), 171 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 805b95323e..4017a3fbc1 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -4204,6 +4204,195 @@ namespace { constexpr auto _Remove_tables_4_avx = _Make_remove_tables<256, 8>(4, 1); constexpr auto _Remove_tables_8_sse = _Make_remove_tables<4, 16>(8, 8); constexpr auto _Remove_tables_8_avx = _Make_remove_tables<16, 8>(8, 2); + + struct _Remove_sse_1 { + static constexpr size_t _Elem_size = 1; + static constexpr size_t _Step = 8; + + static __m128i _Set(const uint8_t _Val) noexcept { + return _mm_shuffle_epi8(_mm_cvtsi32_si128(_Val), _mm_setzero_si128()); + } + + static __m128i _Load(const void* const _Ptr) noexcept { + return _mm_loadu_si64(_Ptr); + } + + static uint32_t _Mask(const __m128i _First, const __m128i _Second) noexcept { + return 
_mm_movemask_epi8(_mm_cmpeq_epi8(_First, _Second)) & 0xFF; + } + + static void* _Store_masked(void* _Out, const __m128i _Src, const uint32_t _Bingo) noexcept { + const __m128i _Shuf = _mm_loadu_si64(_Remove_tables_1_sse._Shuf[_Bingo]); + const __m128i _Dest = _mm_shuffle_epi8(_Src, _Shuf); + _mm_storeu_si64(_Out, _Dest); + _Advance_bytes(_Out, _Remove_tables_1_sse._Size[_Bingo]); + return _Out; + } + }; + + struct _Remove_sse_2 { + static constexpr size_t _Elem_size = 2; + static constexpr size_t _Step = 16; + + static __m128i _Set(const uint16_t _Val) noexcept { + return _mm_set1_epi16(_Val); + } + + static __m128i _Load(const void* const _Ptr) noexcept { + return _mm_loadu_si128(reinterpret_cast(_Ptr)); + } + + static uint32_t _Mask(const __m128i _First, const __m128i _Second) noexcept { + const __m128i _Mask = _mm_cmpeq_epi16(_First, _Second); + return _mm_movemask_epi8(_mm_packs_epi16(_Mask, _mm_setzero_si128())); + } + + static void* _Store_masked(void* _Out, const __m128i _Src, const uint32_t _Bingo) noexcept { + const __m128i _Shuf = _mm_loadu_si128(reinterpret_cast(_Remove_tables_2_sse._Shuf[_Bingo])); + const __m128i _Dest = _mm_shuffle_epi8(_Src, _Shuf); + _mm_storeu_si128(reinterpret_cast<__m128i*>(_Out), _Dest); + _Advance_bytes(_Out, _Remove_tables_2_sse._Size[_Bingo]); + return _Out; + } + }; + + struct _Remove_avx_4 { + static constexpr size_t _Elem_size = 4; + static constexpr size_t _Step = 32; + + static __m256i _Set(const uint32_t _Val) noexcept { + return _mm256_set1_epi32(_Val); + } + + static __m256i _Load(const void* const _Ptr) noexcept { + return _mm256_loadu_si256(reinterpret_cast(_Ptr)); + } + + static uint32_t _Mask(const __m256i _First, const __m256i _Second) noexcept { + const __m256i _Mask = _mm256_cmpeq_epi32(_First, _Second); + return _mm256_movemask_ps(_mm256_castsi256_ps(_Mask)); + } + + static void* _Store_masked(void* _Out, const __m256i _Src, const uint32_t _Bingo) noexcept { + const __m256i _Shuf = _mm256_cvtepu8_epi32(_mm_loadu_si64(_Remove_tables_4_avx._Shuf[_Bingo])); + const __m256i _Dest = _mm256_permutevar8x32_epi32(_Src, _Shuf); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(_Out), _Dest); + _Advance_bytes(_Out, _Remove_tables_4_avx._Size[_Bingo]); + return _Out; + } + }; + + struct _Remove_sse_4 { + static constexpr size_t _Elem_size = 4; + static constexpr size_t _Step = 16; + + static __m128i _Set(const uint32_t _Val) noexcept { + return _mm_set1_epi32(_Val); + } + + static __m128i _Load(const void* const _Ptr) noexcept { + return _mm_loadu_si128(reinterpret_cast(_Ptr)); + } + + static uint32_t _Mask(const __m128i _First, const __m128i _Second) noexcept { + const __m128i _Mask = _mm_cmpeq_epi32(_First, _Second); + return _mm_movemask_ps(_mm_castsi128_ps(_Mask)); + } + + static void* _Store_masked(void* _Out, const __m128i _Src, const uint32_t _Bingo) noexcept { + const __m128i _Shuf = _mm_loadu_si128(reinterpret_cast(_Remove_tables_4_sse._Shuf[_Bingo])); + const __m128i _Dest = _mm_shuffle_epi8(_Src, _Shuf); + _mm_storeu_si128(reinterpret_cast<__m128i*>(_Out), _Dest); + _Advance_bytes(_Out, _Remove_tables_4_sse._Size[_Bingo]); + return _Out; + } + }; + + struct _Remove_avx_8 { + static constexpr size_t _Elem_size = 8; + static constexpr size_t _Step = 32; + + static __m256i _Set(const uint64_t _Val) noexcept { + return _mm256_set1_epi64x(_Val); + } + + static __m256i _Load(const void* const _Ptr) noexcept { + return _mm256_loadu_si256(reinterpret_cast(_Ptr)); + } + + static uint32_t _Mask(const __m256i _First, const __m256i _Second) 
noexcept { + const __m256i _Mask = _mm256_cmpeq_epi64(_First, _Second); + return _mm256_movemask_pd(_mm256_castsi256_pd(_Mask)); + } + + static void* _Store_masked(void* _Out, const __m256i _Src, const uint32_t _Bingo) noexcept { + const __m256i _Shuf = _mm256_cvtepu8_epi32(_mm_loadu_si64(_Remove_tables_8_avx._Shuf[_Bingo])); + const __m256i _Dest = _mm256_permutevar8x32_epi32(_Src, _Shuf); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(_Out), _Dest); + _Advance_bytes(_Out, _Remove_tables_8_avx._Size[_Bingo]); + return _Out; + } + }; + + struct _Remove_sse_8 { + static constexpr size_t _Elem_size = 8; + static constexpr size_t _Step = 16; + + static __m128i _Set(const uint64_t _Val) noexcept { + return _mm_set1_epi64x(_Val); + } + + static __m128i _Load(const void* const _Ptr) noexcept { + return _mm_loadu_si128(reinterpret_cast(_Ptr)); + } + + static uint32_t _Mask(const __m128i _First, const __m128i _Second) noexcept { + const __m128i _Mask = _mm_cmpeq_epi64(_First, _Second); + return _mm_movemask_pd(_mm_castsi128_pd(_Mask)); + } + + static void* _Store_masked(void* _Out, const __m128i _Src, const uint32_t _Bingo) noexcept { + const __m128i _Shuf = _mm_loadu_si128(reinterpret_cast(_Remove_tables_8_sse._Shuf[_Bingo])); + const __m128i _Dest = _mm_shuffle_epi8(_Src, _Shuf); + _mm_storeu_si128(reinterpret_cast<__m128i*>(_Out), _Dest); + _Advance_bytes(_Out, _Remove_tables_8_sse._Size[_Bingo]); + return _Out; + } + }; + + template + void* _Remove_impl(void* _First, const void* _Stop, const auto _Val) noexcept { + void* _Out = _First; + const auto _Match = _Traits::_Set(_Val); + + do { + const auto _Src = _Traits::_Load(_First); + const uint32_t _Bingo = _Traits::_Mask(_Src, _Match); + _Out = _Traits::_Store_masked(_Out, _Src, _Bingo); + _Advance_bytes(_First, _Traits::_Step); + } while (_First != _Stop); + + return _Out; + } + + template + void* _Unique_impl(void* _First, const void* _Stop) noexcept { + void* _Out = _First; + + do { + const auto _Src = _Traits::_Load(_First); + void* _First_d = _First; + _Rewind_bytes(_First_d, _Traits::_Elem_size); + const auto _Match = _Traits::_Load(_First_d); + const uint32_t _Bingo = _Traits::_Mask(_Src, _Match); + _Out = _Traits::_Store_masked(_Out, _Src, _Bingo); + _Advance_bytes(_First, _Traits::_Step); + } while (_First != _Stop); + + _Rewind_bytes(_Out, _Traits::_Elem_size); + return _Out; + } + #endif // !defined(_M_ARM64EC) } // unnamed namespace @@ -4214,19 +4403,10 @@ void* __stdcall __std_remove_1(void* _First, void* const _Last, const uint8_t _V #ifndef _M_ARM64EC if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_sse42() && _Size_bytes >= 8) { - const __m128i _Match = _mm_shuffle_epi8(_mm_cvtsi32_si128(_Val), _mm_setzero_si128()); - void* _Stop = _First; _Advance_bytes(_Stop, _Size_bytes & ~size_t{7}); - do { - const __m128i _Src = _mm_loadu_si64(_First); - const uint32_t _Bingo = _mm_movemask_epi8(_mm_cmpeq_epi8(_Src, _Match)) & 0xFF; - const __m128i _Shuf = _mm_loadu_si64(_Remove_tables_1_sse._Shuf[_Bingo]); - const __m128i _Dest = _mm_shuffle_epi8(_Src, _Shuf); - _mm_storeu_si64(_Out, _Dest); - _Advance_bytes(_Out, _Remove_tables_1_sse._Size[_Bingo]); - _Advance_bytes(_First, 8); - } while (_First != _Stop); + _Out = _Remove_impl<_Remove_sse_1>(_First, _Stop, _Val); + _First = _Stop; } #endif // !defined(_M_ARM64EC) @@ -4238,20 +4418,10 @@ void* __stdcall __std_remove_2(void* _First, void* const _Last, const uint16_t _ #ifndef _M_ARM64EC if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_sse42() && 
_Size_bytes >= 16) { - const __m128i _Match = _mm_set1_epi16(_Val); - void* _Stop = _First; _Advance_bytes(_Stop, _Size_bytes & ~size_t{0xF}); - do { - const __m128i _Src = _mm_loadu_si128(reinterpret_cast(_First)); - const __m128i _Mask = _mm_cmpeq_epi16(_Src, _Match); - const uint32_t _Bingo = _mm_movemask_epi8(_mm_packs_epi16(_Mask, _mm_setzero_si128())); - const __m128i _Shuf = _mm_loadu_si128(reinterpret_cast(_Remove_tables_2_sse._Shuf[_Bingo])); - const __m128i _Dest = _mm_shuffle_epi8(_Src, _Shuf); - _mm_storeu_si128(reinterpret_cast<__m128i*>(_Out), _Dest); - _Advance_bytes(_Out, _Remove_tables_2_sse._Size[_Bingo]); - _Advance_bytes(_First, 16); - } while (_First != _Stop); + _Out = _Remove_impl<_Remove_sse_2>(_First, _Stop, _Val); + _First = _Stop; } #endif // !defined(_M_ARM64EC) @@ -4263,37 +4433,17 @@ void* __stdcall __std_remove_4(void* _First, void* const _Last, const uint32_t _ #ifndef _M_ARM64EC if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_avx2() && _Size_bytes >= 32) { - const __m256i _Match = _mm256_set1_epi32(_Val); - void* _Stop = _First; _Advance_bytes(_Stop, _Size_bytes & ~size_t{0x1F}); - do { - const __m256i _Src = _mm256_loadu_si256(reinterpret_cast(_First)); - const __m256i _Mask = _mm256_cmpeq_epi32(_Src, _Match); - const uint32_t _Bingo = _mm256_movemask_ps(_mm256_castsi256_ps(_Mask)); - const __m256i _Shuf = _mm256_cvtepu8_epi32(_mm_loadu_si64(_Remove_tables_4_avx._Shuf[_Bingo])); - const __m256i _Dest = _mm256_permutevar8x32_epi32(_Src, _Shuf); - _mm256_storeu_si256(reinterpret_cast<__m256i*>(_Out), _Dest); - _Advance_bytes(_Out, _Remove_tables_4_avx._Size[_Bingo]); - _Advance_bytes(_First, 32); - } while (_First != _Stop); + _Out = _Remove_impl<_Remove_avx_4>(_First, _Stop, _Val); + _First = _Stop; _mm256_zeroupper(); // TRANSITION, DevCom-10331414 } else if (_Use_sse42() && _Size_bytes >= 16) { - const __m128i _Match = _mm_set1_epi32(_Val); - void* _Stop = _First; _Advance_bytes(_Stop, _Size_bytes & ~size_t{0xF}); - do { - const __m128i _Src = _mm_loadu_si128(reinterpret_cast(_First)); - const __m128i _Mask = _mm_cmpeq_epi32(_Src, _Match); - const uint32_t _Bingo = _mm_movemask_ps(_mm_castsi128_ps(_Mask)); - const __m128i _Shuf = _mm_loadu_si128(reinterpret_cast(_Remove_tables_4_sse._Shuf[_Bingo])); - const __m128i _Dest = _mm_shuffle_epi8(_Src, _Shuf); - _mm_storeu_si128(reinterpret_cast<__m128i*>(_Out), _Dest); - _Advance_bytes(_Out, _Remove_tables_4_sse._Size[_Bingo]); - _Advance_bytes(_First, 16); - } while (_First != _Stop); + _Out = _Remove_impl<_Remove_sse_4>(_First, _Stop, _Val); + _First = _Stop; } #endif // !defined(_M_ARM64EC) @@ -4305,37 +4455,17 @@ void* __stdcall __std_remove_8(void* _First, void* const _Last, const uint64_t _ #ifndef _M_ARM64EC if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_avx2() && _Size_bytes >= 32) { - const __m256i _Match = _mm256_set1_epi64x(_Val); - void* _Stop = _First; _Advance_bytes(_Stop, _Size_bytes & ~size_t{0x1F}); - do { - const __m256i _Src = _mm256_loadu_si256(reinterpret_cast(_First)); - const __m256i _Mask = _mm256_cmpeq_epi64(_Src, _Match); - const uint32_t _Bingo = _mm256_movemask_pd(_mm256_castsi256_pd(_Mask)); - const __m256i _Shuf = _mm256_cvtepu8_epi32(_mm_loadu_si64(_Remove_tables_8_avx._Shuf[_Bingo])); - const __m256i _Dest = _mm256_permutevar8x32_epi32(_Src, _Shuf); - _mm256_storeu_si256(reinterpret_cast<__m256i*>(_Out), _Dest); - _Advance_bytes(_Out, _Remove_tables_8_avx._Size[_Bingo]); - _Advance_bytes(_First, 32); - } while (_First != _Stop); + _Out = 
_Remove_impl<_Remove_avx_8>(_First, _Stop, _Val); + _First = _Stop; _mm256_zeroupper(); // TRANSITION, DevCom-10331414 } else if (_Use_sse42() && _Size_bytes >= 16) { - const __m128i _Match = _mm_set1_epi64x(_Val); - void* _Stop = _First; _Advance_bytes(_Stop, _Size_bytes & ~size_t{0xF}); - do { - const __m128i _Src = _mm_loadu_si128(reinterpret_cast(_First)); - const __m128i _Mask = _mm_cmpeq_epi64(_Src, _Match); - const uint32_t _Bingo = _mm_movemask_pd(_mm_castsi128_pd(_Mask)); - const __m128i _Shuf = _mm_loadu_si128(reinterpret_cast(_Remove_tables_8_sse._Shuf[_Bingo])); - const __m128i _Dest = _mm_shuffle_epi8(_Src, _Shuf); - _mm_storeu_si128(reinterpret_cast<__m128i*>(_Out), _Dest); - _Advance_bytes(_Out, _Remove_tables_8_sse._Size[_Bingo]); - _Advance_bytes(_First, 16); - } while (_First != _Stop); + _Out = _Remove_impl<_Remove_sse_8>(_First, _Stop, _Val); + _First = _Stop; } #endif // !defined(_M_ARM64EC) @@ -4352,24 +4482,10 @@ void* __stdcall __std_unique_1(void* _First, void* _Last) noexcept { #ifndef _M_ARM64EC if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_sse42() && _Size_bytes >= 8) { - _Advance_bytes(_Dest, 1); - void* _Stop = _First; _Advance_bytes(_Stop, _Size_bytes & ~size_t{7}); - do { - const __m128i _Src = _mm_loadu_si64(_First); - void* _First_d = _First; - _Rewind_bytes(_First_d, 1); - const __m128i _Match = _mm_loadu_si64(_First_d); - const uint32_t _Bingo = _mm_movemask_epi8(_mm_cmpeq_epi8(_Src, _Match)) & 0xFF; - const __m128i _Shuf = _mm_loadu_si64(_Remove_tables_1_sse._Shuf[_Bingo]); - const __m128i _Out = _mm_shuffle_epi8(_Src, _Shuf); - _mm_storeu_si64(_Dest, _Out); - _Advance_bytes(_Dest, _Remove_tables_1_sse._Size[_Bingo]); - _Advance_bytes(_First, 8); - } while (_First != _Stop); - - _Rewind_bytes(_Dest, 1); + _Dest = _Unique_impl<_Remove_sse_1>(_First, _Stop); + _First = _Stop; } #endif // !defined(_M_ARM64EC) @@ -4386,25 +4502,10 @@ void* __stdcall __std_unique_2(void* _First, void* _Last) noexcept { #ifndef _M_ARM64EC if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_sse42() && _Size_bytes >= 16) { - _Advance_bytes(_Dest, 2); - void* _Stop = _First; _Advance_bytes(_Stop, _Size_bytes & ~size_t{0xF}); - do { - const __m128i _Src = _mm_loadu_si128(reinterpret_cast(_First)); - void* _First_d = _First; - _Rewind_bytes(_First_d, 2); - const __m128i _Match = _mm_loadu_si128(reinterpret_cast(_First_d)); - const __m128i _Mask = _mm_cmpeq_epi16(_Src, _Match); - const uint32_t _Bingo = _mm_movemask_epi8(_mm_packs_epi16(_Mask, _mm_setzero_si128())); - const __m128i _Shuf = _mm_loadu_si128(reinterpret_cast(_Remove_tables_2_sse._Shuf[_Bingo])); - const __m128i _Out = _mm_shuffle_epi8(_Src, _Shuf); - _mm_storeu_si128(reinterpret_cast<__m128i*>(_Dest), _Out); - _Advance_bytes(_Dest, _Remove_tables_2_sse._Size[_Bingo]); - _Advance_bytes(_First, 16); - } while (_First != _Stop); - - _Rewind_bytes(_Dest, 2); + _Dest = _Unique_impl<_Remove_sse_2>(_First, _Stop); + _First = _Stop; } #endif // !defined(_M_ARM64EC) @@ -4421,47 +4522,17 @@ void* __stdcall __std_unique_4(void* _First, void* _Last) noexcept { #ifndef _M_ARM64EC if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_avx2() && _Size_bytes >= 32) { - _Advance_bytes(_Dest, 4); - void* _Stop = _First; _Advance_bytes(_Stop, _Size_bytes & ~size_t{0x1F}); + _Dest = _Unique_impl<_Remove_avx_4>(_First, _Stop); + _First = _Stop; - do { - const __m256i _Src = _mm256_loadu_si256(reinterpret_cast(_First)); - void* _First_d = _First; - _Rewind_bytes(_First_d, 4); - const __m256i _Match 
= _mm256_loadu_si256(reinterpret_cast(_First_d)); - const __m256i _Mask = _mm256_cmpeq_epi32(_Src, _Match); - const uint32_t _Bingo = _mm256_movemask_ps(_mm256_castsi256_ps(_Mask)); - const __m256i _Shuf = _mm256_cvtepu8_epi32(_mm_loadu_si64(_Remove_tables_4_avx._Shuf[_Bingo])); - const __m256i _Out = _mm256_permutevar8x32_epi32(_Src, _Shuf); - _mm256_storeu_si256(reinterpret_cast<__m256i*>(_Dest), _Out); - _Advance_bytes(_Dest, _Remove_tables_4_avx._Size[_Bingo]); - _Advance_bytes(_First, 32); - } while (_First != _Stop); - - _Rewind_bytes(_Dest, 4); _mm256_zeroupper(); // TRANSITION, DevCom-10331414 } else if (_Use_sse42() && _Size_bytes >= 16) { - _Advance_bytes(_Dest, 4); - void* _Stop = _First; _Advance_bytes(_Stop, _Size_bytes & ~size_t{0xF}); - do { - const __m128i _Src = _mm_loadu_si128(reinterpret_cast(_First)); - void* _First_d = _First; - _Rewind_bytes(_First_d, 4); - const __m128i _Match = _mm_loadu_si128(reinterpret_cast(_First_d)); - const __m128i _Mask = _mm_cmpeq_epi32(_Src, _Match); - const uint32_t _Bingo = _mm_movemask_ps(_mm_castsi128_ps(_Mask)); - const __m128i _Shuf = _mm_loadu_si128(reinterpret_cast(_Remove_tables_4_sse._Shuf[_Bingo])); - const __m128i _Out = _mm_shuffle_epi8(_Src, _Shuf); - _mm_storeu_si128(reinterpret_cast<__m128i*>(_Dest), _Out); - _Advance_bytes(_Dest, _Remove_tables_4_sse._Size[_Bingo]); - _Advance_bytes(_First, 16); - } while (_First != _Stop); - - _Rewind_bytes(_Dest, 4); + _Dest = _Unique_impl<_Remove_sse_4>(_First, _Stop); + _First = _Stop; } #endif // !defined(_M_ARM64EC) @@ -4478,46 +4549,17 @@ void* __stdcall __std_unique_8(void* _First, void* _Last) noexcept { #ifndef _M_ARM64EC if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_avx2() && _Size_bytes >= 32) { - _Advance_bytes(_Dest, 8); - void* _Stop = _First; _Advance_bytes(_Stop, _Size_bytes & ~size_t{0x1F}); - do { - const __m256i _Src = _mm256_loadu_si256(reinterpret_cast(_First)); - void* _First_d = _First; - _Rewind_bytes(_First_d, 8); - const __m256i _Match = _mm256_loadu_si256(reinterpret_cast(_First_d)); - const __m256i _Mask = _mm256_cmpeq_epi64(_Src, _Match); - const uint32_t _Bingo = _mm256_movemask_pd(_mm256_castsi256_pd(_Mask)); - const __m256i _Shuf = _mm256_cvtepu8_epi32(_mm_loadu_si64(_Remove_tables_8_avx._Shuf[_Bingo])); - const __m256i _Out = _mm256_permutevar8x32_epi32(_Src, _Shuf); - _mm256_storeu_si256(reinterpret_cast<__m256i*>(_Dest), _Out); - _Advance_bytes(_Dest, _Remove_tables_8_avx._Size[_Bingo]); - _Advance_bytes(_First, 32); - } while (_First != _Stop); + _Dest = _Unique_impl<_Remove_avx_8>(_First, _Stop); + _First = _Stop; - _Rewind_bytes(_Dest, 8); _mm256_zeroupper(); // TRANSITION, DevCom-10331414 } else if (_Use_sse42() && _Size_bytes >= 16) { - _Advance_bytes(_Dest, 8); - void* _Stop = _First; _Advance_bytes(_Stop, _Size_bytes & ~size_t{0xF}); - do { - const __m128i _Src = _mm_loadu_si128(reinterpret_cast(_First)); - void* _First_d = _First; - _Rewind_bytes(_First_d, 8); - const __m128i _Match = _mm_loadu_si128(reinterpret_cast(_First_d)); - const __m128i _Mask = _mm_cmpeq_epi64(_Src, _Match); - const uint32_t _Bingo = _mm_movemask_pd(_mm_castsi128_pd(_Mask)); - const __m128i _Shuf = _mm_loadu_si128(reinterpret_cast(_Remove_tables_8_sse._Shuf[_Bingo])); - const __m128i _Out = _mm_shuffle_epi8(_Src, _Shuf); - _mm_storeu_si128(reinterpret_cast<__m128i*>(_Dest), _Out); - _Advance_bytes(_Dest, _Remove_tables_8_sse._Size[_Bingo]); - _Advance_bytes(_First, 16); - } while (_First != _Stop); - - _Rewind_bytes(_Dest, 8); + _Dest = 
_Unique_impl<_Remove_sse_8>(_First, _Stop);
+        _First = _Stop;
     }
 #endif // !defined(_M_ARM64EC)
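
Editorial note (not part of the patch series): the __std_remove_* loop bodies, the new __std_unique_* functions, and the _Remove_impl/_Unique_impl helpers that patch 5 factors them into all rely on the same technique: compare a whole vector register of elements against a "match" register, compress the comparison result into a small bit mask, and use that mask to index a precomputed table of shuffle/permute controls that packs the surviving elements to the front of the output register; the output cursor then advances by the byte count the table records for that mask. For unique, the "match" register is simply the same data loaded one element earlier and the destination cursor starts one element behind, so an element is dropped exactly when it equals its predecessor. The code below is a minimal, self-contained sketch of that single compaction step for 32-bit elements with SSE intrinsics (SSSE3 pshufb for the pack); shuffle_table_4, make_shuffle_table_4, and compact_block_4 are hypothetical names invented for this note, and the table layout is an assumption for illustration, not a copy of _Make_remove_tables.

// Standalone sketch of the mask/shuffle-table compaction (illustration only, not STL code).
#include <immintrin.h>

#include <cstddef>
#include <cstdint>

struct shuffle_table_4 {
    alignas(16) std::uint8_t shuf[16][16]; // one pshufb control per 4-bit mask value
    std::uint8_t size[16];                 // number of bytes kept for that mask value
};

constexpr shuffle_table_4 make_shuffle_table_4() {
    shuffle_table_4 t{};
    for (int m = 0; m < 16; ++m) {
        int kept = 0;
        for (int lane = 0; lane < 4; ++lane) {
            if ((m & (1 << lane)) == 0) { // mask bit clear: this dword survives
                for (int b = 0; b < 4; ++b) {
                    t.shuf[m][kept * 4 + b] = static_cast<std::uint8_t>(lane * 4 + b);
                }
                ++kept;
            }
        }
        t.size[m] = static_cast<std::uint8_t>(kept * 4);
    }
    return t;
}

inline constexpr shuffle_table_4 table_4 = make_shuffle_table_4();

// Compacts one 16-byte block: dwords equal to val are dropped, survivors are packed to the
// front of dst. Like the patch, it always stores a full 16 bytes and returns how many bytes
// were actually kept, so the caller advances its output cursor by the return value.
inline std::size_t compact_block_4(const std::uint32_t* src, std::uint32_t* dst, std::uint32_t val) {
    const __m128i data  = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
    const __m128i eq    = _mm_cmpeq_epi32(data, _mm_set1_epi32(static_cast<int>(val)));
    const unsigned bits = static_cast<unsigned>(_mm_movemask_ps(_mm_castsi128_ps(eq))); // 1 bit per dword
    const __m128i ctrl  = _mm_load_si128(reinterpret_cast<const __m128i*>(table_4.shuf[bits]));
    _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), _mm_shuffle_epi8(data, ctrl));
    return table_4.size[bits];
}

A remove-style caller broadcasts the value to drop, as __std_remove_4 does; a unique-style caller would instead compare against the block loaded one element earlier and start the destination one element behind the source, exactly as __std_unique_4 does above.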