Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve basic_string::find_first_of and basic_string::find_last_of vectorization for large needles or very large haystacks #5029

Open
wants to merge 46 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
46 commits
Select commit Hold shift + click to select a range
f0e3c43
find meow of bitmap
AlexGuteniev Oct 20, 2024
019bc07
big characters threshold
AlexGuteniev Oct 20, 2024
7b8ebfd
vectorize large needles too
AlexGuteniev Oct 20, 2024
7f62323
arm64ec
AlexGuteniev Oct 20, 2024
6e08e9e
missed noexcept
AlexGuteniev Oct 20, 2024
102b85f
More interesting cases
AlexGuteniev Oct 20, 2024
12f6112
Not that edge!
AlexGuteniev Oct 20, 2024
6ee0450
+case
AlexGuteniev Oct 20, 2024
3102cee
Merge remote-tracking branch 'upstream/main' into ascii-table
AlexGuteniev Oct 25, 2024
31465e5
Merge remote-tracking branch 'upstream/main' into ascii-table
AlexGuteniev Oct 27, 2024
60bbd79
make bitmap small and large
AlexGuteniev Oct 27, 2024
8caa547
change strategy
AlexGuteniev Oct 27, 2024
6f6f97b
ensure the same (mis)alignment
AlexGuteniev Oct 27, 2024
1c76e30
Brute force SSE 4.2 thresholds
AlexGuteniev Oct 27, 2024
4560fdf
update not aligned allocator
AlexGuteniev Oct 27, 2024
05deddb
not_highly_aligned_allocator update
AlexGuteniev Oct 29, 2024
64e9df0
ASan fixes
AlexGuteniev Oct 29, 2024
ac00cf6
Check last part size for zero
AlexGuteniev Oct 29, 2024
3132911
typo
AlexGuteniev Oct 29, 2024
02881d5
Merge remote-tracking branch 'upstream/main' into ascii-table
AlexGuteniev Oct 30, 2024
b34af28
GH-5043 usage
AlexGuteniev Oct 30, 2024
41fe19d
Merge branch 'main' into ascii-table
StephanTLavavej Nov 8, 2024
ee48d32
Use `static_cast` instead of a functional-style cast.
StephanTLavavej Nov 8, 2024
5323a0b
`unsigned` => `unsigned int`
StephanTLavavej Nov 8, 2024
35379a9
Drop unnecessary parens.
StephanTLavavej Nov 8, 2024
6dcf3ba
Conditional operator => if-else
StephanTLavavej Nov 8, 2024
f6f95dd
Add const.
StephanTLavavej Nov 8, 2024
f54b728
Don't return `const size_t`.
StephanTLavavej Nov 8, 2024
267a679
Add `noexcept`.
StephanTLavavej Nov 8, 2024
3768748
separate steps for small bitmap differently
AlexGuteniev Nov 9, 2024
9964a16
yet better name
AlexGuteniev Nov 9, 2024
7a08a13
We should avoid broadcast
AlexGuteniev Nov 9, 2024
abd5ee9
Should inline actually
AlexGuteniev Nov 9, 2024
bc0decf
Use reference-to-array parameters.
StephanTLavavej Nov 9, 2024
58fa2b4
Mark `_Build_scalar_table` as `[[nodiscard]]`.
StephanTLavavej Nov 9, 2024
73e704d
Avoid `_First` and `_Last` sub-namespaces.
StephanTLavavej Nov 9, 2024
67b9319
Pure code movement: Move bitmap details before "public" machinery.
StephanTLavavej Nov 9, 2024
016ee5c
Avoid `_Bitmap` sub-namespace, extract details.
StephanTLavavej Nov 9, 2024
60637a1
Rename to `_Bitmap_step`.
StephanTLavavej Nov 9, 2024
07b04b7
Fix typo: `_Mask_out_oveflow` => `_Mask_out_overflow`
StephanTLavavej Nov 9, 2024
8b0380d
Drop spurious spaces in preprocessor comments.
StephanTLavavej Nov 9, 2024
cb766bd
AVX2 vpermq form
AlexGuteniev Nov 9, 2024
3fc4cbd
vzeroupper guards
AlexGuteniev Nov 9, 2024
9d37e25
AVX2 masks for bitmap algorithm
AlexGuteniev Nov 10, 2024
cde8eb6
Merge remote-tracking branch 'upstream/main' into ascii-table
AlexGuteniev Nov 23, 2024
4961641
restore strategy. set avx bitmap threshold
AlexGuteniev Nov 23, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions benchmarks/src/find_first_of.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
#include <type_traits>
#include <vector>

#include "skewed_allocator.hpp"

using namespace std;

// Selects which search variant bm() measures. The container alias in bm() uses a vector haystack for std_func and a
// basic_string haystack for the other two enumerators.
enum class AlgType { std_func, str_member_first, str_member_last };
Expand All @@ -24,7 +26,8 @@ void bm(benchmark::State& state) {
const size_t HSize = Pos * 2;
const size_t Which = 0;

using container = conditional_t<Alg == AlgType::std_func, vector<T>, basic_string<T>>;
using container = conditional_t<Alg == AlgType::std_func, vector<T, not_highly_aligned_allocator<T>>,
basic_string<T, char_traits<T>, not_highly_aligned_allocator<T>>>;

constexpr T HaystackFiller{' '};
static_assert(HaystackFiller < Start, "The following iota() should not produce the haystack filler.");
Expand Down Expand Up @@ -59,8 +62,9 @@ void bm(benchmark::State& state) {
}

// Registers the shared set of benchmark argument pairs on `bm`.
//
// `bm` is anything pointer-like whose Args() accepts a braced pair and returns something that can be chained with
// `->` again. Each pair is registered exactly once, in ascending order of its first element.
// NOTE(review): the pairs are presumably consumed by bm() as (position, needle size) - confirm against bm().
void common_args(auto bm) {
    // Previously two extra statements registered eleven of these pairs a second time, so those cases ran twice.
    // The three statements below already cover every one of them.
    bm->Args({2, 3})->Args({6, 81})->Args({7, 4})->Args({9, 3})->Args({22, 5})->Args({58, 2});
    bm->Args({75, 85})->Args({102, 4})->Args({200, 46})->Args({325, 1})->Args({400, 50});
    bm->Args({1011, 11})->Args({1280, 46})->Args({1502, 23})->Args({2203, 54})->Args({3056, 7});
}

// Registers the std_func / uint8_t instantiation of bm, with its arguments supplied by common_args.
BENCHMARK(bm<AlgType::std_func, uint8_t>)->Apply(common_args);
Expand Down
128 changes: 61 additions & 67 deletions stl/inc/__msvc_string_view.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,15 @@ extern "C" {
// compiler has to assume that the denoted arrays are "globally address taken", and that any later calls to
// unanalyzable routines may modify those arrays.

// Position-returning find_first_of entry points, one per element size in bytes (the _1 / _2 / _4 / _8 suffix).
// _Find_first_of_pos_vectorized picks among them by sizeof of the element type. They are only declared here, without
// a body. Callers in this header compare the result against static_cast<size_t>(-1) to detect "no match".
__declspec(noalias) size_t __stdcall __std_find_first_of_trivial_pos_1(
const void* _Haystack, size_t _Haystack_length, const void* _Needle, size_t _Needle_length) noexcept;
__declspec(noalias) size_t __stdcall __std_find_first_of_trivial_pos_2(
const void* _Haystack, size_t _Haystack_length, const void* _Needle, size_t _Needle_length) noexcept;
__declspec(noalias) size_t __stdcall __std_find_first_of_trivial_pos_4(
const void* _Haystack, size_t _Haystack_length, const void* _Needle, size_t _Needle_length) noexcept;
__declspec(noalias) size_t __stdcall __std_find_first_of_trivial_pos_8(
const void* _Haystack, size_t _Haystack_length, const void* _Needle, size_t _Needle_length) noexcept;

__declspec(noalias) size_t __stdcall __std_find_last_of_trivial_pos_1(
const void* _Haystack, size_t _Haystack_length, const void* _Needle, size_t _Needle_length) noexcept;
__declspec(noalias) size_t __stdcall __std_find_last_of_trivial_pos_2(
Expand All @@ -38,6 +47,23 @@ __declspec(noalias) size_t __stdcall __std_find_last_of_trivial_pos_2(

_STD_BEGIN

// Forwards a position-returning find_first_of search to the __std_find_first_of_trivial_pos_N routine whose suffix
// matches the element size in bytes. The haystack and needle element types must have the same size.
// Returns the value produced by the selected routine, unchanged.
template <class _Ty1, class _Ty2>
size_t _Find_first_of_pos_vectorized(const _Ty1* const _Haystack, const size_t _Haystack_length,
    const _Ty2* const _Needle, const size_t _Needle_length) noexcept {
    constexpr size_t _Elem_size = sizeof(_Ty1);
    _STL_INTERNAL_STATIC_ASSERT(_Elem_size == sizeof(_Ty2));

    // Exactly one branch survives instantiation; the order of the size checks is irrelevant.
    if constexpr (_Elem_size == 8) {
        return ::__std_find_first_of_trivial_pos_8(_Haystack, _Haystack_length, _Needle, _Needle_length);
    } else if constexpr (_Elem_size == 4) {
        return ::__std_find_first_of_trivial_pos_4(_Haystack, _Haystack_length, _Needle, _Needle_length);
    } else if constexpr (_Elem_size == 2) {
        return ::__std_find_first_of_trivial_pos_2(_Haystack, _Haystack_length, _Needle, _Needle_length);
    } else if constexpr (_Elem_size == 1) {
        return ::__std_find_first_of_trivial_pos_1(_Haystack, _Haystack_length, _Needle, _Needle_length);
    } else {
        _STL_INTERNAL_STATIC_ASSERT(false); // unexpected size
    }
}

template <class _Ty1, class _Ty2>
size_t _Find_last_of_pos_vectorized(const _Ty1* const _Haystack, const size_t _Haystack_length,
const _Ty2* const _Needle, const size_t _Needle_length) noexcept {
Expand Down Expand Up @@ -817,48 +843,31 @@ constexpr size_t _Traits_find_first_of(_In_reads_(_Hay_size) const _Traits_ptr_t
const auto _Hay_end = _Haystack + _Hay_size;

if constexpr (_Is_implementation_handled_char_traits<_Traits>) {
if (!_STD _Is_constant_evaluated()) {
using _Elem = typename _Traits::char_type;

#if _USE_STD_VECTOR_ALGORITHMS
const bool _Try_vectorize = _Hay_size - _Start_at > _Threshold_find_first_of;

// Additional condition for when the vectorization outperforms the table lookup
constexpr size_t _Find_first_of_bitmap_threshold = sizeof(_Elem) == 1 ? 48 : sizeof(_Elem) == 8 ? 8 : 16;

const bool _Use_bitmap = !_Try_vectorize || _Needle_size > _Find_first_of_bitmap_threshold;
#else // ^^^ _USE_STD_VECTOR_ALGORITHMS / !_USE_STD_VECTOR_ALGORITHMS vvv
const bool _Use_bitmap = true;
#endif // ^^^ !_USE_STD_VECTOR_ALGORITHMS ^^^

if (_Use_bitmap) {
_String_bitmap<_Elem> _Matches;

if (_Matches._Mark(_Needle, _Needle + _Needle_size)) {
for (auto _Match_try = _Hay_start; _Match_try < _Hay_end; ++_Match_try) {
if (_Matches._Match(*_Match_try)) {
return static_cast<size_t>(_Match_try - _Haystack); // found a match
}
}
return static_cast<size_t>(-1); // no match
if (!_STD _Is_constant_evaluated()) {
const size_t _Remaining_size = _Hay_size - _Start_at;
if (_Remaining_size + _Needle_size >= _Threshold_find_first_of) {
size_t _Pos = _Find_first_of_pos_vectorized(_Hay_start, _Remaining_size, _Needle, _Needle_size);
if (_Pos != static_cast<size_t>(-1)) {
_Pos += _Start_at;
}

// couldn't put one of the characters into the bitmap, fall back to vectorized or serial algorithms
return _Pos;
}
}
#endif // _USE_STD_VECTOR_ALGORITHMS

#if _USE_STD_VECTOR_ALGORITHMS
if (_Try_vectorize) {
const _Traits_ptr_t<_Traits> _Found =
_STD _Find_first_of_vectorized(_Hay_start, _Hay_end, _Needle, _Needle + _Needle_size);

if (_Found != _Hay_end) {
return static_cast<size_t>(_Found - _Haystack); // found a match
} else {
return static_cast<size_t>(-1); // no match
_String_bitmap<typename _Traits::char_type> _Matches;

if (_Matches._Mark(_Needle, _Needle + _Needle_size)) {
for (auto _Match_try = _Hay_start; _Match_try < _Hay_end; ++_Match_try) {
if (_Matches._Match(*_Match_try)) {
return static_cast<size_t>(_Match_try - _Haystack); // found a match
}
}
#endif // _USE_STD_VECTOR_ALGORITHMS
return static_cast<size_t>(-1); // no match
}

// couldn't put one of the characters into the bitmap, fall back to serial algorithm
}

for (auto _Match_try = _Hay_start; _Match_try < _Hay_end; ++_Match_try) {
Expand All @@ -882,47 +891,32 @@ constexpr size_t _Traits_find_last_of(_In_reads_(_Hay_size) const _Traits_ptr_t<
const auto _Hay_start = (_STD min)(_Start_at, _Hay_size - 1);

if constexpr (_Is_implementation_handled_char_traits<_Traits>) {
if (!_STD _Is_constant_evaluated()) {
using _Elem = typename _Traits::char_type;

bool _Use_bitmap = true;
using _Elem = typename _Traits::char_type;
#if _USE_STD_VECTOR_ALGORITHMS
bool _Try_vectorize = false;

if constexpr (sizeof(_Elem) <= 2) {
_Try_vectorize = _Hay_start + 1 > _Threshold_find_first_of;
// Additional condition for when the vectorization outperforms the table lookup
constexpr size_t _Find_last_of_bitmap_threshold = sizeof(_Elem) == 1 ? 48 : 8;

_Use_bitmap = !_Try_vectorize || _Needle_size > _Find_last_of_bitmap_threshold;
if constexpr (sizeof(_Elem) <= 2) {
if (!_STD _Is_constant_evaluated()) {
const size_t _Remaining_size = _Hay_start + 1;
if (_Remaining_size + _Needle_size >= _Threshold_find_first_of) {
return _Find_last_of_pos_vectorized(_Haystack, _Remaining_size, _Needle, _Needle_size);
}
}
}
#endif // _USE_STD_VECTOR_ALGORITHMS

if (_Use_bitmap) {
_String_bitmap<_Elem> _Matches;
if (_Matches._Mark(_Needle, _Needle + _Needle_size)) {
for (auto _Match_try = _Haystack + _Hay_start;; --_Match_try) {
if (_Matches._Match(*_Match_try)) {
return static_cast<size_t>(_Match_try - _Haystack); // found a match
}

if (_Match_try == _Haystack) {
return static_cast<size_t>(-1); // at beginning, no more chance for match
}
}
_String_bitmap<_Elem> _Matches;
if (_Matches._Mark(_Needle, _Needle + _Needle_size)) {
for (auto _Match_try = _Haystack + _Hay_start;; --_Match_try) {
if (_Matches._Match(*_Match_try)) {
return static_cast<size_t>(_Match_try - _Haystack); // found a match
}

// couldn't put one of the characters into the bitmap, fall back to vectorized or serial algorithms
}

#if _USE_STD_VECTOR_ALGORITHMS
if constexpr (sizeof(_Elem) <= 2) {
if (_Try_vectorize) {
return _STD _Find_last_of_pos_vectorized(_Haystack, _Hay_start + 1, _Needle, _Needle_size);
if (_Match_try == _Haystack) {
return static_cast<size_t>(-1); // at beginning, no more chance for match
}
}
#endif // _USE_STD_VECTOR_ALGORITHMS
}

// couldn't put one of the characters into the bitmap, fall back to serial algorithm
}

for (auto _Match_try = _Haystack + _Hay_start;; --_Match_try) {
Expand Down
Loading