From f63722094b92019f365135bc2792f8a5997afbeb Mon Sep 17 00:00:00 2001 From: Levi Morrison Date: Tue, 7 May 2024 22:16:35 -0600 Subject: [PATCH] perf: increase min buckets on very small types Consider `HashSet` on x86_64 with SSE with various bucket sizes and how many bytes the allocation ends up being: | buckets | capacity | allocated bytes | | ------- | -------- | --------------- | | 4 | 3 | 36 | | 8 | 7 | 40 | | 16 | 14 | 48 | | 32 | 28 | 80 | In general, doubling the number of buckets should roughly double the number of bytes used. However, for small bucket sizes for these small TableLayouts (4 -> 8, 8 -> 16), it doesn't happen. This is an edge case which happens because of padding of the control bytes and adding the Group::WIDTH. Taking the buckets from 4 to 16 (4x) only takes the allocated bytes from 36 to 48 (~1.3x). This platform isn't the only one with edges. Here's aarch64 on an M1 for the same `HashSet`: | buckets | capacity | allocated bytes | | ------- | -------- | --------------- | | 4 | 3 | 20 | | 8 | 7 | 24 | | 16 | 14 | 40 | Notice 4 -> 8 buckets leading to only 4 more bytes (20 -> 24) instead of roughly doubling. Generalized, `buckets * table_layout.size` needs to be at least as big as `table_layout.ctrl_align`. For the cases I listed above, we'd get these new minimum bucket sizes: - x86_64 with SSE: 16 - aarch64: 8 This is a niche optimization. However, it also removes possible undefined behavior edge case in resize operations. In addition, it may be a useful property to utilize over-sized allocations (see rust-lang/hashbrown#523). --- src/raw/mod.rs | 66 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 53 insertions(+), 13 deletions(-) diff --git a/src/raw/mod.rs b/src/raw/mod.rs index c8e8e2912..5cfd4a3dc 100644 --- a/src/raw/mod.rs +++ b/src/raw/mod.rs @@ -192,14 +192,40 @@ impl ProbeSeq { // Workaround for emscripten bug emscripten-core/emscripten-fastcomp#258 #[cfg_attr(target_os = "emscripten", inline(never))] #[cfg_attr(not(target_os = "emscripten"), inline)] -fn capacity_to_buckets(cap: usize) -> Option { +fn capacity_to_buckets(cap: usize, table_layout: TableLayout) -> Option { debug_assert_ne!(cap, 0); + // todo: what happens with ZSTs? + // Consider a small layout like TableLayout { size: 1, ctrl_align: 16 } on + // a platform with Group::WIDTH of 16 (like x86_64 with SSE2). For small + // bucket sizes, this ends up wasting quite a few bytes just to pad to the + // relatively larger ctrl_align: + // + // | capacity | buckets | bytes allocated | bytes per item | + // | -------- | ------- | --------------- | -------------- | + // | 3 | 4 | 36 | (Yikes!) 12.0 | + // | 7 | 8 | 40 | (Poor) 5.7 | + // | 14 | 16 | 48 | 3.4 | + // | 28 | 32 | 80 | 3.3 | + // + // The ratio of ctrl_align / size is used to set a minimum buckets so + // that padding to the alignment isn't dominating the total allocation. + let cap = { + let q = table_layout.ctrl_align / table_layout.size.max(1); + match q.checked_mul(7) { + None => cap, + Some(x) => { + let adjusted = x / 8; + adjusted.max(cap) + } + } + }; + // For small tables we require at least 1 empty bucket so that lookups are // guaranteed to terminate if an element doesn't exist in the table. if cap < 8 { // We don't bother with a table size of 2 buckets since that can only - // hold a single element. Instead we skip directly to a 4 bucket table + // hold a single element. Instead, skip directly to a 4 bucket table // which can hold 3 elements. return Some(if cap < 4 { 4 } else { 8 }); } @@ -1126,7 +1152,7 @@ impl RawTable { // elements. If the calculation overflows then the requested bucket // count must be larger than what we have right and nothing needs to be // done. - let min_buckets = match capacity_to_buckets(min_size) { + let min_buckets = match capacity_to_buckets(min_size, Self::TABLE_LAYOUT) { Some(buckets) => buckets, None => return, }; @@ -1257,14 +1283,8 @@ impl RawTable { /// * If `self.table.items != 0`, calling of this function with `capacity` /// equal to 0 (`capacity == 0`) results in [`undefined behavior`]. /// - /// * If `capacity_to_buckets(capacity) < Group::WIDTH` and - /// `self.table.items > capacity_to_buckets(capacity)` - /// calling this function results in [`undefined behavior`]. - /// - /// * If `capacity_to_buckets(capacity) >= Group::WIDTH` and - /// `self.table.items > capacity_to_buckets(capacity)` - /// calling this function are never return (will go into an - /// infinite loop). + /// * If `self.table.items > capacity_to_buckets(capacity, Self::TABLE_LAYOUT)` + /// calling this function are never return (will loop infinitely). /// /// See [`RawTableInner::find_insert_slot`] for more information. /// @@ -1782,8 +1802,8 @@ impl RawTableInner { // SAFETY: We checked that we could successfully allocate the new table, and then // initialized all control bytes with the constant `EMPTY` byte. unsafe { - let buckets = - capacity_to_buckets(capacity).ok_or_else(|| fallibility.capacity_overflow())?; + let buckets = capacity_to_buckets(capacity, table_layout) + .ok_or_else(|| fallibility.capacity_overflow())?; let result = Self::new_uninitialized(alloc, table_layout, buckets, fallibility)?; // SAFETY: We checked that the table is allocated and therefore the table already has @@ -4566,6 +4586,26 @@ impl RawExtractIf<'_, T, A> { mod test_map { use super::*; + #[test] + fn test_minimum_capacity_for_small_types() { + #[track_caller] + fn test_t() { + let raw_table: RawTable = RawTable::with_capacity(1); + let actual_buckets = raw_table.buckets(); + let min_buckets = Group::WIDTH / core::mem::size_of::(); + assert!( + actual_buckets >= min_buckets, + "expected at least {min_buckets} buckets, got {actual_buckets} buckets" + ); + } + + test_t::(); + + // This is only "small" for some platforms, like x86_64 with SSE2, but + // there's no harm in running it on other platforms. + test_t::(); + } + fn rehash_in_place(table: &mut RawTable, hasher: impl Fn(&T) -> u64) { unsafe { table.table.rehash_in_place(