diff --git a/src/backend/backend.h b/src/backend/backend.h
index c84e00c02..cd0121e03 100644
--- a/src/backend/backend.h
+++ b/src/backend/backend.h
@@ -5,6 +5,7 @@
 #include "chunkallocator.h"
 #include "commitrange.h"
 #include "commonconfig.h"
+#include "decayrange.h"
 #include "empty_range.h"
 #include "globalrange.h"
 #include "largebuddyrange.h"
@@ -144,9 +145,10 @@ namespace snmalloc
     using GlobalR = GlobalRange<StatsR>;
 
 #  ifdef SNMALLOC_META_PROTECTED
+    using CommittedRange =
+      DecayRange<CommitRange<GlobalR, DefaultPal>, DefaultPal, Pagemap>;
     // Source for object allocations
-    using ObjectRange =
-      LargeBuddyRange<CommitRange<GlobalR, DefaultPal>, 21, 21, Pagemap>;
+    using ObjectRange = LargeBuddyRange<CommittedRange, 21, 21, Pagemap>;
     // Set up protected range for metadata
     using SubR = CommitRange<SubRange<GlobalR, DefaultPal, 6>, DefaultPal>;
     using MetaRange =
@@ -155,8 +157,10 @@
 #  else
     // Source for object allocations and metadata
     // No separation between the two
-    using ObjectRange = SmallBuddyRange<
-      LargeBuddyRange<CommitRange<GlobalR, DefaultPal>, 21, 21, Pagemap>>;
+    using CommittedRange =
+      DecayRange<CommitRange<GlobalR, DefaultPal>, DefaultPal, Pagemap>;
+    using ObjectRange =
+      SmallBuddyRange<LargeBuddyRange<CommittedRange, 21, 21, Pagemap>>;
     using GlobalMetaRange = GlobalRange<ObjectRange>;
 #  endif
 #endif
diff --git a/src/backend/decayrange.h b/src/backend/decayrange.h
new file mode 100644
index 000000000..8509e3b48
--- /dev/null
+++ b/src/backend/decayrange.h
@@ -0,0 +1,290 @@
+#pragma once
+
+#include "../ds/ptrwrap.h"
+#include "../pal/pal_ds.h"
+#include "largebuddyrange.h"
+
+namespace snmalloc
+{
+  /**
+   * Forward-linked list of chunks, where the links are stored in the
+   * pagemap entries (via Rep) rather than in the chunks themselves.
+   */
+  template<typename Rep>
+  class PagemapList
+  {
+    uintptr_t head = 0;
+
+    PagemapList(uintptr_t head) : head(head) {}
+
+  public:
+    constexpr PagemapList() = default;
+
+    bool is_empty() const
+    {
+      return head == 0;
+    }
+
+    PagemapList get_next()
+    {
+      SNMALLOC_ASSERT(!is_empty());
+      auto next_field = &(Rep::ref(false, head));
+      auto next = Rep::get(next_field);
+      return {next};
+    }
+
+    capptr::Chunk<void> get_capability()
+    {
+      return capptr::Chunk<void>(reinterpret_cast<void*>(head));
+    }
+
+    PagemapList cons(capptr::Chunk<void> new_head_cap)
+    {
+      auto new_head = new_head_cap.unsafe_uintptr();
+      auto field = &(Rep::ref(false, new_head));
+      Rep::set(field, head);
+      return {new_head};
+    }
+  };
+
+  /**
+   * Concurrent Stack
+   *
+   * This stack supports the following clients
+   *   (push|pop)* || pop_all* || ... || pop_all*
+   *
+   * That is, a single thread may push and pop, while any number of other
+   * threads may call pop_all.  If pop_all returns a value, it returns the
+   * whole stack; however, it may return an empty list if it races with
+   * either a push or a pop.
+   *
+   * The primary use case is single-threaded access, where other threads
+   * can attempt to steal all the values.
+   */
+  template<typename Rep>
+  class PagemapStack
+  {
+    static constexpr auto empty = PagemapList<Rep>{};
+
+  private:
+    alignas(CACHELINE_SIZE) std::atomic<PagemapList<Rep>> stack{};
+
+    PagemapList<Rep> take()
+    {
+      if (stack.load(std::memory_order_relaxed).is_empty())
+        return empty;
+      return stack.exchange(empty, std::memory_order_acquire);
+    }
+
+    void replace(PagemapList<Rep> new_head)
+    {
+      SNMALLOC_ASSERT(stack.load().is_empty());
+      stack.store(new_head, std::memory_order_release);
+    }
+
+  public:
+    constexpr PagemapStack() = default;
+
+    void push(capptr::Chunk<void> new_head_cap)
+    {
+      auto old_head = take();
+      auto new_head = old_head.cons(new_head_cap);
+      replace(new_head);
+    }
+
+    capptr::Chunk<void> pop()
+    {
+      auto old_head = take();
+      if (old_head.is_empty())
+        return nullptr;
+
+      auto next = old_head.get_next();
+      auto result = old_head.get_capability();
+
+      replace(next);
+      return result;
+    }
+
+    PagemapList<Rep> pop_all()
+    {
+      return take();
+    }
+  };
+
+  template<typename ParentRange, typename PAL, typename Pagemap>
+  class DecayRange
+  {
+    typename ParentRange::State parent{};
+
+    /**
+     * How many slab sizes can be provided.
+     */
+    static constexpr size_t NUM_SLAB_SIZES =
+      Pal::address_bits - MIN_CHUNK_BITS;
+
+    /**
+     * Number of free stacks per chunk size that each allocator will use.
+     * For performance, ideally a power of 2.  We will return to the central
+     * pool anything that has not been used in the last NUM_EPOCHS - 1
+     * epochs, where each epoch is separated by
+     * DecayMemoryTimerObject::PERIOD.
+     * I.e. if the period is 500ms and the number of epochs is 4, then we
+     * return to the central pool anything not used for the last 1500-2000ms.
+     */
+    static constexpr size_t NUM_EPOCHS = 4;
+    static_assert(bits::is_pow2(NUM_EPOCHS), "Code assumes power of two.");
+
+    /**
+     * Stacks of ranges that have been returned for reuse.
+     */
+    ModArray<
+      NUM_SLAB_SIZES,
+      ModArray<NUM_EPOCHS, PagemapStack<BuddyChunkRep<Pagemap>>>>
+      chunk_stack;
+
+    /**
+     * Which is the current epoch to place dealloced chunks, and the
+     * first place we look for allocating chunks.
+     */
+    static inline // alignas(CACHELINE_SIZE)
+      std::atomic<size_t> epoch{0};
+
+    /**
+     * Flag to ensure one-shot registration with the PAL.
+     */
+    static inline std::atomic_bool registered_timer{false};
+
+    std::atomic_bool registered_local{false};
+
+    /**
+     * All activated DecayRanges.
+     */
+    static inline std::atomic<DecayRange*> all_local{nullptr};
+
+    DecayRange* next{nullptr};
+
+    static void handle_decay_tick()
+    {
+      auto new_epoch = (epoch + 1) % NUM_EPOCHS;
+      // Flush the oldest epoch for all threads.
+      auto curr = all_local.load(std::memory_order_acquire);
+      while (curr != nullptr)
+      {
+        for (size_t sc = 0; sc < NUM_SLAB_SIZES; sc++)
+        {
+          // Don't use ChunkRecord, store in pagemap.
+          auto old_stack = curr->chunk_stack[sc][new_epoch].pop_all();
+          while (!old_stack.is_empty())
+          {
+            auto next = old_stack.get_next();
+
+            curr->parent->dealloc_range(
+              old_stack.get_capability(), MIN_CHUNK_SIZE << sc);
+
+            old_stack = next;
+          }
+        }
+        curr = curr->next;
+      }
+
+      // Advance current index
+      epoch = new_epoch;
+    }
+
+    class DecayMemoryTimerObject : public PalTimerObject
+    {
+      /**
+       * Method for callback object to perform lazy decommit.
+       */
+      static void process(PalTimerObject*)
+      {
+        handle_decay_tick();
+      }
+
+      // Specify that the decay tick runs every 500ms.
+      static constexpr size_t PERIOD = 500;
+
+    public:
+      constexpr DecayMemoryTimerObject() : PalTimerObject(&process, PERIOD) {}
+    };
+
+    static inline DecayMemoryTimerObject timer_object;
+
+  public:
+    class State
+    {
+      DecayRange commit_range{};
+
+    public:
+      constexpr State() = default;
+
+      DecayRange* operator->()
+      {
+        return &commit_range;
+      }
+    };
+
+    static constexpr bool Aligned = ParentRange::Aligned;
+
+    constexpr DecayRange() = default;
+
+    capptr::Chunk<void> alloc_range(size_t size)
+    {
+      // Check local cache
+
+      if constexpr (pal_supports<Time, PAL>)
+      {
+        auto slab_sizeclass = bits::next_pow2_bits(size) - MIN_CHUNK_BITS;
+        // Try local cache of chunks first
+        for (size_t e = 0; e < NUM_EPOCHS; e++)
+        {
+          auto p = chunk_stack[slab_sizeclass][(epoch - e) % NUM_EPOCHS].pop();
+
+          if (p != nullptr)
+            return p;
+        }
+      }
+
+      capptr::Chunk<void> result;
+      for (auto i = NUM_EPOCHS + 2; i > 0; i--)
+      {
+        // Nothing in the local cache, so allocate from the parent.
+        result = parent->alloc_range(size);
+        if (result != nullptr)
+        {
+          return result;
+        }
+
+        // We have run out of memory.
+        handle_decay_tick(); // Try to free some memory.
+      }
+
+      return result;
+    }
+
+    void dealloc_range(capptr::Chunk<void> base, size_t size)
+    {
+      // Without a timer there is no decay: return memory to the parent
+      // immediately.
+      if constexpr (!pal_supports<Time, PAL>)
+      {
+        parent->dealloc_range(base, size);
+        return;
+      }
+
+      if (!registered_timer.exchange(true))
+      {
+        // Register with the PAL.
+        PAL::register_timer(&timer_object);
+      }
+
+      // Check we have registered this range locally.
+      if (!registered_local.exchange(true))
+      {
+        // Add to the list of local states.
+        auto* head = all_local.load();
+        do
+        {
+          next = head;
+        } while (!all_local.compare_exchange_strong(head, this));
+      }
+
+      auto slab_sizeclass = bits::next_pow2_bits(size) - MIN_CHUNK_BITS;
+      // Add to local cache.
+      chunk_stack[slab_sizeclass][epoch].push(base);
+    }
+  };
+} // namespace snmalloc
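
The take/replace pattern documented in the Concurrent Stack comment can be sketched standalone. The following is a simplified model under invented names (StealAllStack, Node); it is not the snmalloc implementation, which threads the links through the pagemap rather than through nodes, but the protocol is the same: the owning thread swaps the whole list out, edits it privately, and publishes the result, so pop_all from any other thread can only steal everything or observe an apparently empty stack.

#include <atomic>

struct Node
{
  Node* next{nullptr};
};

class StealAllStack
{
  std::atomic<Node*> head{nullptr};

  // Owner-only: privately acquire the whole list (skips the exchange if the
  // stack is already empty).
  Node* take()
  {
    if (head.load(std::memory_order_relaxed) == nullptr)
      return nullptr;
    return head.exchange(nullptr, std::memory_order_acquire);
  }

  // Owner-only: publish the edited list; the shared slot is currently empty.
  void replace(Node* n)
  {
    head.store(n, std::memory_order_release);
  }

public:
  // Owner thread only.
  void push(Node* n)
  {
    Node* old = take();
    n->next = old;
    replace(n);
  }

  // Owner thread only.
  Node* pop()
  {
    Node* old = take();
    if (old == nullptr)
      return nullptr;
    replace(old->next);
    return old;
  }

  // Any thread: steal the entire stack, or get nullptr if this races with
  // the owner while the owner holds the list privately.
  Node* pop_all()
  {
    return take();
  }
};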
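
The epoch machinery in DecayRange is easiest to see in isolation. The sketch below is a minimal, single-threaded model of the same idea (the names SimpleDecayCache and ReturnToParent are invented for illustration and are not snmalloc APIs): deallocations land in the bucket for the current epoch, allocations search from the current epoch backwards, and each timer tick hands the oldest bucket back to the parent before making it the current one.

// Standalone sketch of epoch-based decay; models a single size class,
// without atomics or the pagemap-linked stacks used above.
#include <array>
#include <cstddef>
#include <vector>

class SimpleDecayCache
{
  static constexpr std::size_t NUM_EPOCHS = 4;

  // One free list per epoch; DecayRange keeps one such array per slab size.
  std::array<std::vector<void*>, NUM_EPOCHS> buckets{};
  std::size_t epoch = 0;

public:
  // Deallocation path: remember the chunk in the current epoch's bucket.
  void push(void* p)
  {
    buckets[epoch].push_back(p);
  }

  // Allocation path: prefer recently freed chunks, walking from the current
  // epoch backwards through older ones.
  void* pop()
  {
    for (std::size_t e = 0; e < NUM_EPOCHS; e++)
    {
      auto& b = buckets[(epoch + NUM_EPOCHS - e) % NUM_EPOCHS];
      if (!b.empty())
      {
        void* p = b.back();
        b.pop_back();
        return p;
      }
    }
    return nullptr;
  }

  // Timer tick: the bucket about to become current holds chunks that have
  // not been reused for NUM_EPOCHS - 1 whole periods; hand them back to the
  // parent range, then make that bucket current so new deallocations land
  // in it.
  template<typename ReturnToParent>
  void tick(ReturnToParent&& return_to_parent)
  {
    std::size_t next = (epoch + 1) % NUM_EPOCHS;
    for (void* p : buckets[next])
      return_to_parent(p);
    buckets[next].clear();
    epoch = next;
  }
};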