From f5b9af6ac7f8698d3bf724c7d2ef59d4a9c17e74 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 8 Dec 2023 16:27:21 +0100 Subject: [PATCH] page cache: improve eviction-related metrics (#6077) These changes help with identifying thrashing. The existing `pageserver_page_cache_find_victim_iters_total` is already useful, but, it doesn't tell us how many individual find_victim() calls are happening, only how many clock-LRU steps happened in the entire system, without info about whether we needed to actually evict other data vs just scan for a long time, e.g., because the cache is large. The changes in this PR allows us to 1. count each possible outcome separately, esp evictions 2. compute mean iterations/outcome I don't think anyone except me was paying close attention to `pageserver_page_cache_find_victim_iters_total` before, so, I think the slight behavior change of also counting iterations for the 'iters exceeded' case is fine. refs https://github.com/neondatabase/cloud/issues/8351 refs https://github.com/neondatabase/neon/issues/5479 --- pageserver/src/metrics.rs | 65 +++++++++++++++++++++++++++++++----- pageserver/src/page_cache.rs | 24 ++++++++++--- 2 files changed, 77 insertions(+), 12 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 80113220ae50..dc7f89cf3a2d 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -285,6 +285,63 @@ pub static PAGE_CACHE_SIZE: Lazy = Lazy::new(|| PageCacheS }, }); +pub(crate) mod page_cache_eviction_metrics { + use std::num::NonZeroUsize; + + use metrics::{register_int_counter_vec, IntCounter, IntCounterVec}; + use once_cell::sync::Lazy; + + #[derive(Clone, Copy)] + pub(crate) enum Outcome { + FoundSlotUnused { iters: NonZeroUsize }, + FoundSlotEvicted { iters: NonZeroUsize }, + ItersExceeded { iters: NonZeroUsize }, + } + + static ITERS_TOTAL_VEC: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_page_cache_find_victim_iters_total", + "Counter for the number of iterations in the find_victim loop", + &["outcome"], + ) + .expect("failed to define a metric") + }); + + static CALLS_VEC: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_page_cache_find_victim_calls", + "Incremented at the end of each find_victim() call.\ + Filter by outcome to get e.g., eviction rate.", + &["outcome"] + ) + .unwrap() + }); + + pub(crate) fn observe(outcome: Outcome) { + macro_rules! dry { + ($label:literal, $iters:expr) => {{ + static LABEL: &'static str = $label; + static ITERS_TOTAL: Lazy = + Lazy::new(|| ITERS_TOTAL_VEC.with_label_values(&[LABEL])); + static CALLS: Lazy = + Lazy::new(|| CALLS_VEC.with_label_values(&[LABEL])); + ITERS_TOTAL.inc_by(($iters.get()) as u64); + CALLS.inc(); + }}; + } + match outcome { + Outcome::FoundSlotUnused { iters } => dry!("found_empty", iters), + Outcome::FoundSlotEvicted { iters } => { + dry!("found_evicted", iters) + } + Outcome::ItersExceeded { iters } => { + dry!("err_iters_exceeded", iters); + super::page_cache_errors_inc(super::PageCacheErrorKind::EvictIterLimit); + } + } + } +} + pub(crate) static PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME: Lazy = Lazy::new(|| { register_histogram!( "pageserver_page_cache_acquire_pinned_slot_seconds", @@ -294,14 +351,6 @@ pub(crate) static PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME: Lazy = Lazy::n .expect("failed to define a metric") }); -pub(crate) static PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL: Lazy = Lazy::new(|| { - register_int_counter!( - "pageserver_page_cache_find_victim_iters_total", - "Counter for the number of iterations in the find_victim loop", - ) - .expect("failed to define a metric") -}); - static PAGE_CACHE_ERRORS: Lazy = Lazy::new(|| { register_int_counter_vec!( "page_cache_errors_total", diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index 07020577661f..dbd85d2dcf86 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -88,7 +88,11 @@ use utils::{ lsn::Lsn, }; -use crate::{context::RequestContext, metrics::PageCacheSizeMetrics, repository::Key}; +use crate::{ + context::RequestContext, + metrics::{page_cache_eviction_metrics, PageCacheSizeMetrics}, + repository::Key, +}; static PAGE_CACHE: OnceCell = OnceCell::new(); const TEST_PAGE_CACHE_SIZE: usize = 50; @@ -897,8 +901,10 @@ impl PageCache { // Note that just yielding to tokio during iteration without such // priority boosting is likely counter-productive. We'd just give more opportunities // for B to bump usage count, further starving A. - crate::metrics::page_cache_errors_inc( - crate::metrics::PageCacheErrorKind::EvictIterLimit, + page_cache_eviction_metrics::observe( + page_cache_eviction_metrics::Outcome::ItersExceeded { + iters: iters.try_into().unwrap(), + }, ); anyhow::bail!("exceeded evict iter limit"); } @@ -909,8 +915,18 @@ impl PageCache { // remove mapping for old buffer self.remove_mapping(old_key); inner.key = None; + page_cache_eviction_metrics::observe( + page_cache_eviction_metrics::Outcome::FoundSlotEvicted { + iters: iters.try_into().unwrap(), + }, + ); + } else { + page_cache_eviction_metrics::observe( + page_cache_eviction_metrics::Outcome::FoundSlotUnused { + iters: iters.try_into().unwrap(), + }, + ); } - crate::metrics::PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL.inc_by(iters as u64); return Ok((slot_idx, inner)); } }