Skip to content

Commit

Permalink
page cache: improve eviction-related metrics (#6077)
Browse files Browse the repository at this point in the history
These changes help with identifying thrashing.

The existing `pageserver_page_cache_find_victim_iters_total` is already
useful, but, it doesn't tell us how many individual find_victim() calls
are happening, only how many clock-LRU steps happened in the entire
system,
without info about whether we needed to actually evict other data vs
just scan for a long time, e.g., because the cache is large.

The changes in this PR allows us to
1. count each possible outcome separately, esp evictions
2. compute mean iterations/outcome

I don't think anyone except me was paying close attention to
`pageserver_page_cache_find_victim_iters_total` before, so,
I think the slight behavior change of also counting iterations
for the 'iters exceeded' case is fine.

refs neondatabase/cloud#8351
refs #5479
  • Loading branch information
problame authored Dec 8, 2023
1 parent 5e98855 commit f5b9af6
Show file tree
Hide file tree
Showing 2 changed files with 77 additions and 12 deletions.
65 changes: 57 additions & 8 deletions pageserver/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,63 @@ pub static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> = Lazy::new(|| PageCacheS
},
});

pub(crate) mod page_cache_eviction_metrics {
use std::num::NonZeroUsize;

use metrics::{register_int_counter_vec, IntCounter, IntCounterVec};
use once_cell::sync::Lazy;

#[derive(Clone, Copy)]
pub(crate) enum Outcome {
FoundSlotUnused { iters: NonZeroUsize },
FoundSlotEvicted { iters: NonZeroUsize },
ItersExceeded { iters: NonZeroUsize },
}

static ITERS_TOTAL_VEC: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_page_cache_find_victim_iters_total",
"Counter for the number of iterations in the find_victim loop",
&["outcome"],
)
.expect("failed to define a metric")
});

static CALLS_VEC: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_page_cache_find_victim_calls",
"Incremented at the end of each find_victim() call.\
Filter by outcome to get e.g., eviction rate.",
&["outcome"]
)
.unwrap()
});

pub(crate) fn observe(outcome: Outcome) {
macro_rules! dry {
($label:literal, $iters:expr) => {{
static LABEL: &'static str = $label;
static ITERS_TOTAL: Lazy<IntCounter> =
Lazy::new(|| ITERS_TOTAL_VEC.with_label_values(&[LABEL]));
static CALLS: Lazy<IntCounter> =
Lazy::new(|| CALLS_VEC.with_label_values(&[LABEL]));
ITERS_TOTAL.inc_by(($iters.get()) as u64);
CALLS.inc();
}};
}
match outcome {
Outcome::FoundSlotUnused { iters } => dry!("found_empty", iters),
Outcome::FoundSlotEvicted { iters } => {
dry!("found_evicted", iters)
}
Outcome::ItersExceeded { iters } => {
dry!("err_iters_exceeded", iters);
super::page_cache_errors_inc(super::PageCacheErrorKind::EvictIterLimit);
}
}
}
}

pub(crate) static PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_page_cache_acquire_pinned_slot_seconds",
Expand All @@ -294,14 +351,6 @@ pub(crate) static PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME: Lazy<Histogram> = Lazy::n
.expect("failed to define a metric")
});

pub(crate) static PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_page_cache_find_victim_iters_total",
"Counter for the number of iterations in the find_victim loop",
)
.expect("failed to define a metric")
});

static PAGE_CACHE_ERRORS: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"page_cache_errors_total",
Expand Down
24 changes: 20 additions & 4 deletions pageserver/src/page_cache.rs
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,11 @@ use utils::{
lsn::Lsn,
};

use crate::{context::RequestContext, metrics::PageCacheSizeMetrics, repository::Key};
use crate::{
context::RequestContext,
metrics::{page_cache_eviction_metrics, PageCacheSizeMetrics},
repository::Key,
};

static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
const TEST_PAGE_CACHE_SIZE: usize = 50;
Expand Down Expand Up @@ -897,8 +901,10 @@ impl PageCache {
// Note that just yielding to tokio during iteration without such
// priority boosting is likely counter-productive. We'd just give more opportunities
// for B to bump usage count, further starving A.
crate::metrics::page_cache_errors_inc(
crate::metrics::PageCacheErrorKind::EvictIterLimit,
page_cache_eviction_metrics::observe(
page_cache_eviction_metrics::Outcome::ItersExceeded {
iters: iters.try_into().unwrap(),
},
);
anyhow::bail!("exceeded evict iter limit");
}
Expand All @@ -909,8 +915,18 @@ impl PageCache {
// remove mapping for old buffer
self.remove_mapping(old_key);
inner.key = None;
page_cache_eviction_metrics::observe(
page_cache_eviction_metrics::Outcome::FoundSlotEvicted {
iters: iters.try_into().unwrap(),
},
);
} else {
page_cache_eviction_metrics::observe(
page_cache_eviction_metrics::Outcome::FoundSlotUnused {
iters: iters.try_into().unwrap(),
},
);
}
crate::metrics::PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL.inc_by(iters as u64);
return Ok((slot_idx, inner));
}
}
Expand Down

1 comment on commit f5b9af6

@github-actions
Copy link

@github-actions github-actions bot commented on f5b9af6 Dec 8, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

2216 tests run: 2125 passed, 1 failed, 90 skipped (full report)


Failures on Postgres 15

  • test_delete_tenant_exercise_crash_safety_failpoints[real_s3-tenant-delete-before-polling-ongoing-deletions-False-Check.RETRY_WITHOUT_RESTART]: debug
# Run all failed tests locally:
scripts/pytest -vv -n $(nproc) -k "test_delete_tenant_exercise_crash_safety_failpoints[debug-pg15-real_s3-tenant-delete-before-polling-ongoing-deletions-False-Check.RETRY_WITHOUT_RESTART]"
Flaky tests (2)

Postgres 16

  • test_crafted_wal_end[last_wal_record_crossing_segment]: release

Postgres 15

  • test_statvfs_pressure_usage: debug

Test coverage report is not available

The comment gets automatically updated with the latest test results
f5b9af6 at 2023-12-08T16:21:40.143Z :recycle:

Please sign in to comment.