Skip to content

Commit

Permalink
safekeeper: add evicted_timelines gauge. (#9318)
Browse files Browse the repository at this point in the history
showing total number of evicted timelines.
  • Loading branch information
arssher authored Oct 9, 2024
1 parent fc73971 commit a181392
Show file tree
Hide file tree
Showing 6 changed files with 81 additions and 14 deletions.
12 changes: 10 additions & 2 deletions safekeeper/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ use metrics::{
core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts},
proto::MetricFamily,
register_histogram_vec, register_int_counter, register_int_counter_pair,
register_int_counter_pair_vec, register_int_counter_vec, Gauge, HistogramVec, IntCounter,
IntCounterPair, IntCounterPairVec, IntCounterVec, IntGaugeVec,
register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, Gauge,
HistogramVec, IntCounter, IntCounterPair, IntCounterPairVec, IntCounterVec, IntGaugeVec,
};
use once_cell::sync::Lazy;

Expand Down Expand Up @@ -231,6 +231,14 @@ pub(crate) static EVICTION_EVENTS_COMPLETED: Lazy<IntCounterVec> = Lazy::new(||
.expect("Failed to register metric")
});

pub static NUM_EVICTED_TIMELINES: Lazy<IntGauge> = Lazy::new(|| {
register_int_gauge!(
"safekeeper_evicted_timelines",
"Number of currently evicted timelines"
)
.expect("Failed to register metric")
});

pub const LABEL_UNKNOWN: &str = "unknown";

/// Labels for traffic metrics.
Expand Down
8 changes: 7 additions & 1 deletion safekeeper/src/timeline.rs
Original file line number Diff line number Diff line change
Expand Up @@ -631,13 +631,19 @@ impl Timeline {

return Err(e);
}
self.bootstrap(conf, broker_active_set, partial_backup_rate_limiter);
self.bootstrap(
shared_state,
conf,
broker_active_set,
partial_backup_rate_limiter,
);
Ok(())
}

/// Bootstrap new or existing timeline starting background tasks.
pub fn bootstrap(
self: &Arc<Timeline>,
_shared_state: &mut WriteGuardSharedState<'_>,
conf: &SafeKeeperConf,
broker_active_set: Arc<TimelinesSet>,
partial_backup_rate_limiter: RateLimiter,
Expand Down
6 changes: 5 additions & 1 deletion safekeeper/src/timeline_eviction.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@ use tracing::{debug, info, instrument, warn};
use utils::crashsafe::durable_rename;

use crate::{
metrics::{EvictionEvent, EVICTION_EVENTS_COMPLETED, EVICTION_EVENTS_STARTED},
metrics::{
EvictionEvent, EVICTION_EVENTS_COMPLETED, EVICTION_EVENTS_STARTED, NUM_EVICTED_TIMELINES,
},
rate_limit::rand_duration,
timeline_manager::{Manager, StateSnapshot},
wal_backup,
Expand Down Expand Up @@ -93,6 +95,7 @@ impl Manager {
}

info!("successfully evicted timeline");
NUM_EVICTED_TIMELINES.inc();
}

/// Attempt to restore evicted timeline from remote storage; it must be
Expand Down Expand Up @@ -128,6 +131,7 @@ impl Manager {
tokio::time::Instant::now() + rand_duration(&self.conf.eviction_min_resident);

info!("successfully restored evicted timeline");
NUM_EVICTED_TIMELINES.dec();
}
}

Expand Down
15 changes: 14 additions & 1 deletion safekeeper/src/timeline_manager.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,10 @@ use utils::lsn::Lsn;

use crate::{
control_file::{FileStorage, Storage},
metrics::{MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL, MISC_OPERATION_SECONDS},
metrics::{
MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL, MISC_OPERATION_SECONDS,
NUM_EVICTED_TIMELINES,
},
rate_limit::{rand_duration, RateLimiter},
recovery::recovery_main,
remove_wal::calc_horizon_lsn,
Expand Down Expand Up @@ -251,6 +254,11 @@ pub async fn main_task(
mgr.recovery_task = Some(tokio::spawn(recovery_main(tli, mgr.conf.clone())));
}

// If timeline is evicted, reflect that in the metric.
if mgr.is_offloaded {
NUM_EVICTED_TIMELINES.inc();
}

let last_state = 'outer: loop {
MANAGER_ITERATIONS_TOTAL.inc();

Expand Down Expand Up @@ -367,6 +375,11 @@ pub async fn main_task(
mgr.update_wal_removal_end(res);
}

// If timeline is deleted while evicted decrement the gauge.
if mgr.tli.is_cancelled() && mgr.is_offloaded {
NUM_EVICTED_TIMELINES.dec();
}

mgr.set_status(Status::Finished);
}

Expand Down
12 changes: 10 additions & 2 deletions safekeeper/src/timelines_global_map.rs
Original file line number Diff line number Diff line change
Expand Up @@ -165,12 +165,14 @@ impl GlobalTimelines {
match Timeline::load_timeline(&conf, ttid) {
Ok(timeline) => {
let tli = Arc::new(timeline);
let mut shared_state = tli.write_shared_state().await;
TIMELINES_STATE
.lock()
.unwrap()
.timelines
.insert(ttid, tli.clone());
tli.bootstrap(
&mut shared_state,
&conf,
broker_active_set.clone(),
partial_backup_rate_limiter.clone(),
Expand Down Expand Up @@ -213,6 +215,7 @@ impl GlobalTimelines {
match Timeline::load_timeline(&conf, ttid) {
Ok(timeline) => {
let tli = Arc::new(timeline);
let mut shared_state = tli.write_shared_state().await;

// TODO: prevent concurrent timeline creation/loading
{
Expand All @@ -227,8 +230,13 @@ impl GlobalTimelines {
state.timelines.insert(ttid, tli.clone());
}

tli.bootstrap(&conf, broker_active_set, partial_backup_rate_limiter);

tli.bootstrap(
&mut shared_state,
&conf,
broker_active_set,
partial_backup_rate_limiter,
);
drop(shared_state);
Ok(tli)
}
// If we can't load a timeline, it's bad. Caller will figure it out.
Expand Down
42 changes: 35 additions & 7 deletions test_runner/regress/test_wal_acceptor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2314,12 +2314,12 @@ def test_s3_eviction(
]
if delete_offloaded_wal:
neon_env_builder.safekeeper_extra_opts.append("--delete-offloaded-wal")

env = neon_env_builder.init_start(
initial_tenant_conf={
"checkpoint_timeout": "100ms",
}
)
# make lagging_wal_timeout small to force pageserver quickly forget about
# safekeeper after it stops sending updates (timeline is deactivated) to
# make test faster. Won't be needed with
# https://github.com/neondatabase/neon/issues/8148 fixed.
initial_tenant_conf = {"lagging_wal_timeout": "1s", "checkpoint_timeout": "100ms"}
env = neon_env_builder.init_start(initial_tenant_conf=initial_tenant_conf)

n_timelines = 5

Expand Down Expand Up @@ -2407,9 +2407,37 @@ def test_s3_eviction(
and sk.log_contains("successfully restored evicted timeline")
for sk in env.safekeepers
)

assert event_metrics_seen

# test safekeeper_evicted_timelines metric
log.info("testing safekeeper_evicted_timelines metric")
# checkpoint pageserver to force remote_consistent_lsn update
for i in range(n_timelines):
ps_client.timeline_checkpoint(env.initial_tenant, timelines[i], wait_until_uploaded=True)
for ep in endpoints:
log.info(ep.is_running())
sk = env.safekeepers[0]

# all timelines must be evicted eventually
def all_evicted():
n_evicted = sk.http_client().get_metric_value("safekeeper_evicted_timelines")
assert n_evicted # make mypy happy
assert int(n_evicted) == n_timelines

wait_until(60, 0.5, all_evicted)
# restart should preserve the metric value
sk.stop().start()
wait_until(60, 0.5, all_evicted)
# and endpoint start should reduce is
endpoints[0].start()

def one_unevicted():
n_evicted = sk.http_client().get_metric_value("safekeeper_evicted_timelines")
assert n_evicted # make mypy happy
assert int(n_evicted) < n_timelines

wait_until(60, 0.5, one_unevicted)


# Test resetting uploaded partial segment state.
def test_backup_partial_reset(neon_env_builder: NeonEnvBuilder):
Expand Down

1 comment on commit a181392

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

5173 tests run: 4953 passed, 3 failed, 217 skipped (full report)


Failures on Postgres 16

  • test_pgbench[neon-github-actions-selfhosted-45-10]: release-x86-64
  • test_pgbench[vanilla-github-actions-selfhosted-45-10]: release-x86-64
  • test_basebackup_with_high_slru_count[github-actions-selfhosted-10-13-30]: release-x86-64
# Run all failed tests locally:
scripts/pytest -vv -n $(nproc) -k "test_pgbench[neon-release-pg16-github-actions-selfhosted-45-10] or test_pgbench[vanilla-release-pg16-github-actions-selfhosted-45-10] or test_basebackup_with_high_slru_count[release-pg16-github-actions-selfhosted-10-13-30]"

Code coverage* (full report)

  • functions: 31.3% (7508 of 23957 functions)
  • lines: 49.4% (60279 of 121939 lines)

* collected from Rust tests only


The comment gets automatically updated with the latest test results
a181392 at 2024-10-09T13:47:21.086Z :recycle:

Please sign in to comment.