Skip to content

Commit

Permalink
metric: add started and killed walredo processes counter (#5809)
Browse files Browse the repository at this point in the history
In OOM situations, knowing exactly how many walredo processes there were
at a time would help afterwards to understand why the pageserver was OOM
killed. Add the `pageserver_wal_redo_process_started_total` and
`pageserver_wal_redo_process_stopped_total` metrics to keep track of the
total number of wal redo processes started and stopped (shut down or
killed, labelled by `cause`) since pageserver start.

Closes #5722

---------

Signed-off-by: Rahul Modpur <[email protected]>
Co-authored-by: Joonas Koivunen <[email protected]>
Co-authored-by: Christian Schwarz <[email protected]>
  • Loading branch information
3 people authored and jcsp committed Nov 14, 2023
1 parent 4aa1f17 commit 8ef836a
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 9 deletions.
40 changes: 40 additions & 0 deletions pageserver/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1252,6 +1252,46 @@ pub(crate) static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
.unwrap()
});

/// Counters for WAL redo child processes that were started and that were killed.
pub(crate) struct WalRedoProcessCounters {
/// Incremented right after a WAL redo process has been spawned successfully.
pub(crate) started: IntCounter,
/// One counter per [`WalRedoKillCause`]; the matching entry is incremented
/// whenever a WAL redo child process goes through the kill-and-wait path.
pub(crate) killed_by_cause: enum_map::EnumMap<WalRedoKillCause, IntCounter>,
}

/// Why a WAL redo child process was killed.
///
/// The variant name (via `IntoStaticStr`) is used as the value of the `cause`
/// label on the per-cause kill counters, and the enum is the key of the
/// `EnumMap` holding those counters.
///
/// The enum is fieldless, so it is `Copy`: callers can index the counter map or
/// log the cause without giving up ownership of the value.
#[derive(Debug, Clone, Copy, PartialEq, Eq, enum_map::Enum, strum_macros::IntoStaticStr)]
pub(crate) enum WalRedoKillCause {
    /// The process was killed from the `Drop` implementation of `WalRedoProcess`.
    WalRedoProcessDrop,
    /// The process was killed from the `Drop` implementation of `NoLeakChild`.
    NoLeakChildDrop,
    /// The process was killed because of a problem during launch.
    Startup,
}

impl Default for WalRedoProcessCounters {
fn default() -> Self {
let started = register_int_counter!(
"pageserver_wal_redo_process_started_total",
"Number of WAL redo processes started",
)
.unwrap();

let killed = register_int_counter_vec!(
"pageserver_wal_redo_process_stopped_total",
"Number of WAL redo processes stopped",
&["cause"],
)
.unwrap();
Self {
started,
killed_by_cause: EnumMap::from_array(std::array::from_fn(|i| {
let cause = <WalRedoKillCause as enum_map::Enum>::from_usize(i);
let cause_str: &'static str = cause.into();
killed.with_label_values(&[cause_str])
})),
}
}
}

/// Lazily initialized WAL redo process counters; the metrics are registered
/// on first access through [`WalRedoProcessCounters`]'s `Default` impl.
pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy<WalRedoProcessCounters> =
    Lazy::new(<WalRedoProcessCounters as Default>::default);

/// Similar to `prometheus::HistogramTimer` but does not record on drop.
pub struct StorageTimeMetricsTimer {
metrics: StorageTimeMetrics,
Expand Down
22 changes: 13 additions & 9 deletions pageserver/src/walredo.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,8 @@ use std::sync::atomic::{AtomicUsize, Ordering};

use crate::config::PageServerConf;
use crate::metrics::{
WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME,
WalRedoKillCause, WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_COUNTERS,
WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME,
};
use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block};
use crate::repository::Key;
Expand Down Expand Up @@ -662,10 +663,10 @@ impl WalRedoProcess {
.close_fds()
.spawn_no_leak_child(tenant_id)
.context("spawn process")?;

WAL_REDO_PROCESS_COUNTERS.started.inc();
let mut child = scopeguard::guard(child, |child| {
error!("killing wal-redo-postgres process due to a problem during launch");
child.kill_and_wait();
child.kill_and_wait(WalRedoKillCause::Startup);
});

let stdin = child.stdin.take().unwrap();
Expand Down Expand Up @@ -996,7 +997,7 @@ impl Drop for WalRedoProcess {
self.child
.take()
.expect("we only do this once")
.kill_and_wait();
.kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
self.stderr_logger_cancel.cancel();
// no way to wait for stderr_logger_task from Drop because that is async only
}
Expand Down Expand Up @@ -1032,16 +1033,19 @@ impl NoLeakChild {
})
}

fn kill_and_wait(mut self) {
fn kill_and_wait(mut self, cause: WalRedoKillCause) {
let child = match self.child.take() {
Some(child) => child,
None => return,
};
Self::kill_and_wait_impl(child);
Self::kill_and_wait_impl(child, cause);
}

#[instrument(skip_all, fields(pid=child.id()))]
fn kill_and_wait_impl(mut child: Child) {
#[instrument(skip_all, fields(pid=child.id(), ?cause))]
fn kill_and_wait_impl(mut child: Child, cause: WalRedoKillCause) {
scopeguard::defer! {
WAL_REDO_PROCESS_COUNTERS.killed_by_cause[cause].inc();
}
let res = child.kill();
if let Err(e) = res {
// This branch is very unlikely because:
Expand Down Expand Up @@ -1086,7 +1090,7 @@ impl Drop for NoLeakChild {
// This thread here is going to outlive of our dropper.
let span = tracing::info_span!("walredo", %tenant_id);
let _entered = span.enter();
Self::kill_and_wait_impl(child);
Self::kill_and_wait_impl(child, WalRedoKillCause::NoLeakChildDrop);
})
.await
});
Expand Down

0 comments on commit 8ef836a

Please sign in to comment.