From 04b7e56541f9932e0b4647472560176a88abae2d Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 5 Jul 2024 22:17:05 +0200 Subject: [PATCH] pageserver_live_connections: track as counter pair (#8227) Generally counter pairs are preferred over gauges. In this case, I found myself asking what the typical rate of accepted page_service connections on a pageserver is, and I couldn't answer it with the gauge metric. There are a few dashboards using this metric: https://github.com/search?q=repo%3Aneondatabase%2Fgrafana-dashboard-export%20pageserver_live_connections&type=code I'll convert them to use the new metric once this PR reaches prod. refs https://github.com/neondatabase/neon/issues/7427 --- pageserver/src/metrics.rs | 10 ++++++---- pageserver/src/page_service.rs | 13 ++++--------- .../timeline/walreceiver/walreceiver_connection.rs | 13 ++++--------- 3 files changed, 14 insertions(+), 22 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 9e9fe7fbb834..59b729363147 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1456,10 +1456,12 @@ impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> { } } -pub(crate) static LIVE_CONNECTIONS_COUNT: Lazy = Lazy::new(|| { - register_int_gauge_vec!( - "pageserver_live_connections", - "Number of live network connections", +pub(crate) static LIVE_CONNECTIONS: Lazy = Lazy::new(|| { + register_int_counter_pair_vec!( + "pageserver_live_connections_started", + "Number of network connections that we started handling", + "pageserver_live_connections_finished", + "Number of network connections that we finished handling", &["pageserver_connection_kind"] ) .expect("failed to define a metric") diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index a440ad63785b..07365b5eb85e 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -55,7 +55,7 @@ use crate::basebackup::BasebackupError; use crate::context::{DownloadBehavior, RequestContext}; use crate::import_datadir::import_wal_from_tar; use crate::metrics; -use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS_COUNT}; +use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS}; use crate::pgdatadir_mapping::Version; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id; @@ -215,14 +215,9 @@ async fn page_service_conn_main( auth_type: AuthType, connection_ctx: RequestContext, ) -> anyhow::Result<()> { - // Immediately increment the gauge, then create a job to decrement it on task exit. - // One of the pros of `defer!` is that this will *most probably* - // get called, even in presence of panics. - let gauge = LIVE_CONNECTIONS_COUNT.with_label_values(&["page_service"]); - gauge.inc(); - scopeguard::defer! { - gauge.dec(); - } + let _guard = LIVE_CONNECTIONS + .with_label_values(&["page_service"]) + .guard(); socket .set_nodelay(true) diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index c6ee6b90c4d1..a66900522af4 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -26,7 +26,7 @@ use tracing::{debug, error, info, trace, warn, Instrument}; use super::TaskStateUpdate; use crate::{ context::RequestContext, - metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST}, + metrics::{LIVE_CONNECTIONS, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST}, task_mgr::TaskKind, task_mgr::WALRECEIVER_RUNTIME, tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo}, @@ -208,14 +208,9 @@ pub(super) async fn handle_walreceiver_connection( .instrument(tracing::info_span!("poller")), ); - // Immediately increment the gauge, then create a job to decrement it on task exit. - // One of the pros of `defer!` is that this will *most probably* - // get called, even in presence of panics. - let gauge = LIVE_CONNECTIONS_COUNT.with_label_values(&["wal_receiver"]); - gauge.inc(); - scopeguard::defer! { - gauge.dec(); - } + let _guard = LIVE_CONNECTIONS + .with_label_values(&["wal_receiver"]) + .guard(); let identify = identify_system(&replication_client).await?; info!("{identify:?}");