Skip to content

Commit

Permalink
pageserver_live_connections: track as counter pair (#8227)
Browse files Browse the repository at this point in the history
Generally counter pairs are preferred over gauges.
In this case, I found myself asking what the typical rate of accepted
page_service connections on a pageserver is, and I couldn't answer it
with the gauge metric.

There are a few dashboards using this metric:

https://github.com/search?q=repo%3Aneondatabase%2Fgrafana-dashboard-export%20pageserver_live_connections&type=code

I'll convert them to use the new metric once this PR reaches prod.

refs #7427
  • Loading branch information
problame authored Jul 5, 2024
1 parent 13522fb commit f0d29a0
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 22 deletions.
10 changes: 6 additions & 4 deletions pageserver/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1456,10 +1456,12 @@ impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> {
}
}

pub(crate) static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
"pageserver_live_connections",
"Number of live network connections",
pub(crate) static LIVE_CONNECTIONS: Lazy<IntCounterPairVec> = Lazy::new(|| {
register_int_counter_pair_vec!(
"pageserver_live_connections_started",
"Number of network connections that we started handling",
"pageserver_live_connections_finished",
"Number of network connections that we finished handling",
&["pageserver_connection_kind"]
)
.expect("failed to define a metric")
Expand Down
13 changes: 4 additions & 9 deletions pageserver/src/page_service.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ use crate::basebackup::BasebackupError;
use crate::context::{DownloadBehavior, RequestContext};
use crate::import_datadir::import_wal_from_tar;
use crate::metrics;
use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS_COUNT};
use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS};
use crate::pgdatadir_mapping::Version;
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
Expand Down Expand Up @@ -215,14 +215,9 @@ async fn page_service_conn_main(
auth_type: AuthType,
connection_ctx: RequestContext,
) -> anyhow::Result<()> {
// Immediately increment the gauge, then create a job to decrement it on task exit.
// One of the pros of `defer!` is that this will *most probably*
// get called, even in presence of panics.
let gauge = LIVE_CONNECTIONS_COUNT.with_label_values(&["page_service"]);
gauge.inc();
scopeguard::defer! {
gauge.dec();
}
let _guard = LIVE_CONNECTIONS
.with_label_values(&["page_service"])
.guard();

socket
.set_nodelay(true)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ use tracing::{debug, error, info, trace, warn, Instrument};
use super::TaskStateUpdate;
use crate::{
context::RequestContext,
metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST},
metrics::{LIVE_CONNECTIONS, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST},
task_mgr::TaskKind,
task_mgr::WALRECEIVER_RUNTIME,
tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo},
Expand Down Expand Up @@ -208,14 +208,9 @@ pub(super) async fn handle_walreceiver_connection(
.instrument(tracing::info_span!("poller")),
);

// Immediately increment the gauge, then create a job to decrement it on task exit.
// One of the pros of `defer!` is that this will *most probably*
// get called, even in presence of panics.
let gauge = LIVE_CONNECTIONS_COUNT.with_label_values(&["wal_receiver"]);
gauge.inc();
scopeguard::defer! {
gauge.dec();
}
let _guard = LIVE_CONNECTIONS
.with_label_values(&["wal_receiver"])
.guard();

let identify = identify_system(&replication_client).await?;
info!("{identify:?}");
Expand Down

1 comment on commit f0d29a0

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

3129 tests run: 3002 passed, 0 failed, 127 skipped (full report)


Flaky tests (2)

Postgres 14

  • test_subscriber_restart: release
  • test_delete_timeline_client_hangup: release

Code coverage* (full report)

  • functions: 32.6% (6935 of 21275 functions)
  • lines: 50.0% (54497 of 108969 lines)

* collected from Rust tests only


The comment gets automatically updated with the latest test results
f0d29a0 at 2024-07-05T21:44:02.249Z :recycle:

Please sign in to comment.