From 090a6443929915ed92924b6ae2077d0b229f118d Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 27 Sep 2023 14:18:05 +0200 Subject: [PATCH] metrics for resident & remote physical size without tenant/timeline dimension (#5389) This lets us compute the worst-case /storage size dashboard panel more cheaply. --- pageserver/src/metrics.rs | 97 +++++++++++++++++-- .../src/tenant/remote_timeline_client.rs | 4 +- pageserver/src/tenant/timeline.rs | 13 +-- .../src/tenant/timeline/layer_manager.rs | 2 +- 4 files changed, 94 insertions(+), 22 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index b085176f189e..de94eb81527f 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -291,6 +291,14 @@ static RESIDENT_PHYSICAL_SIZE: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +pub(crate) static RESIDENT_PHYSICAL_SIZE_GLOBAL: Lazy = Lazy::new(|| { + register_uint_gauge!( + "pageserver_resident_physical_size_global", + "Like `pageserver_resident_physical_size`, but without tenant/timeline dimensions." + ) + .expect("failed to define a metric") +}); + static REMOTE_PHYSICAL_SIZE: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_remote_physical_size", @@ -301,6 +309,14 @@ static REMOTE_PHYSICAL_SIZE: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +static REMOTE_PHYSICAL_SIZE_GLOBAL: Lazy = Lazy::new(|| { + register_uint_gauge!( + "pageserver_remote_physical_size_global", + "Like `pageserver_remote_physical_size`, but without tenant/timeline dimensions." 
+ ) + .expect("failed to define a metric") +}); + pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy = Lazy::new(|| { register_int_counter!( "pageserver_remote_ondemand_downloaded_layers_total", @@ -1209,7 +1225,7 @@ pub struct TimelineMetrics { pub load_layer_map_histo: StorageTimeMetrics, pub garbage_collect_histo: StorageTimeMetrics, pub last_record_gauge: IntGauge, - pub resident_physical_size_gauge: UIntGauge, + resident_physical_size_gauge: UIntGauge, /// copy of LayeredTimeline.current_logical_size pub current_logical_size_gauge: UIntGauge, pub num_persistent_files_created: IntCounter, @@ -1287,10 +1303,29 @@ impl TimelineMetrics { } pub fn record_new_file_metrics(&self, sz: u64) { - self.resident_physical_size_gauge.add(sz); + self.resident_physical_size_add(sz); self.num_persistent_files_created.inc_by(1); self.persistent_bytes_written.inc_by(sz); } + + pub fn resident_physical_size_sub(&self, sz: u64) { + self.resident_physical_size_gauge.sub(sz); + crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(sz); + } + + pub fn resident_physical_size_add(&self, sz: u64) { + self.resident_physical_size_gauge.add(sz); + crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.add(sz); + } + + pub fn resident_physical_size_set(&self, sz: u64) { + self.resident_physical_size_gauge.set(sz); + crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.set(sz); + } + + pub fn resident_physical_size_get(&self) -> u64 { + self.resident_physical_size_gauge.get() + } } impl Drop for TimelineMetrics { @@ -1298,7 +1333,10 @@ impl Drop for TimelineMetrics { let tenant_id = &self.tenant_id; let timeline_id = &self.timeline_id; let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]); - let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]); + { + RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get()); + let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]); + } let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, 
timeline_id]); let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]); let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]); @@ -1352,10 +1390,43 @@ use std::time::{Duration, Instant}; use crate::context::{PageContentKind, RequestContext}; use crate::task_mgr::TaskKind; +/// Maintain a per timeline gauge in addition to the global gauge. +struct PerTimelineRemotePhysicalSizeGauge { + last_set: u64, + gauge: UIntGauge, +} + +impl PerTimelineRemotePhysicalSizeGauge { + fn new(per_timeline_gauge: UIntGauge) -> Self { + Self { + last_set: per_timeline_gauge.get(), + gauge: per_timeline_gauge, + } + } + fn set(&mut self, sz: u64) { + self.gauge.set(sz); + if sz < self.last_set { + REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set - sz); + } else { + REMOTE_PHYSICAL_SIZE_GLOBAL.add(sz - self.last_set); + }; + self.last_set = sz; + } + fn get(&self) -> u64 { + self.gauge.get() + } +} + +impl Drop for PerTimelineRemotePhysicalSizeGauge { + fn drop(&mut self) { + REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set); + } +} + pub struct RemoteTimelineClientMetrics { tenant_id: String, timeline_id: String, - remote_physical_size_gauge: Mutex>, + remote_physical_size_gauge: Mutex>, calls_unfinished_gauge: Mutex>, bytes_started_counter: Mutex>, bytes_finished_counter: Mutex>, @@ -1373,18 +1444,24 @@ impl RemoteTimelineClientMetrics { } } - pub fn remote_physical_size_gauge(&self) -> UIntGauge { + pub(crate) fn remote_physical_size_set(&self, sz: u64) { let mut guard = self.remote_physical_size_gauge.lock().unwrap(); - guard - .get_or_insert_with(|| { + let gauge = guard.get_or_insert_with(|| { + PerTimelineRemotePhysicalSizeGauge::new( REMOTE_PHYSICAL_SIZE .get_metric_with_label_values(&[ &self.tenant_id.to_string(), &self.timeline_id.to_string(), ]) - .unwrap() - }) - .clone() + .unwrap(), + ) + }); + gauge.set(sz); + } + + pub(crate) fn remote_physical_size_get(&self) -> u64 { + let guard = 
self.remote_physical_size_gauge.lock().unwrap(); + guard.as_ref().map(|gauge| gauge.get()).unwrap_or(0) } pub fn remote_operation_time( diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 4e495d9bb2ab..ee99151ef26b 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -453,11 +453,11 @@ impl RemoteTimelineClient { } else { 0 }; - self.metrics.remote_physical_size_gauge().set(size); + self.metrics.remote_physical_size_set(size); } pub fn get_remote_physical_size(&self) -> u64 { - self.metrics.remote_physical_size_gauge().get() + self.metrics.remote_physical_size_get() } // diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 4fa5039d7946..9b62ba1c5023 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -559,7 +559,7 @@ impl Timeline { } pub fn resident_physical_size(&self) -> u64 { - self.metrics.resident_physical_size_gauge.get() + self.metrics.resident_physical_size_get() } /// @@ -1309,10 +1309,7 @@ impl Timeline { // will treat the file as a local layer again, count it towards resident size, // and it'll be like the layer removal never happened. // The bump in resident size is perhaps unexpected but overall a robust behavior. 
- self.metrics - .resident_physical_size_gauge - .sub(layer_file_size); - + self.metrics.resident_physical_size_sub(layer_file_size); self.metrics.evictions.inc(); if let Some(delta) = local_layer_residence_duration { @@ -1846,9 +1843,7 @@ impl Timeline { "loaded layer map with {} layers at {}, total physical size: {}", num_layers, disk_consistent_lsn, total_physical_size ); - self.metrics - .resident_physical_size_gauge - .set(total_physical_size); + self.metrics.resident_physical_size_set(total_physical_size); timer.stop_and_record(); Ok(()) @@ -4398,7 +4393,7 @@ impl Timeline { // XXX the temp file is still around in Err() case // and consumes space until we clean up upon pageserver restart. - self_clone.metrics.resident_physical_size_gauge.add(*size); + self_clone.metrics.resident_physical_size_add(*size); // Download complete. Replace the RemoteLayer with the corresponding // Delta- or ImageLayer in the layer map. diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index 3c88d31f24c2..0a387bd7797c 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -263,7 +263,7 @@ impl LayerManager { let desc = layer.layer_desc(); if !layer.is_remote_layer() { layer.delete_resident_layer_file()?; - metrics.resident_physical_size_gauge.sub(desc.file_size); + metrics.resident_physical_size_sub(desc.file_size); } // TODO Removing from the bottom of the layer map is expensive.