diff --git a/pageserver/src/aux_file.rs b/pageserver/src/aux_file.rs index 982add927d59..e6d950487de0 100644 --- a/pageserver/src/aux_file.rs +++ b/pageserver/src/aux_file.rs @@ -1,8 +1,6 @@ -use std::sync::{ - atomic::{AtomicIsize, AtomicUsize}, - Arc, -}; +use std::sync::Arc; +use ::metrics::IntGauge; use bytes::{Buf, BufMut, Bytes}; use pageserver_api::key::{Key, AUX_KEY_PREFIX, METADATA_KEY_SIZE}; use tracing::warn; @@ -147,12 +145,14 @@ pub fn encode_file_value(files: &[(&str, &[u8])]) -> anyhow::Result> { /// An estimation of the size of aux files. pub struct AuxFileSizeEstimator { + aux_file_size_gauge: IntGauge, size: Arc>>, } impl AuxFileSizeEstimator { - pub fn new() -> Self { + pub fn new(aux_file_size_gauge: IntGauge) -> Self { Self { + aux_file_size_gauge, size: Arc::new(std::sync::Mutex::new(None)), } } @@ -160,12 +160,14 @@ impl AuxFileSizeEstimator { pub fn on_base_backup(&self, new_size: usize) { let mut guard = self.size.lock().unwrap(); *guard = Some(new_size as isize); + self.report(new_size as isize); } pub fn on_add(&self, file_size: usize) { let mut guard = self.size.lock().unwrap(); if let Some(size) = &mut *guard { *size += file_size as isize; + self.report(*size); } } @@ -173,6 +175,7 @@ impl AuxFileSizeEstimator { let mut guard = self.size.lock().unwrap(); if let Some(size) = &mut *guard { *size -= file_size as isize; + self.report(*size); } } @@ -180,10 +183,13 @@ impl AuxFileSizeEstimator { let mut guard = self.size.lock().unwrap(); if let Some(size) = &mut *guard { *size += new_size as isize - old_size as isize; + self.report(*size); } } - pub fn report(&self) {} + pub fn report(&self, size: isize) { + self.aux_file_size_gauge.set(size as i64); + } } #[cfg(test)] diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 256f2f334c1d..b27bfb43b077 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -585,6 +585,15 @@ static CURRENT_LOGICAL_SIZE: Lazy = Lazy::new(|| { .expect("failed to define current logical size metric") }); +static AUX_FILE_SIZE: Lazy = Lazy::new(|| { + register_int_gauge_vec!( + "pageserver_aux_file_estimated_size", + "The size of all aux files for a timeline in aux file v2 store.", + &["tenant_id", "shard_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + pub(crate) mod initial_logical_size { use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec}; use once_cell::sync::Lazy; @@ -2115,6 +2124,7 @@ pub(crate) struct TimelineMetrics { resident_physical_size_gauge: UIntGauge, /// copy of LayeredTimeline.current_logical_size pub current_logical_size_gauge: UIntGauge, + pub aux_file_size_gauge: IntGauge, pub directory_entries_count_gauge: Lazy UIntGauge>>, pub evictions: IntCounter, pub evictions_with_low_residence_duration: std::sync::RwLock, @@ -2187,6 +2197,9 @@ impl TimelineMetrics { let current_logical_size_gauge = CURRENT_LOGICAL_SIZE .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); + let aux_file_size_gauge = AUX_FILE_SIZE + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); // TODO use impl Trait syntax here once we have ability to use it: https://github.com/rust-lang/rust/issues/63065 let directory_entries_count_gauge_closure = { let tenant_shard_id = *tenant_shard_id; @@ -2224,6 +2237,7 @@ impl TimelineMetrics { last_record_gauge, resident_physical_size_gauge, current_logical_size_gauge, + aux_file_size_gauge, directory_entries_count_gauge, evictions, evictions_with_low_residence_duration: std::sync::RwLock::new( @@ -2264,6 +2278,7 @@ impl TimelineMetrics { let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]); } let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]); + let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); self.evictions_with_low_residence_duration .write() diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 0c608c6a7f9f..1c90b89d0026 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -704,9 +704,9 @@ impl Timeline { let v = v.context("get value")?; let v = aux_file::decode_file_value_bytes(&v).context("value decode")?; for (fname, content) in v { - result.insert(fname, content); sz += fname.len(); sz += content.len(); + result.insert(fname, content); } } self.aux_file_size_estimator.on_base_backup(sz); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index ce22f003525b..1b28b1116866 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2160,6 +2160,16 @@ impl Timeline { }; Arc::new_cyclic(|myself| { + let metrics = TimelineMetrics::new( + &tenant_shard_id, + &timeline_id, + crate::metrics::EvictionsWithLowResidenceDurationBuilder::new( + "mtime", + evictions_low_residence_duration_metric_threshold, + ), + ); + let aux_file_metrics = metrics.aux_file_size_gauge.clone(); + let mut result = Timeline { conf, tenant_conf, @@ -2191,14 +2201,7 @@ impl Timeline { ancestor_timeline: ancestor, ancestor_lsn: metadata.ancestor_lsn(), - metrics: TimelineMetrics::new( - &tenant_shard_id, - &timeline_id, - crate::metrics::EvictionsWithLowResidenceDurationBuilder::new( - "mtime", - evictions_low_residence_duration_metric_threshold, - ), - ), + metrics, query_metrics: crate::metrics::SmgrQueryTimePerTimeline::new( &tenant_shard_id, @@ -2263,7 +2266,7 @@ impl Timeline { n_deltas: 0, }), - aux_file_size_estimator: AuxFileSizeEstimator::new(), + aux_file_size_estimator: AuxFileSizeEstimator::new(aux_file_metrics), }; result.repartition_threshold = result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE; diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 7d34e12ca333..8fa67e75c9ab 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -149,6 +149,7 @@ def histogram(prefix_without_trailing_underscore: str) -> List[str]: "pageserver_storage_operations_seconds_sum_total", "pageserver_evictions_total", "pageserver_evictions_with_low_residence_duration_total", + "pageserver_aux_file_estimated_size", *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS, # "pageserver_directory_entries_count", -- only used if above a certain threshold # "pageserver_broken_tenants_count" -- used only for broken