From 5fdfb60354794e5648f91455d4f4a8953e623c05 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 21 Jun 2024 16:52:56 +0100 Subject: [PATCH] pageserver: add supplementary branch usag stats --- libs/pageserver_api/src/models.rs | 10 +++++++++ pageserver/src/http/routes.rs | 4 ++++ pageserver/src/metrics.rs | 35 +++++++++++++++++++++++++++++++ pageserver/src/tenant.rs | 27 ++++++++++++++++++++++++ pageserver/src/tenant/timeline.rs | 15 +++++++++++++ test_runner/fixtures/metrics.py | 2 ++ 6 files changed, 93 insertions(+) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 959e161c167a..92289537613d 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -661,6 +661,16 @@ pub struct TimelineInfo { pub current_physical_size: Option, // is None when timeline is Unloaded pub current_logical_size_non_incremental: Option, + /// How many bytes of WAL are within this branch's pitr_interval. If the pitr_interval goes + /// beyond the branch's branch point, we only count up to the branch point. + pub pitr_history_size: u64, + + /// Whether this branch's branch point is within its ancestor's PITR interval (i.e. any + /// ancestor data used by this branch would have been retained anyway). If this is false, then + /// this branch may be imposing a cost on the ancestor by causing it to retain layers that it would + /// otherwise be able to GC. + pub within_ancestor_pitr: bool, + pub timeline_dir_layer_file_size_sum: Option, pub wal_source_connstr: Option, diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index f726ba115d83..6a6f17604dee 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -406,6 +406,8 @@ async fn build_timeline_info_common( let walreceiver_status = timeline.walreceiver_status(); + let (pitr_history_size, within_ancestor_pitr) = timeline.get_pitr_history_stats(); + let info = TimelineInfo { tenant_id: timeline.tenant_shard_id, timeline_id: timeline.timeline_id, @@ -426,6 +428,8 @@ async fn build_timeline_info_common( directory_entries_counts: timeline.get_directory_metrics().to_vec(), current_physical_size, current_logical_size_non_incremental: None, + pitr_history_size, + within_ancestor_pitr, timeline_dir_layer_file_size_sum: None, wal_source_connstr, last_received_msg_lsn, diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 9cd7ffa0426c..1f7f2badd81d 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -464,6 +464,24 @@ static LAST_RECORD_LSN: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +static PITR_HISTORY_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_pitr_history_size", + "Data written since PITR cutoff on this timeline", + &["tenant_id", "shard_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + +static TIMELINE_ARCHIVE_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_archive_size", + "Timeline's logical size if it is considered eligible for archival (outside PITR window), else zero", + &["tenant_id", "shard_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + static STANDBY_HORIZON: Lazy = Lazy::new(|| { register_int_gauge_vec!( "pageserver_standby_horizon", @@ -2102,6 +2120,8 @@ pub(crate) struct TimelineMetrics { pub garbage_collect_histo: StorageTimeMetrics, pub find_gc_cutoffs_histo: StorageTimeMetrics, pub last_record_gauge: IntGauge, + pub pitr_history_size: UIntGauge, + pub archival_size: UIntGauge, pub standby_horizon_gauge: IntGauge, pub resident_physical_size_gauge: UIntGauge, /// copy of LayeredTimeline.current_logical_size @@ -2175,6 +2195,15 @@ impl TimelineMetrics { let last_record_gauge = LAST_RECORD_LSN .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); + + let pitr_history_size = PITR_HISTORY_SIZE + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); + + let archival_size = TIMELINE_ARCHIVE_SIZE + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); + let standby_horizon_gauge = STANDBY_HORIZON .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); @@ -2227,6 +2256,8 @@ impl TimelineMetrics { find_gc_cutoffs_histo, load_layer_map_histo, last_record_gauge, + pitr_history_size, + archival_size, standby_horizon_gauge, resident_physical_size_gauge, current_logical_size_gauge, @@ -2284,6 +2315,10 @@ impl TimelineMetrics { if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) { let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]); } + + let _ = TIMELINE_ARCHIVE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); + let _ = PITR_HISTORY_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); + let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = VALID_LSN_LEASE_COUNT.remove_label_values(&[tenant_id, shard_id, timeline_id]); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 89bf89471cef..b99bcb4c2876 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2868,6 +2868,7 @@ impl Tenant { { let mut target = timeline.gc_info.write().unwrap(); + // Cull any expired leases let now = SystemTime::now(); target.leases.retain(|_, lease| !lease.is_expired(&now)); @@ -2876,6 +2877,31 @@ impl Tenant { .valid_lsn_lease_count_gauge .set(target.leases.len() as u64); + // Look up parent's PITR cutoff to update the child's knowledge of whether it is within parent's PITR + if let Some(ancestor_id) = timeline.get_ancestor_timeline_id() { + if let Some(ancestor_gc_cutoffs) = gc_cutoffs.get(&ancestor_id) { + target.within_ancestor_pitr = + timeline.get_ancestor_lsn() >= ancestor_gc_cutoffs.pitr; + } + } + + // Update metrics that depend on GC state + timeline + .metrics + .archival_size + .set(if target.within_ancestor_pitr { + timeline.metrics.current_logical_size_gauge.get() + } else { + 0 + }); + timeline.metrics.pitr_history_size.set( + timeline + .get_last_record_lsn() + .checked_sub(target.cutoffs.pitr) + .unwrap_or(Lsn(0)) + .0, + ); + match gc_cutoffs.remove(&timeline.timeline_id) { Some(cutoffs) => { target.retain_lsns = branchpoints; @@ -7046,6 +7072,7 @@ mod tests { horizon: Lsn(0x30), }, leases: Default::default(), + within_ancestor_pitr: false, }; } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index de9361d72103..2d2ae9c036fc 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -463,6 +463,9 @@ pub(crate) struct GcInfo { /// Leases granted to particular LSNs. pub(crate) leases: BTreeMap, + + /// Whether our branch point is within our ancestor's PITR interval (for cost estimation) + pub(crate) within_ancestor_pitr: bool, } impl GcInfo { @@ -851,6 +854,18 @@ impl Timeline { .map(|ancestor| ancestor.timeline_id) } + /// Get the bytes written since the PITR cutoff on this branch, and + /// whether this branch's ancestor_lsn is within its parent's PITR. + pub(crate) fn get_pitr_history_stats(&self) -> (u64, bool) { + let gc_info = self.gc_info.read().unwrap(); + let history = self + .get_last_record_lsn() + .checked_sub(gc_info.cutoffs.pitr) + .unwrap_or(Lsn(0)) + .0; + (history, gc_info.within_ancestor_pitr) + } + /// Lock and get timeline's GC cutoff pub(crate) fn get_latest_gc_cutoff_lsn(&self) -> RcuReadGuard { self.latest_gc_cutoff_lsn.read() diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 41fa8e679f28..c019cbbc7790 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -144,6 +144,8 @@ def histogram(prefix_without_trailing_underscore: str) -> List[str]: "pageserver_smgr_query_seconds_bucket", "pageserver_smgr_query_seconds_count", "pageserver_smgr_query_seconds_sum", + "pageserver_archive_size", + "pageserver_pitr_history_size", "pageserver_storage_operations_seconds_count_total", "pageserver_storage_operations_seconds_sum_total", "pageserver_evictions_total",