From 4c7f8818150d20107b806ee4a9250079f02e7e92 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Wed, 18 Dec 2024 16:13:46 -0500 Subject: [PATCH] fix(pageserver): ensure GC computes time cutoff using the same start time Signed-off-by: Alex Chi Z --- Cargo.lock | 6 +++--- pageserver/src/tenant.rs | 6 +++++- pageserver/src/tenant/timeline.rs | 5 +++-- pageserver/src/tenant/timeline/compaction.rs | 20 ++++++++++++++++++-- 4 files changed, 29 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d9ac167042ad..1a64bc2c61c5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -884,7 +884,7 @@ dependencies = [ "addr2line", "cfg-if", "libc", - "miniz_oxide 0.8.0", + "miniz_oxide 0.8.2", "object", "rustc-demangle", "windows-targets 0.52.6", @@ -3412,9 +3412,9 @@ dependencies = [ [[package]] name = "miniz_oxide" -version = "0.8.0" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2d80299ef12ff69b16a84bb182e3b9df68b5a91574d3d4fa6e41b65deec4df1" +checksum = "4ffbe83022cedc1d264172192511ae958937694cd57ce297164951b8b3568394" dependencies = [ "adler2", ] diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 2e4c47c6e40f..8fd175527353 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -4482,13 +4482,17 @@ impl Tenant { let mut gc_cutoffs: HashMap = HashMap::with_capacity(timelines.len()); + // Ensures all timelines use the same start time when computing the time cutoff. + let now = SystemTime::now(); for timeline in timelines.iter() { let cutoff = timeline .get_last_record_lsn() .checked_sub(horizon) .unwrap_or(Lsn(0)); - let cutoffs = timeline.find_gc_cutoffs(cutoff, pitr, cancel, ctx).await?; + let cutoffs = timeline + .find_gc_cutoffs(now, cutoff, pitr, cancel, ctx) + .await?; let old = gc_cutoffs.insert(timeline.timeline_id, cutoffs); assert!(old.is_none()); } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index e71cb4db80b9..0668f66c8d47 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4837,6 +4837,7 @@ impl Timeline { async fn find_gc_time_cutoff( &self, + now: SystemTime, pitr: Duration, cancel: &CancellationToken, ctx: &RequestContext, @@ -4844,7 +4845,6 @@ impl Timeline { debug_assert_current_span_has_tenant_and_timeline_id(); if self.shard_identity.is_shard_zero() { // Shard Zero has SLRU data and can calculate the PITR time -> LSN mapping itself - let now = SystemTime::now(); let time_range = if pitr == Duration::ZERO { humantime::parse_duration(DEFAULT_PITR_INTERVAL).expect("constant is invalid") } else { @@ -4930,6 +4930,7 @@ impl Timeline { #[instrument(skip_all, fields(timeline_id=%self.timeline_id))] pub(super) async fn find_gc_cutoffs( &self, + now: SystemTime, space_cutoff: Lsn, pitr: Duration, cancel: &CancellationToken, @@ -4957,7 +4958,7 @@ impl Timeline { // - if PITR interval is set, then this is our cutoff. // - if PITR interval is not set, then we do a lookup // based on DEFAULT_PITR_INTERVAL, so that size-based retention does not result in keeping history around permanently on idle databases. - let time_cutoff = self.find_gc_time_cutoff(pitr, cancel, ctx).await?; + let time_cutoff = self.find_gc_time_cutoff(now, pitr, cancel, ctx).await?; Ok(match (pitr, time_cutoff) { (Duration::ZERO, Some(time_cutoff)) => { diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index a4e8f3952265..629f3a624692 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -1798,6 +1798,22 @@ impl Timeline { Ok(()) } + /// Get a watermark for gc-compaction, that is the lowest LSN that we can use as the `gc_horizon` for + /// the compaction algorithm. It is min(space_cutoff, time_cutoff, latest_gc_cutoff, standby_horizon). + /// Leases and retain_lsns are considered in the gc-compaction job itself so we don't need to compute it + /// here. + pub(crate) fn get_gc_compaction_watermark(self: &Arc) -> Lsn { + let gc_cutoff_lsn = { + let gc_info = self.gc_info.read().unwrap(); + gc_info.min_cutoff() + }; + let watermark = gc_cutoff_lsn.min(*self.get_latest_gc_cutoff_lsn()); + let watermark = watermark.min(self.standby_horizon.load()); + // TODO: ensure the child branches will not use anything below the watermark, or consider + // them when computing the watermark. + watermark + } + /// Split a gc-compaction job into multiple compaction jobs. The split is based on the key range and the estimated size of the compaction job. /// The function returns a list of compaction jobs that can be executed separately. If the upper bound of the compact LSN /// range is not specified, we will use the latest gc_cutoff as the upper bound, so that all jobs in the jobset acts @@ -1810,7 +1826,7 @@ impl Timeline { let compact_below_lsn = if job.compact_lsn_range.end != Lsn::MAX { job.compact_lsn_range.end } else { - *self.get_latest_gc_cutoff_lsn() // use the real gc cutoff + self.get_gc_compaction_watermark() }; // Split compaction job to about 4GB each @@ -2005,7 +2021,7 @@ impl Timeline { // Therefore, it can only clean up data that cannot be cleaned up with legacy gc, instead of // cleaning everything that theoritically it could. In the future, it should use `self.gc_info` // to get the truth data. - let real_gc_cutoff = *self.get_latest_gc_cutoff_lsn(); + let real_gc_cutoff = self.get_gc_compaction_watermark(); // The compaction algorithm will keep all keys above the gc_cutoff while keeping only necessary keys below the gc_cutoff for // each of the retain_lsn. Therefore, if the user-provided `compact_lsn_range.end` is larger than the real gc cutoff, we will use // the real cutoff.