From 4c4cb80186de6d319716897117eaf5c434d5180f Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Mon, 9 Dec 2024 15:06:06 -0500
Subject: [PATCH] fix(pageserver): fix gc-compaction racing with legacy gc (#10052)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Problem

close https://github.com/neondatabase/neon/issues/10049,
close https://github.com/neondatabase/neon/issues/10030,
close https://github.com/neondatabase/neon/issues/8861

part of https://github.com/neondatabase/neon/issues/9114

The legacy gc process calls `get_latest_gc_cutoff`, which reads an Rcu that is separate from the `gc_info` struct. In the gc_compaction_smoke test case, the "latest" cutoff could be lower than the cutoffs in `gc_info`, causing gc-compaction to collect data that is still reachable via `latest_gc_cutoff`. Technically speaking, there is nothing wrong with gc-compaction using `gc_info` without considering `latest_gc_cutoff`, because `gc_info` is the source of truth. But anyway, let's fix it.

## Summary of changes

* gc-compaction uses `latest_gc_cutoff` instead of `gc_info` to determine the gc horizon.
* if a gc-compaction is scheduled via the tenant compaction iteration, it takes the gc_block lock to avoid racing with operations like detach ancestor; if it is triggered via the manual compaction API without scheduling, it does not take the lock. A rough sketch of the guard-ownership pattern is below.
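To illustrate the second point, here is a minimal, hypothetical sketch (simplified names and types, not the actual pageserver code) of how a queued compaction task can own the gc-block guard so that gc stays blocked until the last sub-compaction job finishes:

```rust
use std::sync::Arc;
use tokio::sync::{oneshot, Mutex, OwnedMutexGuard};

struct GcBlock {
    blocking: Arc<Mutex<()>>,
}

impl GcBlock {
    /// Returns an owned guard that is not tied to `&self`, so it can be stored
    /// inside a queued task and dropped whenever that task completes.
    async fn start(&self) -> OwnedMutexGuard<()> {
        self.blocking.clone().lock_owned().await
    }
}

struct ScheduledCompactionTask {
    result_tx: Option<oneshot::Sender<()>>,
    /// Held by the last job of a sub-compaction; dropping it unblocks gc.
    gc_block: Option<OwnedMutexGuard<()>>,
}

#[tokio::main]
async fn main() {
    let gc = GcBlock {
        blocking: Arc::new(Mutex::new(())),
    };
    let (tx, rx) = oneshot::channel();
    let task = ScheduledCompactionTask {
        result_tx: Some(tx),
        gc_block: Some(gc.start().await),
    };

    tokio::spawn(async move {
        // ... run the compaction job here ...
        if let Some(tx) = task.result_tx {
            tx.send(()).ok();
        }
        // the rest of `task`, including `gc_block`, is dropped here, releasing the gc block
    });
    rx.await.ok();
}
```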
---------

Signed-off-by: Alex Chi Z
Co-authored-by: Arpad Müller
---
 pageserver/src/http/routes.rs                |  2 +-
 pageserver/src/tenant.rs                     | 96 ++++++++++++++++++--
 pageserver/src/tenant/gc_block.rs            | 12 +--
 pageserver/src/tenant/timeline/compaction.rs | 16 +++-
 test_runner/regress/test_compaction.py       | 33 +++----
 5 files changed, 121 insertions(+), 38 deletions(-)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 0f11bbc50790..75d25d0a6ac7 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -2061,7 +2061,7 @@ async fn timeline_compact_handler(
     let tenant = state
         .tenant_manager
         .get_attached_tenant_shard(tenant_shard_id)?;
-    let rx = tenant.schedule_compaction(timeline_id, options).await;
+    let rx = tenant.schedule_compaction(timeline_id, options).await.map_err(ApiError::InternalServerError)?;
     if wait_until_scheduled_compaction_done {
         // It is possible that this will take a long time, dropping the HTTP request will not cancel the compaction.
         rx.await.ok();
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 4a9c44aefdbc..e71a56ed402a 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3028,14 +3028,23 @@ impl Tenant {
             let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
             let tline_pending_tasks = guard.entry(*timeline_id).or_default();
             for (idx, job) in jobs.into_iter().enumerate() {
-                tline_pending_tasks.push_back(ScheduledCompactionTask {
-                    options: job,
-                    result_tx: if idx == jobs_len - 1 {
-                        // The last compaction job sends the completion signal
-                        next_scheduled_compaction_task.result_tx.take()
-                    } else {
-                        None
-                    },
+                tline_pending_tasks.push_back(if idx == jobs_len - 1 {
+                    ScheduledCompactionTask {
+                        options: job,
+                        // The last job in the queue sends the signal and releases the gc guard
+                        result_tx: next_scheduled_compaction_task
+                            .result_tx
+                            .take(),
+                        gc_block: next_scheduled_compaction_task
+                            .gc_block
+                            .take(),
+                    }
+                } else {
+                    ScheduledCompactionTask {
+                        options: job,
+                        result_tx: None,
+                        gc_block: None,
+                    }
                 });
             }
             info!("scheduled enhanced gc bottom-most compaction with sub-compaction, split into {} jobs", jobs_len);
@@ -3095,15 +3104,22 @@ impl Tenant {
         &self,
         timeline_id: TimelineId,
         options: CompactOptions,
-    ) -> tokio::sync::oneshot::Receiver<()> {
+    ) -> anyhow::Result<tokio::sync::oneshot::Receiver<()>> {
+        let gc_guard = match self.gc_block.start().await {
+            Ok(guard) => guard,
+            Err(e) => {
+                bail!("cannot run gc-compaction because gc is blocked: {}", e);
+            }
+        };
         let (tx, rx) = tokio::sync::oneshot::channel();
         let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
         let tline_pending_tasks = guard.entry(timeline_id).or_default();
         tline_pending_tasks.push_back(ScheduledCompactionTask {
             options,
             result_tx: Some(tx),
+            gc_block: Some(gc_guard),
         });
-        rx
+        Ok(rx)
     }
 
     // Call through to all timelines to freeze ephemeral layers if needed. Usually
@@ -8150,6 +8166,12 @@ mod tests {
         )
         .await?;
         {
+            tline
+                .latest_gc_cutoff_lsn
+                .lock_for_write()
+                .store_and_unlock(Lsn(0x30))
+                .wait()
+                .await;
             // Update GC info
             let mut guard = tline.gc_info.write().unwrap();
             guard.cutoffs.time = Lsn(0x30);
@@ -8252,6 +8274,12 @@ mod tests {
 
         // increase GC horizon and compact again
         {
+            tline
+                .latest_gc_cutoff_lsn
+                .lock_for_write()
+                .store_and_unlock(Lsn(0x40))
+                .wait()
+                .await;
             // Update GC info
             let mut guard = tline.gc_info.write().unwrap();
             guard.cutoffs.time = Lsn(0x40);
@@ -8632,6 +8660,12 @@ mod tests {
             .await?
         };
         {
+            tline
+                .latest_gc_cutoff_lsn
+                .lock_for_write()
+                .store_and_unlock(Lsn(0x30))
+                .wait()
+                .await;
             // Update GC info
             let mut guard = tline.gc_info.write().unwrap();
             *guard = GcInfo {
@@ -8713,6 +8747,12 @@ mod tests {
 
         // increase GC horizon and compact again
         {
+            tline
+                .latest_gc_cutoff_lsn
+                .lock_for_write()
+                .store_and_unlock(Lsn(0x40))
+                .wait()
+                .await;
             // Update GC info
             let mut guard = tline.gc_info.write().unwrap();
             guard.cutoffs.time = Lsn(0x40);
@@ -9160,6 +9200,12 @@ mod tests {
         )
         .await?;
         {
+            tline
+                .latest_gc_cutoff_lsn
+                .lock_for_write()
+                .store_and_unlock(Lsn(0x30))
+                .wait()
+                .await;
             // Update GC info
             let mut guard = tline.gc_info.write().unwrap();
             *guard = GcInfo {
@@ -9302,6 +9348,12 @@ mod tests {
 
         // increase GC horizon and compact again
        {
+            tline
+                .latest_gc_cutoff_lsn
+                .lock_for_write()
+                .store_and_unlock(Lsn(0x38))
+                .wait()
+                .await;
             // Update GC info
             let mut guard = tline.gc_info.write().unwrap();
             guard.cutoffs.time = Lsn(0x38);
@@ -9397,6 +9449,12 @@ mod tests {
         )
         .await?;
         {
+            tline
+                .latest_gc_cutoff_lsn
+                .lock_for_write()
+                .store_and_unlock(Lsn(0x30))
+                .wait()
+                .await;
             // Update GC info
             let mut guard = tline.gc_info.write().unwrap();
             *guard = GcInfo {
@@ -9641,6 +9699,12 @@ mod tests {
 
         branch_tline.add_extra_test_dense_keyspace(KeySpace::single(get_key(0)..get_key(10)));
         {
+            parent_tline
+                .latest_gc_cutoff_lsn
+                .lock_for_write()
+                .store_and_unlock(Lsn(0x10))
+                .wait()
+                .await;
             // Update GC info
             let mut guard = parent_tline.gc_info.write().unwrap();
             *guard = GcInfo {
@@ -9655,6 +9719,12 @@ mod tests {
         }
 
         {
+            branch_tline
+                .latest_gc_cutoff_lsn
+                .lock_for_write()
+                .store_and_unlock(Lsn(0x50))
+                .wait()
+                .await;
             // Update GC info
             let mut guard = branch_tline.gc_info.write().unwrap();
             *guard = GcInfo {
@@ -9984,6 +10054,12 @@ mod tests {
         .await?;
 
         {
+            tline
+                .latest_gc_cutoff_lsn
+                .lock_for_write()
+                .store_and_unlock(Lsn(0x30))
+                .wait()
+                .await;
             // Update GC info
             let mut guard = tline.gc_info.write().unwrap();
             *guard = GcInfo {
diff --git a/pageserver/src/tenant/gc_block.rs b/pageserver/src/tenant/gc_block.rs
index 373779ddb882..af73acb2be86 100644
--- a/pageserver/src/tenant/gc_block.rs
+++ b/pageserver/src/tenant/gc_block.rs
@@ -1,4 +1,4 @@
-use std::collections::HashMap;
+use std::{collections::HashMap, sync::Arc};
 
 use utils::id::TimelineId;
 
@@ -20,7 +20,7 @@ pub(crate) struct GcBlock {
     /// Do not add any more features taking and forbidding taking this lock. It should be
     /// `tokio::sync::Notify`, but that is rarely used. On the other side, [`GcBlock::insert`]
     /// synchronizes with gc attempts by locking and unlocking this mutex.
-    blocking: tokio::sync::Mutex<()>,
+    blocking: Arc<tokio::sync::Mutex<()>>,
 }
 
 impl GcBlock {
@@ -30,7 +30,7 @@ impl GcBlock {
     /// it's ending, or if not currently possible, a value describing the reasons why not.
     ///
     /// Cancellation safe.
-    pub(super) async fn start(&self) -> Result<Guard<'_>, BlockingReasons> {
+    pub(super) async fn start(&self) -> Result<Guard, BlockingReasons> {
         let reasons = {
             let g = self.reasons.lock().unwrap();
@@ -44,7 +44,7 @@
             Err(reasons)
         } else {
             Ok(Guard {
-                _inner: self.blocking.lock().await,
+                _inner: self.blocking.clone().lock_owned().await,
             })
         }
     }
@@ -170,8 +170,8 @@ impl GcBlock {
     }
 }
 
-pub(super) struct Guard<'a> {
-    _inner: tokio::sync::MutexGuard<'a, ()>,
+pub(crate) struct Guard {
+    _inner: tokio::sync::OwnedMutexGuard<()>,
 }
 
 #[derive(Debug)]
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index a18e157d37b4..fa924d23b01c 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -41,7 +41,7 @@ use crate::tenant::storage_layer::{
 use crate::tenant::timeline::ImageLayerCreationOutcome;
 use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter};
 use crate::tenant::timeline::{Layer, ResidentLayer};
-use crate::tenant::{DeltaLayer, MaybeOffloaded};
+use crate::tenant::{gc_block, DeltaLayer, MaybeOffloaded};
 use crate::virtual_file::{MaybeFatalIo, VirtualFile};
 use pageserver_api::config::tenant_conf_defaults::{
     DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD,
@@ -63,9 +63,12 @@ use super::CompactionError;
 const COMPACTION_DELTA_THRESHOLD: usize = 5;
 
 /// A scheduled compaction task.
-pub struct ScheduledCompactionTask {
+pub(crate) struct ScheduledCompactionTask {
     pub options: CompactOptions,
+    /// The channel to send the compaction result. If this is a subcompaction, the last compaction job holds the sender.
     pub result_tx: Option<tokio::sync::oneshot::Sender<()>>,
+    /// Holds the GC block. If this is a subcompaction, the last compaction job holds the gc block guard.
+    pub gc_block: Option<gc_block::Guard>,
 }
 
 pub struct GcCompactionJobDescription {
@@ -1768,8 +1771,7 @@ impl Timeline {
         let compact_below_lsn = if let Some(compact_below_lsn) = options.compact_below_lsn {
             compact_below_lsn
         } else {
-            let gc_info = self.gc_info.read().unwrap();
-            gc_info.cutoffs.select_min() // use the real gc cutoff
+            *self.get_latest_gc_cutoff_lsn() // use the real gc cutoff
         };
         let mut compact_jobs = Vec::new();
         // For now, we simply use the key partitioning information; we should do a more fine-grained partitioning
@@ -1962,7 +1964,11 @@ impl Timeline {
         let gc_info = self.gc_info.read().unwrap();
         let mut retain_lsns_below_horizon = Vec::new();
         let gc_cutoff = {
-            let real_gc_cutoff = gc_info.cutoffs.select_min();
+            // Currently, gc-compaction only kicks in after the legacy gc has updated the gc_cutoff.
+            // Therefore, it can only clean up data that cannot be cleaned up with legacy gc, instead of
+            // cleaning everything that it theoretically could. In the future, it should use `self.gc_info`
+            // as the source of truth for the cutoff.
+            let real_gc_cutoff = *self.get_latest_gc_cutoff_lsn();
             // The compaction algorithm will keep all keys above the gc_cutoff while keeping only necessary keys below the gc_cutoff for
             // each of the retain_lsn. Therefore, if the user-provided `compact_below_lsn` is larger than the real gc cutoff, we will use
             // the real cutoff.
diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py
index 881503046ce3..810a9723e0e4 100644
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -121,9 +121,6 @@ def test_pageserver_compaction_smoke(
     assert vectored_average < 8
 
 
-@pytest.mark.skip(
-    "This is being fixed and tracked in https://github.com/neondatabase/neon/issues/9114"
-)
 @skip_in_debug_build("only run with release build")
 def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder):
     SMOKE_CONF = {
@@ -156,20 +153,20 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder):
         if i % 10 == 0:
             log.info(f"Running churn round {i}/{churn_rounds} ...")
 
-        ps_http.timeline_compact(
-            tenant_id,
-            timeline_id,
-            enhanced_gc_bottom_most_compaction=True,
-            body={
-                "scheduled": True,
-                "sub_compaction": True,
-                "compact_range": {
-                    "start": "000000000000000000000000000000000000",
-                    # skip the SLRU range for now -- it races with get-lsn-by-timestamp, TODO: fix this
-                    "end": "010000000000000000000000000000000000",
+            # Run gc-compaction every 10 rounds so that the test doesn't take too long.
+            ps_http.timeline_compact(
+                tenant_id,
+                timeline_id,
+                enhanced_gc_bottom_most_compaction=True,
+                body={
+                    "scheduled": True,
+                    "sub_compaction": True,
+                    "compact_range": {
+                        "start": "000000000000000000000000000000000000",
+                        "end": "030000000000000000000000000000000000",
+                    },
                 },
-            },
-        )
+            )
 
         workload.churn_rows(row_count, env.pageserver.id)
@@ -181,6 +178,10 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder):
     log.info("Validating at workload end ...")
     workload.validate(env.pageserver.id)
 
+    # Run a legacy compaction+gc to ensure gc-compaction can coexist with legacy compaction.
+    ps_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True)
+    ps_http.timeline_gc(tenant_id, timeline_id, None)
+
 
 # Stripe sizes in number of pages.
 TINY_STRIPES = 16