From 196c0cffffe892b07b821f363e8fbe57674bb08e Mon Sep 17 00:00:00 2001
From: Alex Chi Z
Date: Sun, 8 Dec 2024 15:42:37 -0500
Subject: [PATCH] fix(pageserver): fix gc-compaction racing with legacy gc

Signed-off-by: Alex Chi Z
---
 pageserver/src/tenant.rs                      | 36 +++++++++++++------
 pageserver/src/tenant/gc_block.rs             | 12 +++----
 .../tenant/remote_timeline_client/index.rs    |  1 +
 pageserver/src/tenant/timeline/compaction.rs  |  5 ++-
 test_runner/regress/test_compaction.py        |  6 +---
 5 files changed, 38 insertions(+), 22 deletions(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 4a9c44aefdbc6..10703f5e602ee 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3028,14 +3028,23 @@ impl Tenant {
             let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
             let tline_pending_tasks = guard.entry(*timeline_id).or_default();
             for (idx, job) in jobs.into_iter().enumerate() {
-                tline_pending_tasks.push_back(ScheduledCompactionTask {
-                    options: job,
-                    result_tx: if idx == jobs_len - 1 {
-                        // The last compaction job sends the completion signal
-                        next_scheduled_compaction_task.result_tx.take()
-                    } else {
-                        None
-                    },
+                tline_pending_tasks.push_back(if idx == jobs_len - 1 {
+                    ScheduledCompactionTask {
+                        options: job,
+                        // The last job in the queue sends the signal and releases the gc guard
+                        result_tx: next_scheduled_compaction_task
+                            .result_tx
+                            .take(),
+                        gc_block: next_scheduled_compaction_task
+                            .gc_block
+                            .take(),
+                    }
+                } else {
+                    ScheduledCompactionTask {
+                        options: job,
+                        result_tx: None,
+                        gc_block: None,
+                    }
                 });
             }
             info!("scheduled enhanced gc bottom-most compaction with sub-compaction, split into {} jobs", jobs_len);
@@ -3095,15 +3104,22 @@
         &self,
         timeline_id: TimelineId,
         options: CompactOptions,
-    ) -> tokio::sync::oneshot::Receiver<()> {
+    ) -> anyhow::Result<tokio::sync::oneshot::Receiver<()>> {
+        let gc_guard = match self.gc_block.start().await {
+            Ok(guard) => guard,
+            Err(e) => {
+                bail!("cannot run gc-compaction because gc is blocked: {}", e);
+            }
+        };
         let (tx, rx) = tokio::sync::oneshot::channel();
         let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
         let tline_pending_tasks = guard.entry(timeline_id).or_default();
         tline_pending_tasks.push_back(ScheduledCompactionTask {
             options,
             result_tx: Some(tx),
+            gc_block: Some(gc_guard),
         });
-        rx
+        Ok(rx)
     }
 
     // Call through to all timelines to freeze ephemeral layers if needed.  Usually
diff --git a/pageserver/src/tenant/gc_block.rs b/pageserver/src/tenant/gc_block.rs
index 373779ddb882c..d002e19c1ba6d 100644
--- a/pageserver/src/tenant/gc_block.rs
+++ b/pageserver/src/tenant/gc_block.rs
@@ -1,4 +1,4 @@
-use std::collections::HashMap;
+use std::{collections::HashMap, sync::Arc};
 
 use utils::id::TimelineId;
 
@@ -20,7 +20,7 @@ pub(crate) struct GcBlock {
     /// Do not add any more features taking and forbidding taking this lock. It should be
     /// `tokio::sync::Notify`, but that is rarely used. On the other side, [`GcBlock::insert`]
     /// synchronizes with gc attempts by locking and unlocking this mutex.
-    blocking: tokio::sync::Mutex<()>,
+    blocking: Arc<tokio::sync::Mutex<()>>,
 }
 
 impl GcBlock {
@@ -30,7 +30,7 @@ impl GcBlock {
     /// it's ending, or if not currently possible, a value describing the reasons why not.
     ///
     /// Cancellation safe.
-    pub(super) async fn start(&self) -> Result<Guard<'_>, BlockingReasons> {
+    pub(super) async fn start(&self) -> Result<Guard, BlockingReasons> {
         let reasons = {
             let g = self.reasons.lock().unwrap();
 
@@ -44,7 +44,7 @@
             Err(reasons)
         } else {
             Ok(Guard {
-                _inner: self.blocking.lock().await,
+                _inner: self.blocking.clone().lock_owned().await,
             })
         }
     }
@@ -170,8 +170,8 @@
     }
 }
 
-pub(super) struct Guard<'a> {
-    _inner: tokio::sync::MutexGuard<'a, ()>,
+pub(super) struct Guard {
+    _inner: tokio::sync::OwnedMutexGuard<()>,
 }
 
 #[derive(Debug)]
diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs
index 506990fb2fa4e..66e3e47d1b102 100644
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -301,6 +301,7 @@ pub(crate) struct GcBlocking {
 pub(crate) enum GcBlockingReason {
     Manual,
     DetachAncestor,
+    GcCompaction,
 }
 
 impl GcBlocking {
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index a18e157d37b48..d4cd40e6de375 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -41,7 +41,7 @@ use crate::tenant::storage_layer::{
 use crate::tenant::timeline::ImageLayerCreationOutcome;
 use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter};
 use crate::tenant::timeline::{Layer, ResidentLayer};
-use crate::tenant::{DeltaLayer, MaybeOffloaded};
+use crate::tenant::{gc_block, DeltaLayer, MaybeOffloaded};
 use crate::virtual_file::{MaybeFatalIo, VirtualFile};
 use pageserver_api::config::tenant_conf_defaults::{
     DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD,
@@ -65,7 +65,10 @@ const COMPACTION_DELTA_THRESHOLD: usize = 5;
 /// A scheduled compaction task.
 pub struct ScheduledCompactionTask {
     pub options: CompactOptions,
+    /// The channel to send the compaction result. If this is a subcompaction, the last compaction job holds the sender.
     pub result_tx: Option<tokio::sync::oneshot::Sender<()>>,
+    /// Hold the GC block. If this is a subcompaction, the last compaction job holds the gc block guard.
+    pub gc_block: Option<gc_block::Guard>,
 }
 
 pub struct GcCompactionJobDescription {
diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py
index 881503046ce3b..3473ad2008568 100644
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -121,9 +121,6 @@ def test_pageserver_compaction_smoke(
     assert vectored_average < 8
 
 
-@pytest.mark.skip(
-    "This is being fixed and tracked in https://github.com/neondatabase/neon/issues/9114"
-)
 @skip_in_debug_build("only run with release build")
 def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder):
     SMOKE_CONF = {
@@ -165,8 +162,7 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder):
             "sub_compaction": True,
             "compact_range": {
                 "start": "000000000000000000000000000000000000",
-                # skip the SLRU range for now -- it races with get-lsn-by-timestamp, TODO: fix this
-                "end": "010000000000000000000000000000000000",
+                "end": "030000000000000000000000000000000000",
            },
        },
    )
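
The core of this fix is an ownership change: GcBlock::start() now returns a guard that owns the lock (tokio::sync::OwnedMutexGuard, taken from an Arc<tokio::sync::Mutex<()>>) instead of one borrowing from &self, so the guard can be stashed inside a queued ScheduledCompactionTask and dropped only when the last sub-compaction job completes, keeping legacy gc blocked for the whole batch. Below is a minimal, self-contained sketch of that pattern; the GcBlock/Guard shapes mirror the patch, but the Job queue and function names are illustrative, not the actual pageserver implementation.

    use std::sync::Arc;

    // Sketch: a gc blocker whose guard owns the mutex, mirroring the
    // `Guard<'a>` -> `Guard` change in gc_block.rs.
    struct GcBlock {
        blocking: Arc<tokio::sync::Mutex<()>>,
    }

    struct Guard {
        // OwnedMutexGuard carries no lifetime, so it can be stored in a
        // queued job and outlive the `&self` borrow that created it.
        _inner: tokio::sync::OwnedMutexGuard<()>,
    }

    impl GcBlock {
        async fn start(&self) -> Guard {
            Guard {
                // Clone the Arc, then take an owned lock on it.
                _inner: self.blocking.clone().lock_owned().await,
            }
        }

        async fn run_gc_iteration(&self) {
            // Legacy gc synchronizes on the same mutex: it cannot proceed
            // while a gc-compaction guard is alive.
            let _g = self.blocking.lock().await;
            println!("gc iteration ran");
        }
    }

    // Hypothetical stand-in for ScheduledCompactionTask: only the last
    // job of a sub-compaction batch carries the guard.
    struct Job {
        name: &'static str,
        gc_block: Option<Guard>,
    }

    #[tokio::main]
    async fn main() {
        let gc = GcBlock {
            blocking: Arc::new(tokio::sync::Mutex::new(())),
        };

        let guard = gc.start().await;
        let jobs = vec![
            Job { name: "sub-compaction 1/2", gc_block: None },
            Job { name: "sub-compaction 2/2", gc_block: Some(guard) },
        ];

        for job in jobs {
            println!("running {}", job.name);
            // The guard (if any) is dropped here with the job, so gc is
            // unblocked exactly when the last job finishes.
            drop(job);
        }

        gc.run_gc_iteration().await; // now acquires the lock immediately
    }

If a gc iteration were already running when start() is called, lock_owned().await would simply wait for it to finish, which is the same synchronization the patched scheduling path relies on before queueing gc-compaction jobs.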