diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs index 3efad546a6f0..ffe607be4b7c 100644 --- a/pageserver/benches/bench_walredo.rs +++ b/pageserver/benches/bench_walredo.rs @@ -27,25 +27,25 @@ //! //! # Reference Numbers //! -//! 2024-03-20 on i3en.3xlarge +//! 2024-04-04 on i3en.3xlarge //! //! ```text -//! short/1 time: [26.483 µs 26.614 µs 26.767 µs] -//! short/2 time: [32.223 µs 32.465 µs 32.767 µs] -//! short/4 time: [47.203 µs 47.583 µs 47.984 µs] -//! short/8 time: [89.135 µs 89.612 µs 90.139 µs] -//! short/16 time: [190.12 µs 191.52 µs 192.88 µs] -//! short/32 time: [380.96 µs 382.63 µs 384.20 µs] -//! short/64 time: [736.86 µs 741.07 µs 745.03 µs] -//! short/128 time: [1.4106 ms 1.4206 ms 1.4294 ms] -//! medium/1 time: [111.81 µs 112.25 µs 112.79 µs] -//! medium/2 time: [158.26 µs 159.13 µs 160.21 µs] -//! medium/4 time: [334.65 µs 337.14 µs 340.07 µs] -//! medium/8 time: [675.32 µs 679.91 µs 685.25 µs] -//! medium/16 time: [1.2929 ms 1.2996 ms 1.3067 ms] -//! medium/32 time: [2.4295 ms 2.4461 ms 2.4623 ms] -//! medium/64 time: [4.3973 ms 4.4458 ms 4.4875 ms] -//! medium/128 time: [7.5955 ms 7.7847 ms 7.9481 ms] +//! short/1 time: [25.925 µs 26.060 µs 26.209 µs] +//! short/2 time: [31.277 µs 31.483 µs 31.722 µs] +//! short/4 time: [45.496 µs 45.831 µs 46.182 µs] +//! short/8 time: [84.298 µs 84.920 µs 85.566 µs] +//! short/16 time: [185.04 µs 186.41 µs 187.88 µs] +//! short/32 time: [385.01 µs 386.77 µs 388.70 µs] +//! short/64 time: [770.24 µs 773.04 µs 776.04 µs] +//! short/128 time: [1.5017 ms 1.5064 ms 1.5113 ms] +//! medium/1 time: [106.65 µs 107.20 µs 107.85 µs] +//! medium/2 time: [153.28 µs 154.24 µs 155.56 µs] +//! medium/4 time: [325.67 µs 327.01 µs 328.71 µs] +//! medium/8 time: [646.82 µs 650.17 µs 653.91 µs] +//! medium/16 time: [1.2645 ms 1.2701 ms 1.2762 ms] +//! medium/32 time: [2.4409 ms 2.4550 ms 2.4692 ms] +//! medium/64 time: [4.6814 ms 4.7114 ms 4.7408 ms] +//! 
medium/128 time: [8.7790 ms 8.9037 ms 9.0282 ms] //! ``` use bytes::{Buf, Bytes}; diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 0004f4f3c968..ca41a576fd36 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -36,11 +36,12 @@ use bytes::{Bytes, BytesMut}; use pageserver_api::key::key_to_rel_block; use pageserver_api::models::WalRedoManagerStatus; use pageserver_api::shard::TenantShardId; -use std::sync::{Arc, RwLock}; +use std::sync::Arc; use std::time::Duration; use std::time::Instant; use tracing::*; use utils::lsn::Lsn; +use utils::sync::heavier_once_cell; /// /// This is the real implementation that uses a Postgres process to @@ -53,7 +54,19 @@ pub struct PostgresRedoManager { tenant_shard_id: TenantShardId, conf: &'static PageServerConf, last_redo_at: std::sync::Mutex>, - redo_process: RwLock>>, + /// The current [`process::WalRedoProcess`] that is used by new redo requests. + /// We use [`heavier_once_cell`] for coalescing the spawning, but the redo + /// requests don't use the [`heavier_once_cell::Guard`] to keep ahold of + /// their process object; we use [`Arc::clone`] for that. + /// This is primarily because earlier implementations that didn't use [`heavier_once_cell`] + /// had that behavior; it's probably unnecessary. + /// The only merit of it is that if one walredo process encounters an error, + /// it can take it out of rotation (= using [`heavier_once_cell::Guard::take_and_deinit`]) + /// and retry redo, thereby starting the new process, while other redo tasks might + /// still be using the old redo process. But, those other tasks will most likely + /// encounter an error as well, and errors are an unexpected condition anyway. + /// So, probably we could get rid of the `Arc` in the future. 
+ redo_process: heavier_once_cell::OnceCell>, } /// @@ -101,6 +114,7 @@ impl PostgresRedoManager { self.conf.wal_redo_timeout, pg_version, ) + .await }; img = Some(result?); @@ -121,6 +135,7 @@ impl PostgresRedoManager { self.conf.wal_redo_timeout, pg_version, ) + .await } } @@ -134,7 +149,7 @@ impl PostgresRedoManager { chrono::Utc::now().checked_sub_signed(chrono::Duration::from_std(age).ok()?) }) }, - pid: self.redo_process.read().unwrap().as_ref().map(|p| p.id()), + pid: self.redo_process.get().map(|p| p.id()), }) } } @@ -152,7 +167,7 @@ impl PostgresRedoManager { tenant_shard_id, conf, last_redo_at: std::sync::Mutex::default(), - redo_process: RwLock::new(None), + redo_process: heavier_once_cell::OnceCell::default(), } } @@ -164,8 +179,7 @@ impl PostgresRedoManager { if let Some(last_redo_at) = *g { if last_redo_at.elapsed() >= idle_timeout { drop(g); - let mut guard = self.redo_process.write().unwrap(); - *guard = None; + drop(self.redo_process.get().map(|guard| guard.take_and_deinit())); } } } @@ -174,8 +188,11 @@ impl PostgresRedoManager { /// /// Process one request for WAL redo using wal-redo postgres /// + /// # Cancel-Safety + /// + /// Cancellation safe. 
#[allow(clippy::too_many_arguments)] - fn apply_batch_postgres( + async fn apply_batch_postgres( &self, key: Key, lsn: Lsn, @@ -191,42 +208,31 @@ impl PostgresRedoManager { const MAX_RETRY_ATTEMPTS: u32 = 1; let mut n_attempts = 0u32; loop { - // launch the WAL redo process on first use - let proc: Arc = { - let proc_guard = self.redo_process.read().unwrap(); - match &*proc_guard { - None => { - // "upgrade" to write lock to launch the process - drop(proc_guard); - let mut proc_guard = self.redo_process.write().unwrap(); - match &*proc_guard { - None => { - let start = Instant::now(); - let proc = Arc::new( - process::WalRedoProcess::launch( - self.conf, - self.tenant_shard_id, - pg_version, - ) - .context("launch walredo process")?, - ); - let duration = start.elapsed(); - WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM - .observe(duration.as_secs_f64()); - info!( - duration_ms = duration.as_millis(), - pid = proc.id(), - "launched walredo process" - ); - *proc_guard = Some(Arc::clone(&proc)); - proc - } - Some(proc) => Arc::clone(proc), - } + let proc: Arc = + match self.redo_process.get_or_init_detached().await { + Ok(guard) => Arc::clone(&guard), + Err(permit) => { + // don't hold poison_guard, the launch code can bail + let start = Instant::now(); + let proc = Arc::new( + process::WalRedoProcess::launch( + self.conf, + self.tenant_shard_id, + pg_version, + ) + .context("launch walredo process")?, + ); + let duration = start.elapsed(); + WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64()); + info!( + duration_ms = duration.as_millis(), + pid = proc.id(), + "launched walredo process" + ); + self.redo_process.set(Arc::clone(&proc), permit); + proc } - Some(proc) => Arc::clone(proc), - } - }; + }; let started_at = std::time::Instant::now(); @@ -272,34 +278,34 @@ impl PostgresRedoManager { n_attempts, e, ); - // Avoid concurrent callers hitting the same issue. - // We can't prevent it from happening because we want to enable parallelism. 
- { - let mut guard = self.redo_process.write().unwrap(); - match &*guard { - Some(current_field_value) => { - if Arc::ptr_eq(current_field_value, &proc) { - // We're the first to observe an error from `proc`, it's our job to take it out of rotation. - *guard = None; - } - } - None => { - // Another thread was faster to observe the error, and already took the process out of rotation. - } - } - } + // Avoid concurrent callers hitting the same issue by taking `proc` out of the rotation. + // Note that there may be other tasks concurrent with us that also hold `proc`. + // We have to deal with that here. + // Also read the doc comment on field `self.redo_process`. + // // NB: there may still be other concurrent threads using `proc`. // The last one will send SIGKILL when the underlying Arc reaches refcount 0. - // NB: it's important to drop(proc) after drop(guard). Otherwise we'd keep - // holding the lock while waiting for the process to exit. - // NB: the drop impl blocks the current threads with a wait() system call for - // the child process. We dropped the `guard` above so that other threads aren't - // affected. But, it's good that the current thread _does_ block to wait. - // If we instead deferred the waiting into the background / to tokio, it could - // happen that if walredo always fails immediately, we spawn processes faster + // + // NB: the drop impl blocks the dropping thread with a wait() system call for + // the child process. In some ways the blocking is actually good: if we + // deferred the waiting into the background / to tokio if we used `tokio::process`, + // it could happen that if walredo always fails immediately, we spawn processes faster // than we can SIGKILL & `wait` for them to exit. By doing it the way we do here, // we limit this risk of run-away to at most $num_runtimes * $num_executor_threads. // This probably needs revisiting at some later point. 
+ match self.redo_process.get() { + None => (), + Some(guard) => { + if Arc::ptr_eq(&proc, &*guard) { + // We're the first to observe an error from `proc`, it's our job to take it out of rotation. + guard.take_and_deinit(); + } else { + // Another task already spawned another redo process (further up in this method) + // and put it into `redo_process`. Do nothing, our view of the world is behind. + } + } + } + // The last task that does this `drop()` of `proc` will do a blocking `wait()` syscall. drop(proc); } else if n_attempts != 0 { info!(n_attempts, "retried walredo succeeded");