From 48b22bd057070f10fda9e4c8a7f5326441617b22 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 20 Mar 2024 15:01:14 +0000 Subject: [PATCH 1/6] walredo: better benchmark --- pageserver/benches/bench_walredo.rs | 317 +++++++++++++--------------- 1 file changed, 144 insertions(+), 173 deletions(-) diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs index 47c8bd75c63a..72c0f2c6c080 100644 --- a/pageserver/benches/bench_walredo.rs +++ b/pageserver/benches/bench_walredo.rs @@ -1,160 +1,128 @@ -//! Simple benchmarking around walredo. +//! Quantify a single walredo manager's throughput under N concurrent callers. //! -//! Right now they hope to just set a baseline. Later we can try to expand into latency and -//! throughput after figuring out the coordinated omission problems below. +//! The benchmark implementation ([`bench_impl`]) is parametrized by +//! - `redo_work` => [`Request::short_request`] or [`Request::medium_request`] +//! - `n_redos` => number of times the benchmark shell execute the `redo_work` +//! - `nclients` => number of clients (more on this shortly). //! -//! There are two sets of inputs; `short` and `medium`. They were collected on postgres v14 by -//! logging what happens when a sequential scan is requested on a small table, then picking out two -//! suitable from logs. +//! The benchmark impl sets up a multi-threaded tokio runtime with default parameters. +//! It spawns `nclients` times [`client`] tokio tasks. +//! Each task executes the `redo_work` `n_redos/nclients` times. //! +//! We exercise the following combinations: +//! - `redo_work = short / medium`` +//! - `nclients = [1, 2, 4, 8, 16, 32, 64, 128]` //! -//! Reference data (git blame to see commit) on an i3en.3xlarge -// ```text -//! short/short/1 time: [39.175 µs 39.348 µs 39.536 µs] -//! short/short/2 time: [51.227 µs 51.487 µs 51.755 µs] -//! short/short/4 time: [76.048 µs 76.362 µs 76.674 µs] -//! 
short/short/8 time: [128.94 µs 129.82 µs 130.74 µs] -//! short/short/16 time: [227.84 µs 229.00 µs 230.28 µs] -//! short/short/32 time: [455.97 µs 457.81 µs 459.90 µs] -//! short/short/64 time: [902.46 µs 904.84 µs 907.32 µs] -//! short/short/128 time: [1.7416 ms 1.7487 ms 1.7561 ms] -//! `` - -use std::sync::Arc; +//! We let `criterion` determine the `n_redos` using `iter_custom`. +//! The idea is that for each `(redo_work, nclients)` combination, +//! criterion will run the `bench_impl` multiple times with different `n_redos`. +//! The `bench_impl` reports the aggregate wall clock time from the clients' perspective. +//! Criterion will divide that by `n_redos` to compute the "time per iteration". +//! In our case, "time per iteration" means "time per redo_work execution". +//! +//! NB: the way by which `iter_custom` determines the "number of iterations" +//! is called sampling. Apparently the idea here is to detect outliers. +//! We're not sure whether the current choice of sampling method makes sense. +//! 
See https://bheisler.github.io/criterion.rs/book/user_guide/command_line_output.html#collecting-samples use bytes::{Buf, Bytes}; -use pageserver::{ - config::PageServerConf, repository::Key, walrecord::NeonWalRecord, walredo::PostgresRedoManager, +use criterion::{BenchmarkId, Criterion}; +use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager}; +use pageserver_api::{key::Key, shard::TenantShardId}; +use std::{ + sync::Arc, + time::{Duration, Instant}, }; -use pageserver_api::shard::TenantShardId; -use tokio::task::JoinSet; +use tokio::{sync::Barrier, task::JoinSet}; use utils::{id::TenantId, lsn::Lsn}; -use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion}; +fn bench(c: &mut Criterion) { + { + let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; + for nclients in nclients { + let mut group = c.benchmark_group("short"); + group.bench_with_input( + BenchmarkId::from_parameter(nclients), + &nclients, + |b, nclients| { + let redo_work = Arc::new(Request::short_input()); + b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients)); + }, + ); + } + } -fn redo_scenarios(c: &mut Criterion) { - // logging should be enabled when adding more inputs, since walredo will only report malformed - // input to the stderr. - // utils::logging::init(utils::logging::LogFormat::Plain).unwrap(); + { + let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; + for nclients in nclients { + let mut group = c.benchmark_group("medium"); + group.bench_with_input( + BenchmarkId::from_parameter(nclients), + &nclients, + |b, nclients| { + let redo_work = Arc::new(Request::medium_input()); + b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients)); + }, + ); + } + } +} +criterion::criterion_group!(benches, bench); +criterion::criterion_main!(benches); +// Returns the sum of each client's wall-clock time spent executing their share of the n_redos. 
+fn bench_impl(redo_work: Arc, n_redos: u64, nclients: u64) -> Duration { let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap(); let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf()); let conf = Box::leak(Box::new(conf)); let tenant_shard_id = TenantShardId::unsharded(TenantId::generate()); - let manager = PostgresRedoManager::new(conf, tenant_shard_id); - - let manager = Arc::new(manager); - - { - let rt = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .unwrap(); - tracing::info!("executing first"); - rt.block_on(short().execute(&manager)).unwrap(); - tracing::info!("first executed"); - } - - let thread_counts = [1, 2, 4, 8, 16, 32, 64, 128]; - - let mut group = c.benchmark_group("short"); - group.sampling_mode(criterion::SamplingMode::Flat); - - for thread_count in thread_counts { - group.bench_with_input( - BenchmarkId::new("short", thread_count), - &thread_count, - |b, thread_count| { - add_multithreaded_walredo_requesters(b, *thread_count, &manager, short); - }, - ); - } - drop(group); - - let mut group = c.benchmark_group("medium"); - group.sampling_mode(criterion::SamplingMode::Flat); - - for thread_count in thread_counts { - group.bench_with_input( - BenchmarkId::new("medium", thread_count), - &thread_count, - |b, thread_count| { - add_multithreaded_walredo_requesters(b, *thread_count, &manager, medium); - }, - ); - } - drop(group); -} - -/// Sets up a multi-threaded tokio runtime with default worker thread count, -/// then, spawn `requesters` tasks that repeatedly: -/// - get input from `input_factor()` -/// - call `manager.request_redo()` with their input -/// -/// This stress-tests the scalability of a single walredo manager at high tokio-level concurrency. -/// -/// Using tokio's default worker thread count means the results will differ on machines -/// with different core countrs. We don't care about that, the performance will always -/// be different on different hardware. 
To compare performance of different software versions, -/// use the same hardware. -fn add_multithreaded_walredo_requesters( - b: &mut criterion::Bencher, - nrequesters: usize, - manager: &Arc, - input_factory: fn() -> Request, -) { - assert_ne!(nrequesters, 0); - let rt = tokio::runtime::Builder::new_multi_thread() .enable_all() .build() .unwrap(); - let barrier = Arc::new(tokio::sync::Barrier::new(nrequesters + 1)); + let start = Arc::new(Barrier::new(nclients as usize)); - let mut requesters = JoinSet::new(); - for _ in 0..nrequesters { - let _entered = rt.enter(); - let manager = manager.clone(); - let barrier = barrier.clone(); - requesters.spawn(async move { - loop { - let input = input_factory(); - barrier.wait().await; - let page = input.execute(&manager).await.unwrap(); - assert_eq!(page.remaining(), 8192); - barrier.wait().await; - } - }); - } + let mut tasks = JoinSet::new(); - let do_one_iteration = || { - rt.block_on(async { - barrier.wait().await; - // wait for work to complete - barrier.wait().await; - }) - }; + let manager = PostgresRedoManager::new(conf, tenant_shard_id); + let manager = Arc::new(manager); - b.iter_batched( - || { - // warmup - do_one_iteration(); - }, - |()| { - // work loop - do_one_iteration(); - }, - criterion::BatchSize::PerIteration, - ); + for _ in 0..nclients { + rt.block_on(async { + tasks.spawn(client( + Arc::clone(&manager), + Arc::clone(&start), + Arc::clone(&redo_work), + // divide the amount of work equally among the clients + n_redos / nclients, + )) + }); + } - rt.block_on(requesters.shutdown()); + let mut total_wallclock_time = std::time::Duration::from_millis(0); + while let Some(res) = rt.block_on(tasks.join_next()) { + total_wallclock_time += res.unwrap(); + } + total_wallclock_time } -criterion_group!(benches, redo_scenarios); -criterion_main!(benches); +async fn client( + mgr: Arc, + start: Arc, + redo_work: Arc, + n_redos: u64, +) -> Duration { + start.wait().await; + let start = Instant::now(); + for _ in 
0..n_redos { + let page = redo_work.execute(&mgr).await.unwrap(); + assert_eq!(page.remaining(), 8192); + } + start.elapsed() +} macro_rules! lsn { ($input:expr) => {{ @@ -166,12 +134,46 @@ macro_rules! lsn { }}; } -/// Short payload, 1132 bytes. -// pg_records are copypasted from log, where they are put with Debug impl of Bytes, which uses \0 -// for null bytes. -#[allow(clippy::octal_escapes)] -fn short() -> Request { - Request { +/// Simple wrapper around `WalRedoManager::request_redo`. +/// +/// In benchmarks this is cloned around. +#[derive(Clone)] +struct Request { + key: Key, + lsn: Lsn, + base_img: Option<(Lsn, Bytes)>, + records: Vec<(Lsn, NeonWalRecord)>, + pg_version: u32, +} + +impl Request { + async fn execute(&self, manager: &PostgresRedoManager) -> anyhow::Result { + let Request { + key, + lsn, + base_img, + records, + pg_version, + } = self; + + // TODO: avoid these clones + manager + .request_redo(*key, *lsn, base_img.clone(), records.clone(), *pg_version) + .await + } + + fn pg_record(will_init: bool, bytes: &'static [u8]) -> NeonWalRecord { + let rec = Bytes::from_static(bytes); + NeonWalRecord::Postgres { will_init, rec } + } + + /// Short payload, 1132 bytes. + // pg_records are copypasted from log, where they are put with Debug impl of Bytes, which uses \0 + // for null bytes. + #[allow(clippy::octal_escapes)] + pub fn short_input() -> Request { + let pg_record = Self::pg_record; + Request { key: Key { field1: 0, field2: 1663, @@ -194,13 +196,14 @@ fn short() -> Request { ], pg_version: 14, } -} + } -/// Medium sized payload, serializes as 26393 bytes. -// see [`short`] -#[allow(clippy::octal_escapes)] -fn medium() -> Request { - Request { + /// Medium sized payload, serializes as 26393 bytes. 
+ // see [`short`] + #[allow(clippy::octal_escapes)] + pub fn medium_input() -> Request { + let pg_record = Self::pg_record; + Request { key: Key { field1: 0, field2: 1663, @@ -442,37 +445,5 @@ fn medium() -> Request { ], pg_version: 14, } -} - -fn pg_record(will_init: bool, bytes: &'static [u8]) -> NeonWalRecord { - let rec = Bytes::from_static(bytes); - NeonWalRecord::Postgres { will_init, rec } -} - -/// Simple wrapper around `WalRedoManager::request_redo`. -/// -/// In benchmarks this is cloned around. -#[derive(Clone)] -struct Request { - key: Key, - lsn: Lsn, - base_img: Option<(Lsn, Bytes)>, - records: Vec<(Lsn, NeonWalRecord)>, - pg_version: u32, -} - -impl Request { - async fn execute(self, manager: &PostgresRedoManager) -> anyhow::Result { - let Request { - key, - lsn, - base_img, - records, - pg_version, - } = self; - - manager - .request_redo(key, lsn, base_img, records, pg_version) - .await } } From 929423cf68b7cbef1edd27c24523460d5e3ac9a8 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 20 Mar 2024 18:04:14 +0000 Subject: [PATCH 2/6] add i3en.3xlarge reference numbers --- pageserver/benches/bench_walredo.rs | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs index 72c0f2c6c080..0d0e940bd567 100644 --- a/pageserver/benches/bench_walredo.rs +++ b/pageserver/benches/bench_walredo.rs @@ -24,6 +24,29 @@ //! is called sampling. Apparently the idea here is to detect outliers. //! We're not sure whether the current choice of sampling method makes sense. //! See https://bheisler.github.io/criterion.rs/book/user_guide/command_line_output.html#collecting-samples +//! +//! # Reference Numbers +//! +//! 2024-03-20 on i3en.3xlarge +//! +//! ```text +//! short/1 time: [21.612 µs 21.707 µs 21.817 µs] +//! short/2 time: [27.216 µs 27.372 µs 27.557 µs] +//! short/4 time: [44.398 µs 45.858 µs 47.178 µs] +//! short/8 time: [81.236 µs 83.332 µs 85.419 µs] +//! 
short/16 time: [138.29 µs 139.76 µs 141.24 µs] +//! short/32 time: [149.67 µs 150.44 µs 151.30 µs] +//! short/64 time: [155.31 µs 155.90 µs 156.59 µs] +//! short/128 time: [156.33 µs 156.85 µs 157.44 µs] +//! medium/1 time: [105.47 µs 105.87 µs 106.36 µs] +//! medium/2 time: [157.09 µs 157.68 µs 158.40 µs] +//! medium/4 time: [293.69 µs 306.80 µs 318.11 µs] +//! medium/8 time: [594.88 µs 614.05 µs 633.18 µs] +//! medium/16 time: [848.28 µs 853.06 µs 858.68 µs] +//! medium/32 time: [916.31 µs 920.97 µs 926.27 µs] +//! medium/64 time: [939.99 µs 945.49 µs 951.30 µs] +//! medium/128 time: [918.89 µs 928.75 µs 938.41 µs] +//! ``` use bytes::{Buf, Bytes}; use criterion::{BenchmarkId, Criterion}; From 15cfa7b6e9c2d72c4c8d2aa9640a14e3b35e8b82 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 21 Mar 2024 10:51:22 +0000 Subject: [PATCH 3/6] apply review suggestions --- pageserver/benches/bench_walredo.rs | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs index 0d0e940bd567..68902b991bcd 100644 --- a/pageserver/benches/bench_walredo.rs +++ b/pageserver/benches/bench_walredo.rs @@ -125,11 +125,13 @@ fn bench_impl(redo_work: Arc, n_redos: u64, nclients: u64) -> Duration }); } - let mut total_wallclock_time = std::time::Duration::from_millis(0); - while let Some(res) = rt.block_on(tasks.join_next()) { - total_wallclock_time += res.unwrap(); - } - total_wallclock_time + rt.block_on(async move { + let mut total_wallclock_time = std::time::Duration::from_millis(0); + while let Some(res) = tasks.join_next().await { + total_wallclock_time += res.unwrap(); + } + total_wallclock_time + }) } async fn client( From db3333eecb3257e4f059f63a913d404e9c6a1e84 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 21 Mar 2024 10:51:42 +0000 Subject: [PATCH 4/6] yield after every redo execution --- pageserver/benches/bench_walredo.rs | 3 +++ 1 file changed, 3 insertions(+) diff �
--git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs index 68902b991bcd..e98457a6ff3a 100644 --- a/pageserver/benches/bench_walredo.rs +++ b/pageserver/benches/bench_walredo.rs @@ -145,6 +145,9 @@ async fn client( for _ in 0..n_redos { let page = redo_work.execute(&mgr).await.unwrap(); assert_eq!(page.remaining(), 8192); + // The real pageserver will rarely if ever do 2 walredos in a row without + // yielding to the executor. + tokio::task::yield_now().await; } start.elapsed() } From d6c45625e63e77c4b2f961045633262b841ca9e5 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 21 Mar 2024 12:30:10 +0000 Subject: [PATCH 5/6] update numbers (the yield makes a big difference, who would have thunken) --- pageserver/benches/bench_walredo.rs | 32 ++++++++++++++--------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs index e98457a6ff3a..dbc8197d25e8 100644 --- a/pageserver/benches/bench_walredo.rs +++ b/pageserver/benches/bench_walredo.rs @@ -30,22 +30,22 @@ //! 2024-03-20 on i3en.3xlarge //! //! ```text -//! short/1 time: [21.612 µs 21.707 µs 21.817 µs] -//! short/2 time: [27.216 µs 27.372 µs 27.557 µs] -//! short/4 time: [44.398 µs 45.858 µs 47.178 µs] -//! short/8 time: [81.236 µs 83.332 µs 85.419 µs] -//! short/16 time: [138.29 µs 139.76 µs 141.24 µs] -//! short/32 time: [149.67 µs 150.44 µs 151.30 µs] -//! short/64 time: [155.31 µs 155.90 µs 156.59 µs] -//! short/128 time: [156.33 µs 156.85 µs 157.44 µs] -//! medium/1 time: [105.47 µs 105.87 µs 106.36 µs] -//! medium/2 time: [157.09 µs 157.68 µs 158.40 µs] -//! medium/4 time: [293.69 µs 306.80 µs 318.11 µs] -//! medium/8 time: [594.88 µs 614.05 µs 633.18 µs] -//! medium/16 time: [848.28 µs 853.06 µs 858.68 µs] -//! medium/32 time: [916.31 µs 920.97 µs 926.27 µs] -//! medium/64 time: [939.99 µs 945.49 µs 951.30 µs] -//! medium/128 time: [918.89 µs 928.75 µs 938.41 µs] +//! 
short/1 time: [26.483 µs 26.614 µs 26.767 µs] +//! short/2 time: [32.223 µs 32.465 µs 32.767 µs] +//! short/4 time: [47.203 µs 47.583 µs 47.984 µs] +//! short/8 time: [89.135 µs 89.612 µs 90.139 µs] +//! short/16 time: [190.12 µs 191.52 µs 192.88 µs] +//! short/32 time: [380.96 µs 382.63 µs 384.20 µs] +//! short/64 time: [736.86 µs 741.07 µs 745.03 µs] +//! short/128 time: [1.4106 ms 1.4206 ms 1.4294 ms] +//! medium/1 time: [111.81 µs 112.25 µs 112.79 µs] +//! medium/2 time: [158.26 µs 159.13 µs 160.21 µs] +//! medium/4 time: [334.65 µs 337.14 µs 340.07 µs] +//! medium/8 time: [675.32 µs 679.91 µs 685.25 µs] +//! medium/16 time: [1.2929 ms 1.2996 ms 1.3067 ms] +//! medium/32 time: [2.4295 ms 2.4461 ms 2.4623 ms] +//! medium/64 time: [4.3973 ms 4.4458 ms 4.4875 ms] +//! medium/128 time: [7.5955 ms 7.7847 ms 7.9481 ms] //! ``` use bytes::{Buf, Bytes}; From b2f5b84c2fdb7e33572487894f0518d2684aa12c Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 21 Mar 2024 13:00:55 +0000 Subject: [PATCH 6/6] cargo fmt --- pageserver/benches/bench_walredo.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs index dbc8197d25e8..3efad546a6f0 100644 --- a/pageserver/benches/bench_walredo.rs +++ b/pageserver/benches/bench_walredo.rs @@ -126,11 +126,11 @@ fn bench_impl(redo_work: Arc, n_redos: u64, nclients: u64) -> Duration } rt.block_on(async move { - let mut total_wallclock_time = std::time::Duration::from_millis(0); - while let Some(res) = tasks.join_next().await { - total_wallclock_time += res.unwrap(); - } - total_wallclock_time + let mut total_wallclock_time = std::time::Duration::from_millis(0); + while let Some(res) = tasks.join_next().await { + total_wallclock_time += res.unwrap(); + } + total_wallclock_time }) }