pageserver: add layer visibility calculation (#8511)

## Problem We recently added a "visibility" state to layers, but nothing initializes it. Part of: - #8398 ## Summary of changes - Add a dependency on `range-set-blaze`, which is used as a fast incrementally updated alternative to KeySpace. We could also use this to replace the internals of KeySpaceRandomAccum if we wanted to. Writing a type that does this kind of "BtreeMap & merge overlapping entries" thing isn't super complicated, but no reason to write this ourselves when there's a third party impl available. - Add a function to layermap to calculate visibilities for each layer - Add a function to Timeline to call into layermap and then apply these visibilities to the Layer objects. - Invoke the calculation during startup, after image layer creations, and when removing branches. Branch removal and image layer creation are the two ways that a layer can go from Visible to Covered. - Add unit test & benchmark for the visibility calculation - Expose `pageserver_visible_physical_size` metric, which should always be <= `pageserver_remote_physical_size`. - This metric will feed into the /v1/utilization endpoint later: the visible size indicates how much space we would like to use on this pageserver for this tenant. - When `pageserver_visible_physical_size` is greater than `pageserver_resident_physical_size`, this is a sign that the tenant has long-idle branches, which result in layers that are visible in principle, but not used in practice. This does not keep visibility hints up to date in all cases: particularly, when creating a child timeline, any previously covered layers will not get marked Visible until they are accessed. Updates after image layer creation could be implemented as more of a special case, but this would require more new code: the existing depth calculation code doesn't maintain+yield the list of deltas that would be covered by an image layer. ## Performance This operation is done rarely (at startup and at timeline deletion), so needs to be efficient but not ultra-fast. There is a new `visibility` bench that measures runtime for a synthetic 100k layers case (`sequential`) and a real layer map (`real_map`) with ~26k layers. The benchmark shows runtimes of single digit milliseconds (on a ryzen 7950). This confirms that the runtime shouldn't be a problem at startup (as we already incur S3-level latencies there), but that it's slow enough that we definitely shouldn't call it more often than necessary, and it may be worthwhile to optimize further later (things like: when removing a branch, only bother scanning layers below the branchpoint) ``` visibility/sequential time: [4.5087 ms 4.5894 ms 4.6775 ms] change: [+2.0826% +3.9097% +5.8995%] (p = 0.00 < 0.05) Performance has regressed. Found 24 outliers among 100 measurements (24.00%) 2 (2.00%) high mild 22 (22.00%) high severe min: 0/1696070, max: 93/1C0887F0 visibility/real_map time: [7.0796 ms 7.0832 ms 7.0871 ms] change: [+0.3900% +0.4505% +0.5164%] (p = 0.00 < 0.05) Change within noise threshold. Found 4 outliers among 100 measurements (4.00%) 3 (3.00%) high mild 1 (1.00%) high severe min: 0/1696070, max: 93/1C0887F0 visibility/real_map_many_branches time: [4.5285 ms 4.5355 ms 4.5434 ms] change: [-1.0012% -0.8004% -0.5969%] (p = 0.00 < 0.05) Change within noise threshold. ```
neondatabase · Aug 1, 2024 · 1678dea · 1678dea · github-actions · Aug 1, 2024
1 parent 163f2ea
commit 1678dea
Show file tree

Hide file tree

Showing 15 changed files with 725 additions and 76 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml
@@ -49,6 +49,7 @@ postgres_backend.workspace = true
 postgres-protocol.workspace = true
 postgres-types.workspace = true
 rand.workspace = true
+range-set-blaze = { version = "0.1.16", features = ["alloc"] }
 regex.workspace = true
 scopeguard.workspace = true
 serde.workspace = true

diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs
@@ -1,3 +1,4 @@
+use criterion::measurement::WallTime;
 use pageserver::keyspace::{KeyPartitioning, KeySpace};
 use pageserver::repository::Key;
 use pageserver::tenant::layer_map::LayerMap;
@@ -15,7 +16,11 @@ use utils::id::{TenantId, TimelineId};
 
 use utils::lsn::Lsn;
 
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use criterion::{black_box, criterion_group, criterion_main, BenchmarkGroup, Criterion};
+
+fn fixture_path(relative: &str) -> PathBuf {
+    PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(relative)
+}
 
 fn build_layer_map(filename_dump: PathBuf) -> LayerMap {
     let mut layer_map = LayerMap::default();
@@ -109,7 +114,7 @@ fn uniform_key_partitioning(layer_map: &LayerMap, _lsn: Lsn) -> KeyPartitioning
 // between each test run.
 fn bench_from_captest_env(c: &mut Criterion) {
     // TODO consider compressing this file
-    let layer_map = build_layer_map(PathBuf::from("benches/odd-brook-layernames.txt"));
+    let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt"));
     let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map);
 
     // Test with uniform query pattern
@@ -139,7 +144,7 @@ fn bench_from_captest_env(c: &mut Criterion) {
 fn bench_from_real_project(c: &mut Criterion) {
     // Init layer map
     let now = Instant::now();
-    let layer_map = build_layer_map(PathBuf::from("benches/odd-brook-layernames.txt"));
+    let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt"));
     println!("Finished layer map init in {:?}", now.elapsed());
 
     // Choose uniformly distributed queries
@@ -242,7 +247,72 @@ fn bench_sequential(c: &mut Criterion) {
     group.finish();
 }
 
+fn bench_visibility_with_map(
+    group: &mut BenchmarkGroup<WallTime>,
+    layer_map: LayerMap,
+    read_points: Vec<Lsn>,
+    bench_name: &str,
+) {
+    group.bench_function(bench_name, |b| {
+        b.iter(|| black_box(layer_map.get_visibility(read_points.clone())));
+    });
+}
+
+// Benchmark using synthetic data. Arrange image layers on stacked diagonal lines.
+fn bench_visibility(c: &mut Criterion) {
+    let mut group = c.benchmark_group("visibility");
+    {
+        // Init layer map. Create 100_000 layers arranged in 1000 diagonal lines.
+        let now = Instant::now();
+        let mut layer_map = LayerMap::default();
+        let mut updates = layer_map.batch_update();
+        for i in 0..100_000 {
+            let i32 = (i as u32) % 100;
+            let zero = Key::from_hex("000000000000000000000000000000000000").unwrap();
+            let layer = PersistentLayerDesc::new_img(
+                TenantShardId::unsharded(TenantId::generate()),
+                TimelineId::generate(),
+                zero.add(10 * i32)..zero.add(10 * i32 + 1),
+                Lsn(i),
+                0,
+            );
+            updates.insert_historic(layer);
+        }
+        updates.flush();
+        println!("Finished layer map init in {:?}", now.elapsed());
+
+        let mut read_points = Vec::new();
+        for i in (0..100_000).step_by(1000) {
+            read_points.push(Lsn(i));
+        }
+
+        bench_visibility_with_map(&mut group, layer_map, read_points, "sequential");
+    }
+
+    {
+        let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt"));
+        let read_points = vec![Lsn(0x1C760FA190)];
+        bench_visibility_with_map(&mut group, layer_map, read_points, "real_map");
+
+        let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt"));
+        let read_points = vec![
+            Lsn(0x1C760FA190),
+            Lsn(0x000000931BEAD539),
+            Lsn(0x000000931BF63011),
+            Lsn(0x000000931B33AE68),
+            Lsn(0x00000038E67ABFA0),
+            Lsn(0x000000931B33AE68),
+            Lsn(0x000000914E3F38F0),
+            Lsn(0x000000931B33AE68),
+        ];
+        bench_visibility_with_map(&mut group, layer_map, read_points, "real_map_many_branches");
+    }
+
+    group.finish();
+}
+
 criterion_group!(group_1, bench_from_captest_env);
 criterion_group!(group_2, bench_from_real_project);
 criterion_group!(group_3, bench_sequential);
-criterion_main!(group_1, group_2, group_3);
+criterion_group!(group_4, bench_visibility);
+criterion_main!(group_1, group_2, group_3, group_4);
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
@@ -525,6 +525,15 @@ static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
     .expect("failed to define a metric")
 });
 
+static VISIBLE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
+    register_uint_gauge_vec!(
+        "pageserver_visible_physical_size",
+        "The size of the layer files present in the pageserver's filesystem.",
+        &["tenant_id", "shard_id", "timeline_id"]
+    )
+    .expect("failed to define a metric")
+});
+
 pub(crate) static RESIDENT_PHYSICAL_SIZE_GLOBAL: Lazy<UIntGauge> = Lazy::new(|| {
     register_uint_gauge!(
         "pageserver_resident_physical_size_global",
@@ -2204,6 +2213,7 @@ pub(crate) struct TimelineMetrics {
     pub(crate) layer_count_delta: UIntGauge,
     pub standby_horizon_gauge: IntGauge,
     pub resident_physical_size_gauge: UIntGauge,
+    pub visible_physical_size_gauge: UIntGauge,
     /// copy of LayeredTimeline.current_logical_size
     pub current_logical_size_gauge: UIntGauge,
     pub aux_file_size_gauge: IntGauge,
@@ -2326,6 +2336,9 @@ impl TimelineMetrics {
         let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
             .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
             .unwrap();
+        let visible_physical_size_gauge = VISIBLE_PHYSICAL_SIZE
+            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
+            .unwrap();
         // TODO: we shouldn't expose this metric
         let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
             .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
@@ -2380,6 +2393,7 @@ impl TimelineMetrics {
             layer_count_delta,
             standby_horizon_gauge,
             resident_physical_size_gauge,
+            visible_physical_size_gauge,
             current_logical_size_gauge,
             aux_file_size_gauge,
             directory_entries_count_gauge,
@@ -2431,6 +2445,7 @@ impl TimelineMetrics {
             RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
             let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
         }
+        let _ = VISIBLE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
         let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
         if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) {
             let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]);

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
@@ -1634,7 +1634,7 @@ impl Tenant {
         self: Arc<Self>,
         timeline_id: TimelineId,
     ) -> Result<(), DeleteTimelineError> {
-        DeleteTimelineFlow::run(&self, timeline_id, false).await?;
+        DeleteTimelineFlow::run(&self, timeline_id).await?;
 
         Ok(())
     }