remove materialized page cache (#8105)

part of Epic #7386

# Motivation

The materialized page cache adds complexity to the code base, which increases the maintenance burden and the risk of subtle and hard-to-reproduce bugs such as #8050.

Further, the best hit rate that we currently achieve in production is ca. 1% of materialized page cache lookups for `task_kind=PageRequestHandler`. Other task kinds have hit rates <0.2%.

Last, caching page images in Pageserver rewards under-sized caches in Computes because reading from Pageserver's materialized page cache over the network is often sufficiently fast (low hundreds of microseconds). Such Computes should upscale their local caches to fit their working set, rather than repeatedly requesting the same page from Pageserver.

Some more discussion and context in internal thread https://neondb.slack.com/archives/C033RQ5SPDH/p1718714037708459

# Changes

This PR removes the materialized page cache code & metrics.

The infrastructure for different key kinds in `PageCache` is left in place, even though the "Immutable" key kind is the only remaining one. This can be further simplified in a future commit.

Some tests started failing because their total runtime was dependent on high materialized page cache hit rates. This PR makes them fixed-runtime or raises their pytest timeouts:

* test_local_file_cache_unlink
* test_physical_replication
* test_pg_regress

# Performance

I focussed on ensuring that this PR will not result in a performance regression in prod.

* **getpage** requests: our production metrics have shown the materialized page cache to be irrelevant (low hit rate). Also, Pageserver is the wrong place to cache page images; it should happen in compute.
* **ingest** (`task_kind=WalReceiverConnectionHandler`): prod metrics show 0 percent hit rate, so removing it will not be a regression.
* **get_lsn_by_timestamp**: important API for branch creation, used by the control plane. The clog pages that this code uses are not cached in the materialized page cache because they're not 8k. No risk of introducing a regression here.

We will watch the various nightly benchmarks closely for more results before shipping to prod.
neondatabase · Jun 27, 2024 · d8bf047 · d8bf047
1 parent 5d62c67
commit d8bf047
Show file tree

Hide file tree

Showing 9 changed files with 43 additions and 468 deletions.
diff --git a/docs/pageserver-pagecache.md b/docs/pageserver-pagecache.md
@@ -5,4 +5,3 @@ TODO:
 - shared across tenants
 - store pages from layer files
 - store pages from "in-memory layer"
-- store materialized pages
diff --git a/docs/settings.md b/docs/settings.md
@@ -134,7 +134,7 @@ depends on that, so if you change it, bad things will happen.
 
 #### page_cache_size
 
-Size of the page cache, to hold materialized page versions. Unit is
+Size of the page cache. Unit is
 number of 8 kB blocks. The default is 8192, which means 64 MB.
 
 #### max_file_descriptors

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
@@ -145,14 +145,6 @@ impl ReconstructTimeMetrics {
     }
 }
 
-pub(crate) static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::new(|| {
-    register_int_counter!(
-        "pageserver_materialized_cache_hits_direct_total",
-        "Number of cache hits from materialized page cache without redo",
-    )
-    .expect("failed to define a metric")
-});
-
 pub(crate) struct ReconstructDataTimeMetrics {
     singular: Histogram,
     vectored: Histogram,
@@ -182,14 +174,6 @@ pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy<ReconstructDataTimeMetrics> =
     }
 });
 
-pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
-    register_int_counter!(
-        "pageserver_materialized_cache_hits_total",
-        "Number of cache hits from materialized page cache",
-    )
-    .expect("failed to define a metric")
-});
-
 pub(crate) struct GetVectoredLatency {
     map: EnumMap<TaskKind, Option<Histogram>>,
 }
@@ -298,12 +282,8 @@ pub(crate) static SCAN_LATENCY: Lazy<ScanLatency> = Lazy::new(|| {
 });
 
 pub(crate) struct PageCacheMetricsForTaskKind {
-    pub read_accesses_materialized_page: IntCounter,
     pub read_accesses_immutable: IntCounter,
-
     pub read_hits_immutable: IntCounter,
-    pub read_hits_materialized_page_exact: IntCounter,
-    pub read_hits_materialized_page_older_lsn: IntCounter,
 }
 
 pub(crate) struct PageCacheMetrics {
@@ -336,16 +316,6 @@ pub(crate) static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMet
             let content_kind = <PageContentKind as enum_map::Enum>::from_usize(content_kind);
             let content_kind: &'static str = content_kind.into();
             PageCacheMetricsForTaskKind {
-                read_accesses_materialized_page: {
-                    PAGE_CACHE_READ_ACCESSES
-                        .get_metric_with_label_values(&[
-                            task_kind,
-                            "materialized_page",
-                            content_kind,
-                        ])
-                        .unwrap()
-                },
-
                 read_accesses_immutable: {
                     PAGE_CACHE_READ_ACCESSES
                         .get_metric_with_label_values(&[task_kind, "immutable", content_kind])
@@ -357,28 +327,6 @@ pub(crate) static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMet
                         .get_metric_with_label_values(&[task_kind, "immutable", content_kind, "-"])
                         .unwrap()
                 },
-
-                read_hits_materialized_page_exact: {
-                    PAGE_CACHE_READ_HITS
-                        .get_metric_with_label_values(&[
-                            task_kind,
-                            "materialized_page",
-                            content_kind,
-                            "exact",
-                        ])
-                        .unwrap()
-                },
-
-                read_hits_materialized_page_older_lsn: {
-                    PAGE_CACHE_READ_HITS
-                        .get_metric_with_label_values(&[
-                            task_kind,
-                            "materialized_page",
-                            content_kind,
-                            "older_lsn",
-                        ])
-                        .unwrap()
-                },
             }
         }))
     })),
@@ -394,7 +342,6 @@ pub(crate) struct PageCacheSizeMetrics {
     pub max_bytes: UIntGauge,
 
     pub current_bytes_immutable: UIntGauge,
-    pub current_bytes_materialized_page: UIntGauge,
 }
 
 static PAGE_CACHE_SIZE_CURRENT_BYTES: Lazy<UIntGaugeVec> = Lazy::new(|| {
@@ -420,11 +367,6 @@ pub(crate) static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> =
                 .get_metric_with_label_values(&["immutable"])
                 .unwrap()
         },
-        current_bytes_materialized_page: {
-            PAGE_CACHE_SIZE_CURRENT_BYTES
-                .get_metric_with_label_values(&["materialized_page"])
-                .unwrap()
-        },
     });
 
 pub(crate) mod page_cache_eviction_metrics {
@@ -2918,13 +2860,11 @@ pub fn preinitialize_metrics() {
     // FIXME(4813): make it so that we have no top level metrics as this fn will easily fall out of
     // order:
     // - global metrics reside in a Lazy<PageserverMetrics>
-    //   - access via crate::metrics::PS_METRICS.materialized_page_cache_hit.inc()
+    //   - access via crate::metrics::PS_METRICS.some_metric.inc()
     // - could move the statics into TimelineMetrics::new()?
 
     // counters
     [
-        &MATERIALIZED_PAGE_CACHE_HIT,
-        &MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
         &UNEXPECTED_ONDEMAND_DOWNLOADS,
         &WALRECEIVER_STARTED_CONNECTIONS,
         &WALRECEIVER_BROKER_UPDATES,