From 728b79b9ddc9987315e56085b12c5a4c46a7e0f4 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas
Date: Wed, 18 Sep 2024 00:12:13 +0300
Subject: [PATCH 01/77] Remove some unnecessary derives

---
 libs/consumption_metrics/src/lib.rs    | 4 ++--
 libs/pageserver_api/src/models.rs      | 2 +-
 libs/postgres_ffi/wal_craft/src/lib.rs | 1 -
 safekeeper/src/pull_timeline.rs        | 4 ++--
 4 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/libs/consumption_metrics/src/lib.rs b/libs/consumption_metrics/src/lib.rs
index 810196aff610..fbe2e6830f98 100644
--- a/libs/consumption_metrics/src/lib.rs
+++ b/libs/consumption_metrics/src/lib.rs
@@ -5,7 +5,7 @@ use chrono::{DateTime, Utc};
 use rand::Rng;
 use serde::{Deserialize, Serialize};
 
-#[derive(Serialize, serde::Deserialize, Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
+#[derive(Serialize, Deserialize, Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
 #[serde(tag = "type")]
 pub enum EventType {
     #[serde(rename = "absolute")]
@@ -107,7 +107,7 @@ pub const CHUNK_SIZE: usize = 1000;
 
 // Just a wrapper around a slice of events
 // to serialize it as `{"events" : [ ] }
-#[derive(serde::Serialize, serde::Deserialize)]
+#[derive(serde::Serialize, Deserialize)]
 pub struct EventChunk<'a, T: Clone> {
     pub events: std::borrow::Cow<'a, [T]>,
 }
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 45e84baa1f5d..c9be53f0b0c0 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -495,7 +495,7 @@ pub struct CompactionAlgorithmSettings {
     pub kind: CompactionAlgorithm,
 }
 
-#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)]
+#[derive(Debug, PartialEq, Eq, Clone, Deserialize, Serialize)]
 #[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
 pub enum L0FlushConfig {
     #[serde(rename_all = "snake_case")]
diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs
index 949e3f42511b..ddaafe65f170 100644
--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -26,7 +26,6 @@ macro_rules! xlog_utils_test {
 
 postgres_ffi::for_all_postgres_versions! { xlog_utils_test }
 
-#[derive(Debug, Clone, PartialEq, Eq)]
 pub struct Conf {
     pub pg_version: u32,
     pub pg_distrib_dir: PathBuf,
diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs
index 64585f5edcc0..c772ae6de7fd 100644
--- a/safekeeper/src/pull_timeline.rs
+++ b/safekeeper/src/pull_timeline.rs
@@ -278,7 +278,7 @@ impl WalResidentTimeline {
 }
 
 /// pull_timeline request body.
-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Debug, Deserialize)]
 pub struct Request {
     pub tenant_id: TenantId,
     pub timeline_id: TimelineId,
@@ -293,7 +293,7 @@ pub struct Response {
 }
 
 /// Response for debug dump request.
-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Debug, Deserialize)]
 pub struct DebugDumpResponse {
     pub start_time: DateTime<Utc>,
     pub finish_time: DateTime<Utc>,

From 15ae1fc3df41ba9f1e5093eaf6b8e7b91e32700f Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas
Date: Thu, 19 Sep 2024 00:38:26 +0300
Subject: [PATCH 02/77] Remove a few postgres constants that were not used

Dead code is generally useless, but with Postgres constants in
particular, I'm also worried that if they're not used anywhere, we
might fail to update them at a Postgres version update, and get very
confused later when they have wrong values.
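The hazard is easiest to see side by side. A minimal sketch (not part of the
patch; the struct here is only a stand-in for the bindgen-generated header
type) contrasts a constant derived from the FFI struct with a hand-copied
literal, both of which appear among the constants removed below:

```rust
use std::mem::size_of;

// Stand-in for the bindgen-generated struct; the real field layout comes
// from the Postgres headers and changes with the Postgres version.
#[repr(C)]
pub struct XLogRecord {
    pub xl_tot_len: u32,
    pub xl_xid: u32,
    pub xl_prev: u64,
    pub xl_info: u8,
    pub xl_rmid: u8,
    pub xl_crc: u32,
}

// Derived constant: a Postgres version bump regenerates XLogRecord and this
// value follows along for free.
pub const SIZEOF_XLOGRECORD: u32 = size_of::<XLogRecord>() as u32;

// Hand-copied literal from transam.h: nothing ties it back to the headers,
// so an unused copy can silently go stale at a version update.
pub const FIRST_NORMAL_OBJECT_ID: u32 = 16384;
```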
---
 libs/postgres_ffi/src/pg_constants.rs | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/libs/postgres_ffi/src/pg_constants.rs b/libs/postgres_ffi/src/pg_constants.rs
index 61b49a634dd4..497d011d7a20 100644
--- a/libs/postgres_ffi/src/pg_constants.rs
+++ b/libs/postgres_ffi/src/pg_constants.rs
@@ -9,8 +9,8 @@
 //! comments on them.
 //!
 
+use crate::PageHeaderData;
 use crate::BLCKSZ;
-use crate::{PageHeaderData, XLogRecord};
 
 //
 // From pg_tablespace_d.h
@@ -194,8 +194,6 @@ pub const XLR_RMGR_INFO_MASK: u8 = 0xF0;
 pub const XLOG_TBLSPC_CREATE: u8 = 0x00;
 pub const XLOG_TBLSPC_DROP: u8 = 0x10;
 
-pub const SIZEOF_XLOGRECORD: u32 = size_of::<XLogRecord>() as u32;
-
 //
 // from xlogrecord.h
 //
@@ -219,8 +217,6 @@ pub const BKPIMAGE_HAS_HOLE: u8 = 0x01; /* page image has "hole" */
 /* From transam.h */
 pub const FIRST_NORMAL_TRANSACTION_ID: u32 = 3;
 pub const INVALID_TRANSACTION_ID: u32 = 0;
-pub const FIRST_BOOTSTRAP_OBJECT_ID: u32 = 12000;
-pub const FIRST_NORMAL_OBJECT_ID: u32 = 16384;
 
 /* pg_control.h */
 pub const XLOG_CHECKPOINT_SHUTDOWN: u8 = 0x00;

From 7b34c2d7af5c67f413cc93b16ca6dbe6932072f2 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas
Date: Wed, 18 Sep 2024 23:39:28 +0300
Subject: [PATCH 03/77] Remove misc dead code in libs/

---
 libs/postgres_backend/src/lib.rs | 10 ----------
 libs/remote_storage/src/lib.rs   |  4 ----
 libs/utils/src/vec_map.rs        | 26 --------------------------
 3 files changed, 40 deletions(-)

diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs
index 8ea4b93fb12e..e274d2458573 100644
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -280,16 +280,6 @@ pub struct PostgresBackend<IO> {
 
 pub type PostgresBackendTCP = PostgresBackend<tokio::net::TcpStream>;
 
-pub fn query_from_cstring(query_string: Bytes) -> Vec<u8> {
-    let mut query_string = query_string.to_vec();
-    if let Some(ch) = query_string.last() {
-        if *ch == 0 {
-            query_string.pop();
-        }
-    }
-    query_string
-}
-
 /// Cast a byte slice to a string slice, dropping null terminator if there's one.
 fn cstr_to_str(bytes: &[u8]) -> anyhow::Result<&str> {
     let without_null = bytes.strip_suffix(&[0]).unwrap_or(bytes);
diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs
index b5b69c9fafed..45267ccda992 100644
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -127,10 +127,6 @@ impl RemotePath {
         &self.0
     }
 
-    pub fn extension(&self) -> Option<&str> {
-        self.0.extension()
-    }
-
     pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Utf8Path, std::path::StripPrefixError> {
         self.0.strip_prefix(&p.0)
     }
diff --git a/libs/utils/src/vec_map.rs b/libs/utils/src/vec_map.rs
index 5f0028bacd49..1fe048c6f00f 100644
--- a/libs/utils/src/vec_map.rs
+++ b/libs/utils/src/vec_map.rs
@@ -120,32 +120,6 @@ impl<K: Ord, V> VecMap<K, V> {
         Ok((None, delta_size))
     }
 
-    /// Split the map into two.
-    ///
-    /// The left map contains everything before `cutoff` (exclusive).
-    /// Right map contains `cutoff` and everything after (inclusive).
-    pub fn split_at(&self, cutoff: &K) -> (Self, Self)
-    where
-        K: Clone,
-        V: Clone,
-    {
-        let split_idx = self
-            .data
-            .binary_search_by_key(&cutoff, extract_key)
-            .unwrap_or_else(std::convert::identity);
-
-        (
-            VecMap {
-                data: self.data[..split_idx].to_vec(),
-                ordering: self.ordering,
-            },
-            VecMap {
-                data: self.data[split_idx..].to_vec(),
-                ordering: self.ordering,
-            },
-        )
-    }
-
     /// Move items from `other` to the end of `self`, leaving `other` empty.
/// If the `other` ordering is different from `self` ordering
     /// `ExtendOrderingError` error will be returned.

From a523548ed1791c683efe76cf5a2f42443846e358 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas
Date: Thu, 19 Sep 2024 01:05:19 +0300
Subject: [PATCH 04/77] Remove misc dead code in control_plane/

---
 control_plane/src/pageserver.rs | 43 +--------------------------------
 1 file changed, 1 insertion(+), 42 deletions(-)

diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index 33ca70af96c5..cae9416af6c2 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -17,9 +17,7 @@ use std::time::Duration;
 
 use anyhow::{bail, Context};
 use camino::Utf8PathBuf;
-use pageserver_api::models::{
-    self, AuxFilePolicy, LocationConfig, TenantHistorySize, TenantInfo, TimelineInfo,
-};
+use pageserver_api::models::{self, AuxFilePolicy, TenantInfo, TimelineInfo};
 use pageserver_api::shard::TenantShardId;
 use pageserver_client::mgmt_api;
 use postgres_backend::AuthType;
@@ -324,22 +322,6 @@ impl PageServerNode {
         background_process::stop_process(immediate, "pageserver", &self.pid_file())
     }
 
-    pub async fn page_server_psql_client(
-        &self,
-    ) -> anyhow::Result<(
-        tokio_postgres::Client,
-        tokio_postgres::Connection<tokio_postgres::Socket, tokio_postgres::tls::NoTlsStream>,
-    )> {
-        let mut config = self.pg_connection_config.clone();
-        if self.conf.pg_auth_type == AuthType::NeonJWT {
-            let token = self
-                .env
-                .generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;
-            config = config.set_password(Some(token));
-        }
-        Ok(config.connect_no_tls().await?)
-    }
-
     pub async fn check_status(&self) -> mgmt_api::Result<()> {
         self.http_client.status().await
     }
@@ -540,19 +522,6 @@ impl PageServerNode {
         Ok(())
     }
 
-    pub async fn location_config(
-        &self,
-        tenant_shard_id: TenantShardId,
-        config: LocationConfig,
-        flush_ms: Option<Duration>,
-        lazy: bool,
-    ) -> anyhow::Result<()> {
-        Ok(self
-            .http_client
-            .location_config(tenant_shard_id, config, flush_ms, lazy)
-            .await?)
-    }
-
     pub async fn timeline_list(
        &self,
         tenant_shard_id: &TenantShardId,
@@ -636,14 +605,4 @@ impl PageServerNode {
 
         Ok(())
     }
-
-    pub async fn tenant_synthetic_size(
-        &self,
-        tenant_shard_id: TenantShardId,
-    ) -> anyhow::Result<TenantHistorySize> {
-        Ok(self
-            .http_client
-            .tenant_synthetic_size(tenant_shard_id)
-            .await?)
-    }
 }

From 5da2340e740c96ac3f0da110c52c62f184a5c92c Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas
Date: Thu, 19 Sep 2024 01:04:57 +0300
Subject: [PATCH 05/77] Remove unused code for parsing a postgresql.conf file

---
 control_plane/src/postgresql_conf.rs | 121 +--------------------------
 1 file changed, 1 insertion(+), 120 deletions(-)

diff --git a/control_plane/src/postgresql_conf.rs b/control_plane/src/postgresql_conf.rs
index 638575eb823b..5aee12dc974a 100644
--- a/control_plane/src/postgresql_conf.rs
+++ b/control_plane/src/postgresql_conf.rs
@@ -4,13 +4,10 @@
 /// NOTE: This doesn't implement the full, correct postgresql.conf syntax. Just
 /// enough to extract a few settings we need in Neon, assuming you don't do
 /// funny stuff like include-directives or funny escaping.
-use anyhow::{bail, Context, Result};
 use once_cell::sync::Lazy;
 use regex::Regex;
 use std::collections::HashMap;
 use std::fmt;
-use std::io::BufRead;
-use std::str::FromStr;
 
 /// In-memory representation of a postgresql.conf file
 #[derive(Default, Debug)]
@@ -19,84 +16,16 @@ pub struct PostgresConf {
     lines: Vec<String>,
     hash: HashMap<String, String>,
 }
 
-static CONF_LINE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^((?:\w|\.)+)\s*=\s*(\S+)$").unwrap());
-
 impl PostgresConf {
     pub fn new() -> PostgresConf {
         PostgresConf::default()
     }
 
-    /// Read file into memory
-    pub fn read(read: impl std::io::Read) -> Result<Self> {
-        let mut result = Self::new();
-
-        for line in std::io::BufReader::new(read).lines() {
-            let line = line?;
-
-            // Store each line in a vector, in original format
-            result.lines.push(line.clone());
-
-            // Also parse each line and insert key=value lines into a hash map.
-            //
-            // FIXME: This doesn't match exactly the flex/bison grammar in PostgreSQL.
-            // But it's close enough for our usage.
-            let line = line.trim();
-            if line.starts_with('#') {
-                // comment, ignore
-                continue;
-            } else if let Some(caps) = CONF_LINE_RE.captures(line) {
-                let name = caps.get(1).unwrap().as_str();
-                let raw_val = caps.get(2).unwrap().as_str();
-
-                if let Ok(val) = deescape_str(raw_val) {
-                    // Note: if there's already an entry in the hash map for
-                    // this key, this will replace it. That's the behavior what
-                    // we want; when PostgreSQL reads the file, each line
-                    // overrides any previous value for the same setting.
-                    result.hash.insert(name.to_string(), val.to_string());
-                }
-            }
-        }
-        Ok(result)
-    }
-
     /// Return the current value of 'option'
     pub fn get(&self, option: &str) -> Option<&str> {
         self.hash.get(option).map(|x| x.as_ref())
     }
 
-    /// Return the current value of a field, parsed to the right datatype.
-    ///
-    /// This calls the FromStr::parse() function on the value of the field. If
-    /// the field does not exist, or parsing fails, returns an error.
-    ///
-    pub fn parse_field<T>(&self, field_name: &str, context: &str) -> Result<T>
-    where
-        T: FromStr,
-        <T as FromStr>::Err: std::error::Error + Send + Sync + 'static,
-    {
-        self.get(field_name)
-            .with_context(|| format!("could not find '{}' option {}", field_name, context))?
-            .parse::<T>()
-            .with_context(|| format!("could not parse '{}' option {}", field_name, context))
-    }
-
-    pub fn parse_field_optional<T>(&self, field_name: &str, context: &str) -> Result<Option<T>>
-    where
-        T: FromStr,
-        <T as FromStr>::Err: std::error::Error + Send + Sync + 'static,
-    {
-        if let Some(val) = self.get(field_name) {
-            let result = val
-                .parse::<T>()
-                .with_context(|| format!("could not parse '{}' option {}", field_name, context))?;
-
-            Ok(Some(result))
-        } else {
-            Ok(None)
-        }
-    }
-
     ///
     /// Note: if you call this multiple times for the same option, the config
     /// file will a line for each call. It would be nice to have a function
@@ -154,48 +83,8 @@ fn escape_str(s: &str) -> String {
     }
 }
 
-/// De-escape a possibly-quoted value.
-///
-/// See `DeescapeQuotedString` function in PostgreSQL sources for how PostgreSQL
-/// does this.
-fn deescape_str(s: &str) -> Result<String> {
-    // If the string has a quote at the beginning and end, strip them out.
- if s.len() >= 2 && s.starts_with('\'') && s.ends_with('\'') { - let mut result = String::new(); - - let mut iter = s[1..(s.len() - 1)].chars().peekable(); - while let Some(c) = iter.next() { - let newc = if c == '\\' { - match iter.next() { - Some('b') => '\x08', - Some('f') => '\x0c', - Some('n') => '\n', - Some('r') => '\r', - Some('t') => '\t', - Some('0'..='7') => { - // TODO - bail!("octal escapes not supported"); - } - Some(n) => n, - None => break, - } - } else if c == '\'' && iter.peek() == Some(&'\'') { - // doubled quote becomes just one quote - iter.next().unwrap() - } else { - c - }; - - result.push(newc); - } - Ok(result) - } else { - Ok(s.to_string()) - } -} - #[test] -fn test_postgresql_conf_escapes() -> Result<()> { +fn test_postgresql_conf_escapes() -> anyhow::Result<()> { assert_eq!(escape_str("foo bar"), "'foo bar'"); // these don't need to be quoted assert_eq!(escape_str("foo"), "foo"); @@ -214,13 +103,5 @@ fn test_postgresql_conf_escapes() -> Result<()> { assert_eq!(escape_str("fo\\o"), "'fo\\\\o'"); assert_eq!(escape_str("10 cats"), "'10 cats'"); - // Test de-escaping - assert_eq!(deescape_str(&escape_str("foo"))?, "foo"); - assert_eq!(deescape_str(&escape_str("fo'o\nba\\r"))?, "fo'o\nba\\r"); - assert_eq!(deescape_str("'\\b\\f\\n\\r\\t'")?, "\x08\x0c\n\r\t"); - - // octal-escapes are currently not supported - assert!(deescape_str("'foo\\7\\07\\007'").is_err()); - Ok(()) } From a523548ed1791c683efe76cf5a2f42443846e358 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 19 Sep 2024 00:16:15 +0300 Subject: [PATCH 06/77] Remove unused cleanup_remaining_timeline_fs_traces function There's some more code that still checks for uninit and delete markers, see callers of is_delete_mark and is_uninit_mark, and github issue #5718. But these functions were outright dead. --- pageserver/src/config.rs | 14 +--------- pageserver/src/tenant/timeline/delete.rs | 35 +----------------------- 2 files changed, 2 insertions(+), 47 deletions(-) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index e9f197ec2dc9..525d9afebc6d 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -13,7 +13,6 @@ use pageserver_api::{ use remote_storage::{RemotePath, RemoteStorageConfig}; use std::env; use storage_broker::Uri; -use utils::crashsafe::path_with_suffix_extension; use utils::logging::SecretString; use once_cell::sync::OnceCell; @@ -33,7 +32,7 @@ use crate::tenant::storage_layer::inmemory_layer::IndexEntry; use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; use crate::virtual_file; use crate::virtual_file::io_engine; -use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX}; +use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME}; /// Global state of pageserver. /// @@ -257,17 +256,6 @@ impl PageServerConf { .join(timeline_id.to_string()) } - pub(crate) fn timeline_delete_mark_file_path( - &self, - tenant_shard_id: TenantShardId, - timeline_id: TimelineId, - ) -> Utf8PathBuf { - path_with_suffix_extension( - self.timeline_path(&tenant_shard_id, &timeline_id), - TIMELINE_DELETE_MARK_SUFFIX, - ) - } - /// Turns storage remote path of a file into its local path. 
pub fn local_path(&self, remote_path: &RemotePath) -> Utf8PathBuf {
         remote_path.with_base(&self.workdir)
diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs
index dc4118bb4a68..90db08ea819c 100644
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -135,25 +135,6 @@ async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<()> {
         .context("delete_all")
 }
 
-// This function removs remaining traces of a timeline on disk.
-// Namely: metadata file, timeline directory, delete mark.
-// Note: io::ErrorKind::NotFound are ignored for metadata and timeline dir.
-// delete mark should be present because it is the last step during deletion.
-// (nothing can fail after its deletion)
-async fn cleanup_remaining_timeline_fs_traces(
-    conf: &PageServerConf,
-    tenant_shard_id: TenantShardId,
-    timeline_id: TimelineId,
-) -> anyhow::Result<()> {
-    // Remove delete mark
-    // TODO: once we are confident that no more exist in the field, remove this
-    // line. It cleans up a legacy marker file that might in rare cases be present.
-    tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_shard_id, timeline_id))
-        .await
-        .or_else(fs_ext::ignore_not_found)
-        .context("remove delete mark")
-}
-
 /// It is important that this gets called when DeletionGuard is being held.
 /// For more context see comments in [`DeleteTimelineFlow::prepare`]
 async fn remove_timeline_from_tenant(
@@ -194,12 +175,10 @@ async fn remove_timeline_from_tenant(
 /// 7. Delete mark file
 ///
 /// It is resumable from any step in case a crash/restart occurs.
-/// There are three entrypoints to the process:
+/// There are two entrypoints to the process:
 /// 1. [`DeleteTimelineFlow::run`] this is the main one called by a management api handler.
 /// 2. [`DeleteTimelineFlow::resume_deletion`] is called during restarts when local metadata is still present
 ///    and we possibly neeed to continue deletion of remote files.
-/// 3. [`DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces`] is used when we deleted remote
-///    index but still have local metadata, timeline directory and delete mark.
 ///
 /// Note the only other place that messes around timeline delete mark is the logic that scans directory with timelines during tenant load.
 #[derive(Default)]
@@ -311,18 +290,6 @@ impl DeleteTimelineFlow {
         Ok(())
     }
 
-    #[instrument(skip_all, fields(%timeline_id))]
-    pub async fn cleanup_remaining_timeline_fs_traces(
-        tenant: &Tenant,
-        timeline_id: TimelineId,
-    ) -> anyhow::Result<()> {
-        let r =
-            cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_shard_id, timeline_id)
-                .await;
-        info!("Done");
-        r
-    }
-
     fn prepare(
         tenant: &Tenant,
         timeline_id: TimelineId,

From 2753abc0d88cad4ac0e9d96f95fddb7515ff7204 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas
Date: Thu, 19 Sep 2024 01:10:09 +0300
Subject: [PATCH 07/77] Remove leftover enums for configuring vectored get
 implementation

The settings were removed in commit b9d2c7bdd5.
--- libs/pageserver_api/src/config.rs | 34 ------------------------------- 1 file changed, 34 deletions(-) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 1194ee93efea..fa6f594ea58e 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -173,40 +173,6 @@ impl Default for EvictionOrder { } } -#[derive( - Eq, - PartialEq, - Debug, - Copy, - Clone, - strum_macros::EnumString, - strum_macros::Display, - serde_with::DeserializeFromStr, - serde_with::SerializeDisplay, -)] -#[strum(serialize_all = "kebab-case")] -pub enum GetVectoredImpl { - Sequential, - Vectored, -} - -#[derive( - Eq, - PartialEq, - Debug, - Copy, - Clone, - strum_macros::EnumString, - strum_macros::Display, - serde_with::DeserializeFromStr, - serde_with::SerializeDisplay, -)] -#[strum(serialize_all = "kebab-case")] -pub enum GetImpl { - Legacy, - Vectored, -} - #[derive(Copy, Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)] #[serde(transparent)] pub struct MaxVectoredReadBytes(pub NonZeroUsize); From 5c68e6a1724361824e64dc84b9523809a1581e7b Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 18 Sep 2024 23:43:06 +0300 Subject: [PATCH 08/77] Remove unused constant The code that used it was removed in commit b9d2c7bdd5 --- libs/pageserver_api/src/config.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index fa6f594ea58e..425e710372cb 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -304,8 +304,6 @@ pub mod defaults { pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm = ImageCompressionAlgorithm::Zstd { level: Some(1) }; - pub const DEFAULT_VALIDATE_VECTORED_GET: bool = false; - pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0; pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512; From 06d55a3b12b551902d9a1484459a2da71082a0ca Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 19 Sep 2024 00:38:42 +0300 Subject: [PATCH 09/77] Clean up concurrent logical size calc semaphore initialization The DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES constant was unused, because we had just hardcoded it to 1 where the constant should've been used. Remove the ConfigurableSemaphore::Default implementation, since it was unused. 
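In miniature, the pattern the patch moves to looks like this (an illustrative
sketch only, not the full config; the constant's value of 1 is implied by the
previously hardcoded default):

```rust
use std::num::NonZeroUsize;

// Named default; the value is implied by the previously hardcoded `1`.
pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize = 1;

pub struct ConfigToml {
    pub concurrent_tenant_size_logical_size_queries: NonZeroUsize,
}

impl Default for ConfigToml {
    fn default() -> Self {
        Self {
            // Route the default through the named constant instead of a bare
            // literal, so the constant can never silently disagree with the
            // value actually used.
            concurrent_tenant_size_logical_size_queries: NonZeroUsize::new(
                DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES,
            )
            .unwrap(),
        }
    }
}
```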
--- libs/pageserver_api/src/config.rs | 5 ++++- pageserver/src/config.rs | 11 ----------- 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 425e710372cb..1eb0757592a6 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -340,7 +340,10 @@ impl Default for ConfigToml { concurrent_tenant_warmup: (NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP) .expect("Invalid default constant")), - concurrent_tenant_size_logical_size_queries: NonZeroUsize::new(1).unwrap(), + concurrent_tenant_size_logical_size_queries: NonZeroUsize::new( + DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES, + ) + .unwrap(), metric_collection_interval: (humantime::parse_duration( DEFAULT_METRIC_COLLECTION_INTERVAL, ) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 525d9afebc6d..8567c6aa52f1 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -479,11 +479,6 @@ pub struct ConfigurableSemaphore { } impl ConfigurableSemaphore { - pub const DEFAULT_INITIAL: NonZeroUsize = match NonZeroUsize::new(1) { - Some(x) => x, - None => panic!("const unwrap is not yet stable"), - }; - /// Initializse using a non-zero amount of permits. /// /// Require a non-zero initial permits, because using permits == 0 is a crude way to disable a @@ -504,12 +499,6 @@ impl ConfigurableSemaphore { } } -impl Default for ConfigurableSemaphore { - fn default() -> Self { - Self::new(Self::DEFAULT_INITIAL) - } -} - impl PartialEq for ConfigurableSemaphore { fn eq(&self, other: &Self) -> bool { // the number of permits can be increased at runtime, so we cannot really fulfill the From 7c489092b7c63c0b3e42597b976522face0baaf8 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 18 Sep 2024 23:46:09 +0300 Subject: [PATCH 10/77] Remove unused duplicate DEFAULT_INGEST_BATCH_SIZE constant This constant in 'tenant_conf_defaults' was unused, but there's another constant with the same name in the global 'defaults'. I wish the setting was configurable per-tenant, but it isn't, so let's remove the confusing duplicate. --- libs/pageserver_api/src/config.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 1eb0757592a6..61e32bc9ab67 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -434,8 +434,6 @@ pub mod tenant_conf_defaults { // By default ingest enough WAL for two new L0 layers before checking if new image // image layers should be created. pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2; - - pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100; } impl Default for TenantConfigToml { From 32a0e759bd57f40af6c168f3122a761fb154b5b1 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 13 Sep 2024 01:28:12 +0300 Subject: [PATCH 11/77] safekeeper: add wal_last_modified to debug_dump. Adds to debug_dump option to include highest modified time among all WAL segments. In passing replace some str with OsStr to have less unwraps. 
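The `&str` to `&OsStr` change shifts UTF-8 handling from the callers into the
helpers. A condensed before/after sketch (snake_case names mark it as
illustrative; it mirrors the `IsXLogFileName` change in the diff below):

```rust
use std::ffi::OsStr;

const XLOG_FNAME_LEN: usize = 24;

// Before: the caller had to produce a &str first, typically via
// `file_name().into_string().unwrap()`, which panics on non-UTF-8 names.
fn is_xlog_file_name_str(fname: &str) -> bool {
    fname.len() == XLOG_FNAME_LEN && fname.chars().all(|c| c.is_ascii_hexdigit())
}

// After: the helper accepts the OS-native name and treats non-UTF-8 as
// "not a WAL segment", so directory-walking code needs no unwrap at all.
fn is_xlog_file_name(fname: &OsStr) -> bool {
    match fname.to_str() {
        Some(name) => name.len() == XLOG_FNAME_LEN && name.chars().all(|c| c.is_ascii_hexdigit()),
        None => false,
    }
}

fn main() {
    assert!(is_xlog_file_name(OsStr::new("000000010000000000000001")));
    assert!(!is_xlog_file_name(OsStr::new("archive_status")));
    assert!(is_xlog_file_name_str("000000010000000000000001"));
}
```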
--- libs/postgres_ffi/src/xlog_utils.rs | 34 ++++++++++++++----- libs/postgres_ffi/wal_craft/src/lib.rs | 5 +-- .../wal_craft/src/xlog_utils_test.rs | 19 ++++++----- safekeeper/src/debug_dump.rs | 33 ++++++++++++++++++ safekeeper/src/http/routes.rs | 4 +++ safekeeper/src/wal_storage.rs | 25 ++++++-------- test_runner/regress/test_wal_acceptor.py | 1 + 7 files changed, 88 insertions(+), 33 deletions(-) diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 1873734753f3..a636bd2a97ef 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -26,6 +26,7 @@ use bytes::{Buf, Bytes}; use log::*; use serde::Serialize; +use std::ffi::OsStr; use std::fs::File; use std::io::prelude::*; use std::io::ErrorKind; @@ -78,19 +79,34 @@ pub fn XLogFileName(tli: TimeLineID, logSegNo: XLogSegNo, wal_segsz_bytes: usize ) } -pub fn XLogFromFileName(fname: &str, wal_seg_size: usize) -> (XLogSegNo, TimeLineID) { - let tli = u32::from_str_radix(&fname[0..8], 16).unwrap(); - let log = u32::from_str_radix(&fname[8..16], 16).unwrap() as XLogSegNo; - let seg = u32::from_str_radix(&fname[16..24], 16).unwrap() as XLogSegNo; - (log * XLogSegmentsPerXLogId(wal_seg_size) + seg, tli) +pub fn XLogFromFileName( + fname: &OsStr, + wal_seg_size: usize, +) -> anyhow::Result<(XLogSegNo, TimeLineID)> { + if let Some(fname_str) = fname.to_str() { + let tli = u32::from_str_radix(&fname_str[0..8], 16)?; + let log = u32::from_str_radix(&fname_str[8..16], 16)? as XLogSegNo; + let seg = u32::from_str_radix(&fname_str[16..24], 16)? as XLogSegNo; + Ok((log * XLogSegmentsPerXLogId(wal_seg_size) + seg, tli)) + } else { + anyhow::bail!("non-ut8 filename: {:?}", fname); + } } -pub fn IsXLogFileName(fname: &str) -> bool { - return fname.len() == XLOG_FNAME_LEN && fname.chars().all(|c| c.is_ascii_hexdigit()); +pub fn IsXLogFileName(fname: &OsStr) -> bool { + if let Some(fname) = fname.to_str() { + fname.len() == XLOG_FNAME_LEN && fname.chars().all(|c| c.is_ascii_hexdigit()) + } else { + false + } } -pub fn IsPartialXLogFileName(fname: &str) -> bool { - fname.ends_with(".partial") && IsXLogFileName(&fname[0..fname.len() - 8]) +pub fn IsPartialXLogFileName(fname: &OsStr) -> bool { + if let Some(fname) = fname.to_str() { + fname.ends_with(".partial") && IsXLogFileName(OsStr::new(&fname[0..fname.len() - 8])) + } else { + false + } } /// If LSN points to the beginning of the page, then shift it to first record, diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index ddaafe65f170..5c0abda52274 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -7,6 +7,7 @@ use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ}; use postgres_ffi::{ XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD, }; +use std::ffi::OsStr; use std::path::{Path, PathBuf}; use std::process::Command; use std::time::{Duration, Instant}; @@ -135,8 +136,8 @@ impl Conf { pub fn pg_waldump( &self, - first_segment_name: &str, - last_segment_name: &str, + first_segment_name: &OsStr, + last_segment_name: &OsStr, ) -> anyhow::Result { let first_segment_file = self.datadir.join(first_segment_name); let last_segment_file = self.datadir.join(last_segment_name); diff --git a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs index 79d45de67ab4..9eb3f0e95abf 100644 --- a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs +++ 
b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs @@ -4,6 +4,7 @@ use super::*; use crate::{error, info}; use regex::Regex; use std::cmp::min; +use std::ffi::OsStr; use std::fs::{self, File}; use std::io::Write; use std::{env, str::FromStr}; @@ -54,7 +55,7 @@ fn test_end_of_wal(test_name: &str) { .wal_dir() .read_dir() .unwrap() - .map(|f| f.unwrap().file_name().into_string().unwrap()) + .map(|f| f.unwrap().file_name()) .filter(|fname| IsXLogFileName(fname)) .max() .unwrap(); @@ -70,11 +71,11 @@ fn test_end_of_wal(test_name: &str) { start_lsn ); for file in fs::read_dir(cfg.wal_dir()).unwrap().flatten() { - let fname = file.file_name().into_string().unwrap(); + let fname = file.file_name(); if !IsXLogFileName(&fname) { continue; } - let (segno, _) = XLogFromFileName(&fname, WAL_SEGMENT_SIZE); + let (segno, _) = XLogFromFileName(&fname, WAL_SEGMENT_SIZE).unwrap(); let seg_start_lsn = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE); if seg_start_lsn > u64::from(*start_lsn) { continue; @@ -93,10 +94,10 @@ fn test_end_of_wal(test_name: &str) { } } -fn find_pg_waldump_end_of_wal(cfg: &crate::Conf, last_segment: &str) -> Lsn { +fn find_pg_waldump_end_of_wal(cfg: &crate::Conf, last_segment: &OsStr) -> Lsn { // Get the actual end of WAL by pg_waldump let waldump_output = cfg - .pg_waldump("000000010000000000000001", last_segment) + .pg_waldump(OsStr::new("000000010000000000000001"), last_segment) .unwrap() .stderr; let waldump_output = std::str::from_utf8(&waldump_output).unwrap(); @@ -117,7 +118,7 @@ fn find_pg_waldump_end_of_wal(cfg: &crate::Conf, last_segment: &str) -> Lsn { fn check_end_of_wal( cfg: &crate::Conf, - last_segment: &str, + last_segment: &OsStr, start_lsn: Lsn, expected_end_of_wal: Lsn, ) { @@ -132,7 +133,8 @@ fn check_end_of_wal( // Rename file to partial to actually find last valid lsn, then rename it back. fs::rename( cfg.wal_dir().join(last_segment), - cfg.wal_dir().join(format!("{}.partial", last_segment)), + cfg.wal_dir() + .join(format!("{}.partial", last_segment.to_str().unwrap())), ) .unwrap(); let wal_end = find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, start_lsn).unwrap(); @@ -142,7 +144,8 @@ fn check_end_of_wal( ); assert_eq!(wal_end, expected_end_of_wal); fs::rename( - cfg.wal_dir().join(format!("{}.partial", last_segment)), + cfg.wal_dir() + .join(format!("{}.partial", last_segment.to_str().unwrap())), cfg.wal_dir().join(last_segment), ) .unwrap(); diff --git a/safekeeper/src/debug_dump.rs b/safekeeper/src/debug_dump.rs index 15b0272cd942..589536c7a861 100644 --- a/safekeeper/src/debug_dump.rs +++ b/safekeeper/src/debug_dump.rs @@ -17,6 +17,7 @@ use postgres_ffi::MAX_SEND_SIZE; use serde::Deserialize; use serde::Serialize; +use postgres_ffi::v14::xlog_utils::{IsPartialXLogFileName, IsXLogFileName}; use sha2::{Digest, Sha256}; use utils::id::NodeId; use utils::id::TenantTimelineId; @@ -51,6 +52,9 @@ pub struct Args { /// Dump full term history. True by default. pub dump_term_history: bool, + /// Dump last modified time of WAL segments. Uses value of `dump_all` by default. + pub dump_wal_last_modified: bool, + /// Filter timelines by tenant_id. 
pub tenant_id: Option<TenantId>,
 
@@ -128,12 +132,19 @@ async fn build_from_tli_dump(
         None
     };
 
+    let wal_last_modified = if args.dump_wal_last_modified {
+        get_wal_last_modified(timeline_dir).ok().flatten()
+    } else {
+        None
+    };
+
     Timeline {
         tenant_id: timeline.ttid.tenant_id,
         timeline_id: timeline.ttid.timeline_id,
         control_file,
         memory,
         disk_content,
+        wal_last_modified,
     }
 }
 
@@ -156,6 +167,7 @@ pub struct Timeline {
     pub control_file: Option<TimelinePersistentState>,
     pub memory: Option<Memory>,
     pub disk_content: Option<DiskContent>,
+    pub wal_last_modified: Option<DateTime<Utc>>,
 }
 
 #[derive(Debug, Serialize, Deserialize)]
@@ -302,6 +314,27 @@ fn build_file_info(entry: DirEntry) -> Result<FileInfo> {
     })
 }
 
+/// Get highest modified time of WAL segments in the directory.
+fn get_wal_last_modified(path: &Utf8Path) -> Result<Option<DateTime<Utc>>> {
+    let mut res = None;
+    for entry in fs::read_dir(path)? {
+        if entry.is_err() {
+            continue;
+        }
+        let entry = entry?;
+        /* Ignore files that are not XLOG segments */
+        let fname = entry.file_name();
+        if !IsXLogFileName(&fname) && !IsPartialXLogFileName(&fname) {
+            continue;
+        }
+
+        let metadata = entry.metadata()?;
+        let modified: DateTime<Utc> = DateTime::from(metadata.modified()?);
+        res = std::cmp::max(res, Some(modified));
+    }
+    Ok(res)
+}
+
 /// Converts SafeKeeperConf to Config, filtering out the fields that are not
 /// supposed to be exposed.
 fn build_config(config: SafeKeeperConf) -> Config {
diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs
index e482edea55a2..b4590fe3e5d6 100644
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -481,6 +481,7 @@ async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
     let mut dump_memory: Option<bool> = None;
     let mut dump_disk_content: Option<bool> = None;
     let mut dump_term_history: Option<bool> = None;
+    let mut dump_wal_last_modified: Option<bool> = None;
     let mut tenant_id: Option<TenantId> = None;
     let mut timeline_id: Option<TimelineId> = None;
 
@@ -494,6 +495,7 @@ async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
             "dump_memory" => dump_memory = Some(parse_kv_str(&k, &v)?),
             "dump_disk_content" => dump_disk_content = Some(parse_kv_str(&k, &v)?),
             "dump_term_history" => dump_term_history = Some(parse_kv_str(&k, &v)?),
+            "dump_wal_last_modified" => dump_wal_last_modified = Some(parse_kv_str(&k, &v)?),
             "tenant_id" => tenant_id = Some(parse_kv_str(&k, &v)?),
             "timeline_id" => timeline_id = Some(parse_kv_str(&k, &v)?),
             _ => Err(ApiError::BadRequest(anyhow::anyhow!(
@@ -508,6 +510,7 @@ async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
     let dump_memory = dump_memory.unwrap_or(dump_all);
     let dump_disk_content = dump_disk_content.unwrap_or(dump_all);
     let dump_term_history = dump_term_history.unwrap_or(true);
+    let dump_wal_last_modified = dump_wal_last_modified.unwrap_or(dump_all);
 
     let args = debug_dump::Args {
         dump_all,
@@ -515,6 +518,7 @@ async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
         dump_memory,
         dump_disk_content,
         dump_term_history,
+        dump_wal_last_modified,
         tenant_id,
         timeline_id,
     };
diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs
index 46c260901d59..6e7da9497309 100644
--- a/safekeeper/src/wal_storage.rs
+++ b/safekeeper/src/wal_storage.rs
@@ -539,20 +539,17 @@ async fn remove_segments_from_disk(
     while let Some(entry) = entries.next_entry().await? 
{ let entry_path = entry.path(); let fname = entry_path.file_name().unwrap(); - - if let Some(fname_str) = fname.to_str() { - /* Ignore files that are not XLOG segments */ - if !IsXLogFileName(fname_str) && !IsPartialXLogFileName(fname_str) { - continue; - } - let (segno, _) = XLogFromFileName(fname_str, wal_seg_size); - if remove_predicate(segno) { - remove_file(entry_path).await?; - n_removed += 1; - min_removed = min(min_removed, segno); - max_removed = max(max_removed, segno); - REMOVED_WAL_SEGMENTS.inc(); - } + /* Ignore files that are not XLOG segments */ + if !IsXLogFileName(fname) && !IsPartialXLogFileName(fname) { + continue; + } + let (segno, _) = XLogFromFileName(fname, wal_seg_size)?; + if remove_predicate(segno) { + remove_file(entry_path).await?; + n_removed += 1; + min_removed = min(min_removed, segno); + max_removed = max(max_removed, segno); + REMOVED_WAL_SEGMENTS.inc(); } } diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 4bf8cfe88f66..8ee548bdb031 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -892,6 +892,7 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): log.info(f"debug_dump before reboot {debug_dump_0}") assert debug_dump_0["timelines_count"] == 1 assert debug_dump_0["timelines"][0]["timeline_id"] == str(timeline_id) + assert debug_dump_0["timelines"][0]["wal_last_modified"] != "" endpoint.safe_psql("create table t(i int)") From 21eeafaaa58326bea8411b5b28db58fa0755e47e Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 19 Sep 2024 14:51:00 +0100 Subject: [PATCH 12/77] pageserver: simple fix for vectored read image layer skip (#9026) ## Problem Different keyspaces may require different floor LSNs in vectored delta layer visits. This patch adds support for such cases. ## Summary of changes Different keyspaces wishing to read the same layer might require different stop lsns (or lsn floor). The start LSN of the read (or the lsn ceil) will always be the same. With this observation, we fix skipping of image layers by indexing the fringe by layer id plus lsn floor. This is very simple, but means that we can visit delta layers twice in certain cases. Still, I think it's very unlikely for any extra merging to have taken place in this case, so perhaps it makes sense to go with the simpler patch. Fixes https://github.com/neondatabase/neon/issues/9012 Alternative to https://github.com/neondatabase/neon/pull/9025 --- pageserver/src/tenant.rs | 146 ++++++++++++++++++++++++- pageserver/src/tenant/storage_layer.rs | 52 +++++---- 2 files changed, 176 insertions(+), 22 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index c6f0e481017e..14cb6f508db2 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -4164,9 +4164,18 @@ pub(crate) mod harness { let records_neon = records.iter().all(|r| apply_neon::can_apply_in_neon(&r.1)); if records_neon { // For Neon wal records, we can decode without spawning postgres, so do so. 
- let base_img = base_img.expect("Neon WAL redo requires base image").1; - let mut page = BytesMut::new(); - page.extend_from_slice(&base_img); + let mut page = match (base_img, records.first()) { + (Some((_lsn, img)), _) => { + let mut page = BytesMut::new(); + page.extend_from_slice(&img); + page + } + (_, Some((_lsn, rec))) if rec.will_init() => BytesMut::new(), + _ => { + panic!("Neon WAL redo requires base image or will init record"); + } + }; + for (record_lsn, record) in records { apply_neon::apply_in_neon(&record, record_lsn, key, &mut page)?; } @@ -8470,4 +8479,135 @@ mod tests { Ok(()) } + + // Regression test for https://github.com/neondatabase/neon/issues/9012 + // Create an image arrangement where we have to read at different LSN ranges + // from a delta layer. This is achieved by overlapping an image layer on top of + // a delta layer. Like so: + // + // A B + // +----------------+ -> delta_layer + // | | ^ lsn + // | =========|-> nested_image_layer | + // | C | | + // +----------------+ | + // ======== -> baseline_image_layer +-------> key + // + // + // When querying the key range [A, B) we need to read at different LSN ranges + // for [A, C) and [C, B). This test checks that the described edge case is handled correctly. + #[tokio::test] + async fn test_vectored_read_with_nested_image_layer() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_vectored_read_with_nested_image_layer").await?; + let (tenant, ctx) = harness.load().await; + + let will_init_keys = [2, 6]; + fn get_key(id: u32) -> Key { + let mut key = Key::from_hex("110000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + + let mut expected_key_values = HashMap::new(); + + let baseline_image_layer_lsn = Lsn(0x10); + let mut baseline_img_layer = Vec::new(); + for i in 0..5 { + let key = get_key(i); + let value = format!("value {i}@{baseline_image_layer_lsn}"); + + let removed = expected_key_values.insert(key, value.clone()); + assert!(removed.is_none()); + + baseline_img_layer.push((key, Bytes::from(value))); + } + + let nested_image_layer_lsn = Lsn(0x50); + let mut nested_img_layer = Vec::new(); + for i in 5..10 { + let key = get_key(i); + let value = format!("value {i}@{nested_image_layer_lsn}"); + + let removed = expected_key_values.insert(key, value.clone()); + assert!(removed.is_none()); + + nested_img_layer.push((key, Bytes::from(value))); + } + + let mut delta_layer_spec = Vec::default(); + let delta_layer_start_lsn = Lsn(0x20); + let mut delta_layer_end_lsn = delta_layer_start_lsn; + + for i in 0..10 { + let key = get_key(i); + let key_in_nested = nested_img_layer + .iter() + .any(|(key_with_img, _)| *key_with_img == key); + let lsn = { + if key_in_nested { + Lsn(nested_image_layer_lsn.0 + 0x10) + } else { + delta_layer_start_lsn + } + }; + + let will_init = will_init_keys.contains(&i); + if will_init { + delta_layer_spec.push((key, lsn, Value::WalRecord(NeonWalRecord::wal_init()))); + + expected_key_values.insert(key, "".to_string()); + } else { + let delta = format!("@{lsn}"); + delta_layer_spec.push(( + key, + lsn, + Value::WalRecord(NeonWalRecord::wal_append(&delta)), + )); + + expected_key_values + .get_mut(&key) + .expect("An image exists for each key") + .push_str(delta.as_str()); + } + delta_layer_end_lsn = std::cmp::max(delta_layer_start_lsn, lsn); + } + + delta_layer_end_lsn = Lsn(delta_layer_end_lsn.0 + 1); + + assert!( + nested_image_layer_lsn > delta_layer_start_lsn + && nested_image_layer_lsn < delta_layer_end_lsn + ); + + let tline = tenant + 
.create_test_timeline_with_layers( + TIMELINE_ID, + baseline_image_layer_lsn, + DEFAULT_PG_VERSION, + &ctx, + vec![DeltaLayerTestDesc::new_with_inferred_key_range( + delta_layer_start_lsn..delta_layer_end_lsn, + delta_layer_spec, + )], // delta layers + vec![ + (baseline_image_layer_lsn, baseline_img_layer), + (nested_image_layer_lsn, nested_img_layer), + ], // image layers + delta_layer_end_lsn, + ) + .await?; + + let keyspace = KeySpace::single(get_key(0)..get_key(10)); + let results = tline + .get_vectored(keyspace, delta_layer_end_lsn, &ctx) + .await + .expect("No vectored errors"); + for (key, res) in results { + let value = res.expect("No key errors"); + let expected_value = expected_key_values.remove(&key).expect("No unknown keys"); + assert_eq!(value, Bytes::from(expected_value)); + } + + Ok(()) + } } diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index dac6b2f89386..cd252aa37132 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -276,6 +276,16 @@ pub(crate) enum LayerId { InMemoryLayerId(InMemoryLayerFileId), } +/// Uniquely identify a layer visit by the layer +/// and LSN floor (or start LSN) of the reads. +/// The layer itself is not enough since we may +/// have different LSN lower bounds for delta layer reads. +#[derive(Debug, PartialEq, Eq, Clone, Hash)] +struct LayerToVisitId { + layer_id: LayerId, + lsn_floor: Lsn, +} + /// Layer wrapper for the read path. Note that it is valid /// to use these layers even after external operations have /// been performed on them (compaction, freeze, etc.). @@ -287,9 +297,9 @@ pub(crate) enum ReadableLayer { /// A partial description of a read to be done. #[derive(Debug, Clone)] -struct ReadDesc { +struct LayerVisit { /// An id used to resolve the readable layer within the fringe - layer_id: LayerId, + layer_to_visit_id: LayerToVisitId, /// Lsn range for the read, used for selecting the next read lsn_range: Range, } @@ -303,12 +313,12 @@ struct ReadDesc { /// a two layer indexing scheme. 
#[derive(Debug)]
 pub(crate) struct LayerFringe {
-    planned_reads_by_lsn: BinaryHeap<ReadDesc>,
-    layers: HashMap<LayerId, LayerKeyspace>,
+    planned_visits_by_lsn: BinaryHeap<LayerVisit>,
+    visit_reads: HashMap<LayerToVisitId, LayerVisitReads>,
 }
 
 #[derive(Debug)]
-struct LayerKeyspace {
+struct LayerVisitReads {
     layer: ReadableLayer,
     target_keyspace: KeySpaceRandomAccum,
 }
@@ -316,23 +326,23 @@ struct LayerKeyspace {
 impl LayerFringe {
     pub(crate) fn new() -> Self {
         LayerFringe {
-            planned_reads_by_lsn: BinaryHeap::new(),
-            layers: HashMap::new(),
+            planned_visits_by_lsn: BinaryHeap::new(),
+            visit_reads: HashMap::new(),
         }
     }
 
     pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayer, KeySpace, Range<Lsn>)> {
-        let read_desc = match self.planned_reads_by_lsn.pop() {
+        let read_desc = match self.planned_visits_by_lsn.pop() {
             Some(desc) => desc,
             None => return None,
         };
 
-        let removed = self.layers.remove_entry(&read_desc.layer_id);
+        let removed = self.visit_reads.remove_entry(&read_desc.layer_to_visit_id);
 
         match removed {
             Some((
                 _,
-                LayerKeyspace {
+                LayerVisitReads {
                     layer,
                     mut target_keyspace,
                 },
@@ -351,20 +361,24 @@ impl LayerFringe {
         keyspace: KeySpace,
         lsn_range: Range<Lsn>,
     ) {
-        let layer_id = layer.id();
-        let entry = self.layers.entry(layer_id.clone());
+        let layer_to_visit_id = LayerToVisitId {
+            layer_id: layer.id(),
+            lsn_floor: lsn_range.start,
+        };
+
+        let entry = self.visit_reads.entry(layer_to_visit_id.clone());
         match entry {
             Entry::Occupied(mut entry) => {
                 entry.get_mut().target_keyspace.add_keyspace(keyspace);
             }
             Entry::Vacant(entry) => {
-                self.planned_reads_by_lsn.push(ReadDesc {
+                self.planned_visits_by_lsn.push(LayerVisit {
                     lsn_range,
-                    layer_id: layer_id.clone(),
+                    layer_to_visit_id: layer_to_visit_id.clone(),
                 });
                 let mut accum = KeySpaceRandomAccum::new();
                 accum.add_keyspace(keyspace);
-                entry.insert(LayerKeyspace {
+                entry.insert(LayerVisitReads {
                     layer,
                     target_keyspace: accum,
                 });
@@ -379,7 +393,7 @@ impl Default for LayerFringe {
     }
 }
 
-impl Ord for ReadDesc {
+impl Ord for LayerVisit {
     fn cmp(&self, other: &Self) -> Ordering {
         let ord = self.lsn_range.end.cmp(&other.lsn_range.end);
         if ord == std::cmp::Ordering::Equal {
@@ -390,19 +404,19 @@ impl Ord for ReadDesc {
     }
 }
 
-impl PartialOrd for ReadDesc {
+impl PartialOrd for LayerVisit {
     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
         Some(self.cmp(other))
     }
 }
 
-impl PartialEq for ReadDesc {
+impl PartialEq for LayerVisit {
     fn eq(&self, other: &Self) -> bool {
         self.lsn_range == other.lsn_range
     }
 }
 
-impl Eq for ReadDesc {}
+impl Eq for LayerVisit {}
 
 impl ReadableLayer {
     pub(crate) fn id(&self) -> LayerId {

From ff9f065c4386496193d62f1ff1fadd28cce92910 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Thu, 19 Sep 2024 10:43:12 -0400
Subject: [PATCH 13/77] impr(pageserver): log image layer creation (#9050)

https://github.com/neondatabase/neon/pull/9028 changed the image layer
creation log into trace level. However, I personally find logging image
layer creation useful when reading the logs -- it makes it clear that the
image layer creation is happening and gives a clear idea of the progress.
Therefore, I propose to continue logging them for create_image_layers set
of functions.

## Summary of changes

* Add info logging for all image layers created in legacy compaction.
* Add info logging for all layers creation in testing functions.
Signed-off-by: Alex Chi Z
---
 pageserver/src/tenant/timeline.rs | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index f66491d96293..a06cea2c6656 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -4015,6 +4015,7 @@ impl Timeline {
         // partition, so flush it to disk.
         let (desc, path) = image_layer_writer.finish(ctx).await?;
         let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
+        info!("created image layer for rel {}", image_layer.local_path());
         Ok(ImageLayerCreationOutcome {
             image: Some(image_layer),
             next_start_key: img_range.end,
@@ -4104,6 +4105,10 @@ impl Timeline {
         // partition, so flush it to disk.
         let (desc, path) = image_layer_writer.finish(ctx).await?;
         let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
+        info!(
+            "created image layer for metadata {}",
+            image_layer.local_path()
+        );
         Ok(ImageLayerCreationOutcome {
             image: Some(image_layer),
             next_start_key: img_range.end,
@@ -5407,7 +5412,7 @@ impl Timeline {
         }
         let (desc, path) = image_layer_writer.finish(ctx).await?;
         let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
-
+        info!("force created image layer {}", image_layer.local_path());
         {
             let mut guard = self.layers.write().await;
             guard.open_mut().unwrap().force_insert_layer(image_layer);
@@ -5486,7 +5491,7 @@ impl Timeline {
         }
         let (desc, path) = delta_layer_writer.finish(deltas.key_range.end, ctx).await?;
         let delta_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
-
+        info!("force created delta layer {}", delta_layer.local_path());
         {
             let mut guard = self.layers.write().await;
             guard.open_mut().unwrap().force_insert_layer(delta_layer);

From 0a1ca7670cbaddb2e83b4e41142b8a7f5fcf0aef Mon Sep 17 00:00:00 2001
From: Conrad Ludgate
Date: Thu, 19 Sep 2024 16:09:30 +0100
Subject: [PATCH 14/77] proxy: remove auth info from http conn info & fixup
 jwt api trait (#9047)

misc changes split out from #8855

- **allow cloning the request context in a read-only fashion for
  background tasks**
- **propagate endpoint and request context through the jwk cache**
- **only allow password based auth for md5 during testing**
- **remove auth info from conn info**
---
 proxy/src/auth/backend.rs             | 15 ++++------
 proxy/src/auth/backend/hacks.rs       | 14 ++++-----
 proxy/src/auth/backend/jwt.rs         | 43 ++++++++++++++++++++++-----
 proxy/src/auth/backend/local.rs       | 10 +++++--
 proxy/src/console/provider.rs         |  1 +
 proxy/src/context.rs                  | 34 +++++++++++++++++++++
 proxy/src/metrics.rs                  | 20 +++++++++++++
 proxy/src/serverless/backend.rs       |  9 +-----
 proxy/src/serverless/conn_pool.rs     |  9 ++++--
 proxy/src/serverless/sql_over_http.rs | 21 ++++++-------
 test_runner/fixtures/neon_fixtures.py |  3 ---
 11 files changed, 127 insertions(+), 52 deletions(-)

diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs
index 5561c9c56db7..5bc2f2ff65b4 100644
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -163,6 +163,7 @@ impl ComputeUserInfo {
 }
 
 pub(crate) enum ComputeCredentialKeys {
+    #[cfg(any(test, feature = "testing"))]
     Password(Vec<u8>),
     AuthKeys(AuthKeys),
     None,
}
@@ -293,16 +294,10 @@ async fn auth_quirks(
     // We now expect to see a very specific payload in the place of password.
let (info, unauthenticated_password) = match user_info.try_into() {
         Err(info) => {
-            let res = hacks::password_hack_no_authentication(ctx, info, client).await?;
-
-            ctx.set_endpoint_id(res.info.endpoint.clone());
-            let password = match res.keys {
-                ComputeCredentialKeys::Password(p) => p,
-                ComputeCredentialKeys::AuthKeys(_) | ComputeCredentialKeys::None => {
-                    unreachable!("password hack should return a password")
-                }
-            };
-            (res.info, Some(password))
+            let (info, password) =
+                hacks::password_hack_no_authentication(ctx, info, client).await?;
+            ctx.set_endpoint_id(info.endpoint.clone());
+            (info, Some(password))
         }
         Ok(info) => (info, None),
     };
diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs
index e9019ce2cf8c..15123a262337 100644
--- a/proxy/src/auth/backend/hacks.rs
+++ b/proxy/src/auth/backend/hacks.rs
@@ -1,6 +1,4 @@
-use super::{
-    ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo, ComputeUserInfoNoEndpoint,
-};
+use super::{ComputeCredentials, ComputeUserInfo, ComputeUserInfoNoEndpoint};
 use crate::{
     auth::{self, AuthFlow},
     config::AuthenticationConfig,
@@ -63,7 +61,7 @@ pub(crate) async fn password_hack_no_authentication(
     ctx: &RequestMonitoring,
     info: ComputeUserInfoNoEndpoint,
     client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
-) -> auth::Result<ComputeCredentials> {
+) -> auth::Result<(ComputeUserInfo, Vec<u8>)> {
     warn!("project not specified, resorting to the password hack auth flow");
     ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
 
@@ -79,12 +77,12 @@ pub(crate) async fn password_hack_no_authentication(
     info!(project = &*payload.endpoint, "received missing parameter");
 
     // Report tentative success; compute node will check the password anyway.
-    Ok(ComputeCredentials {
-        info: ComputeUserInfo {
+    Ok((
+        ComputeUserInfo {
             user: info.user,
             options: info.options,
             endpoint: payload.endpoint,
         },
-        keys: ComputeCredentialKeys::Password(payload.password),
-    })
+        payload.password,
+    ))
 }
diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs
index 1f44e4af5d67..94e5999a5f2f 100644
--- a/proxy/src/auth/backend/jwt.rs
+++ b/proxy/src/auth/backend/jwt.rs
@@ -25,6 +25,8 @@ const MAX_JWK_BODY_SIZE: usize = 64 * 1024;
 pub(crate) trait FetchAuthRules: Clone + Send + Sync + 'static {
     fn fetch_auth_rules(
         &self,
+        ctx: &RequestMonitoring,
+        endpoint: EndpointId,
         role_name: RoleName,
     ) -> impl Future<Output = anyhow::Result<Vec<AuthRule>>> + Send;
 }
@@ -101,7 +103,9 @@ impl JwkCacheEntryLock {
     async fn renew_jwks<F: FetchAuthRules>(
         &self,
         _permit: JwkRenewalPermit<'_>,
+        ctx: &RequestMonitoring,
         client: &reqwest::Client,
+        endpoint: EndpointId,
         role_name: RoleName,
         auth_rules: &F,
     ) -> anyhow::Result<Arc<JwkCacheEntry>> {
@@ -115,7 +119,9 @@ impl JwkCacheEntryLock {
         }
     }
 
-        let rules = auth_rules.fetch_auth_rules(role_name).await?;
+        let rules = auth_rules
+            .fetch_auth_rules(ctx, endpoint, role_name)
+            .await?;
         let mut key_sets =
             ahash::HashMap::with_capacity_and_hasher(rules.len(), ahash::RandomState::new());
         // TODO(conrad): run concurrently
@@ -166,6 +172,7 @@ impl JwkCacheEntryLock {
         self: &Arc<Self>,
         ctx: &RequestMonitoring,
         client: &reqwest::Client,
+        endpoint: EndpointId,
         role_name: RoleName,
         fetch: &F,
     ) -> Result<Arc<JwkCacheEntry>, anyhow::Error> {
@@ -176,7 +183,9 @@ impl JwkCacheEntryLock {
         let Some(cached) = guard else {
             let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
             let permit = self.acquire_permit().await;
-            return self.renew_jwks(permit, client, role_name, fetch).await;
+            return self
+                .renew_jwks(permit, ctx, client, endpoint, role_name, fetch)
+                .await;
         };
 
         let last_update = now.duration_since(cached.last_retrieved);
@@ -187,7 
+196,9 @@ impl JwkCacheEntryLock { let permit = self.acquire_permit().await; // it's been too long since we checked the keys. wait for them to update. - return self.renew_jwks(permit, client, role_name, fetch).await; + return self + .renew_jwks(permit, ctx, client, endpoint, role_name, fetch) + .await; } // every 5 minutes we should spawn a job to eagerly update the token. @@ -198,8 +209,12 @@ impl JwkCacheEntryLock { let entry = self.clone(); let client = client.clone(); let fetch = fetch.clone(); + let ctx = ctx.clone(); tokio::spawn(async move { - if let Err(e) = entry.renew_jwks(permit, &client, role_name, &fetch).await { + if let Err(e) = entry + .renew_jwks(permit, &ctx, &client, endpoint, role_name, &fetch) + .await + { tracing::warn!(error=?e, "could not fetch JWKs in background job"); } }); @@ -216,6 +231,7 @@ impl JwkCacheEntryLock { ctx: &RequestMonitoring, jwt: &str, client: &reqwest::Client, + endpoint: EndpointId, role_name: RoleName, fetch: &F, ) -> Result<(), anyhow::Error> { @@ -242,7 +258,7 @@ impl JwkCacheEntryLock { let kid = header.key_id.context("missing key id")?; let mut guard = self - .get_or_update_jwk_cache(ctx, client, role_name.clone(), fetch) + .get_or_update_jwk_cache(ctx, client, endpoint.clone(), role_name.clone(), fetch) .await?; // get the key from the JWKs if possible. If not, wait for the keys to update. @@ -254,7 +270,14 @@ impl JwkCacheEntryLock { let permit = self.acquire_permit().await; guard = self - .renew_jwks(permit, client, role_name.clone(), fetch) + .renew_jwks( + permit, + ctx, + client, + endpoint.clone(), + role_name.clone(), + fetch, + ) .await?; } _ => { @@ -318,7 +341,7 @@ impl JwkCache { jwt: &str, ) -> Result<(), anyhow::Error> { // try with just a read lock first - let key = (endpoint, role_name.clone()); + let key = (endpoint.clone(), role_name.clone()); let entry = self.map.get(&key).as_deref().map(Arc::clone); let entry = entry.unwrap_or_else(|| { // acquire a write lock after to insert. 
@@ -327,7 +350,7 @@ impl JwkCache {
         });
 
         entry
-            .check_jwt(ctx, jwt, &self.client, role_name, fetch)
+            .check_jwt(ctx, jwt, &self.client, endpoint, role_name, fetch)
             .await
     }
 }
@@ -688,6 +711,8 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
     impl FetchAuthRules for Fetch {
         async fn fetch_auth_rules(
             &self,
+            _ctx: &RequestMonitoring,
+            _endpoint: EndpointId,
             _role_name: RoleName,
         ) -> anyhow::Result<Vec<AuthRule>> {
             Ok(vec![
@@ -706,6 +731,7 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
         }
 
         let role_name = RoleName::from("user");
+        let endpoint = EndpointId::from("ep");
 
         let jwk_cache = Arc::new(JwkCacheEntryLock::default());
 
@@ -715,6 +741,7 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
                 &RequestMonitoring::test(),
                 &token,
                 &client,
+                endpoint.clone(),
                 role_name.clone(),
                 &Fetch(addr),
             )
diff --git a/proxy/src/auth/backend/local.rs b/proxy/src/auth/backend/local.rs
index 8124f568cf0b..2ff2ca00f091 100644
--- a/proxy/src/auth/backend/local.rs
+++ b/proxy/src/auth/backend/local.rs
@@ -9,8 +9,9 @@ use crate::{
         messages::{ColdStartInfo, EndpointJwksResponse, MetricsAuxInfo},
         NodeInfo,
     },
+    context::RequestMonitoring,
     intern::{BranchIdInt, BranchIdTag, EndpointIdTag, InternId, ProjectIdInt, ProjectIdTag},
-    RoleName,
+    EndpointId, RoleName,
 };
 
 use super::jwt::{AuthRule, FetchAuthRules, JwkCache};
@@ -57,7 +58,12 @@ pub struct JwksRoleSettings {
 }
 
 impl FetchAuthRules for StaticAuthRules {
-    async fn fetch_auth_rules(&self, role_name: RoleName) -> anyhow::Result<Vec<AuthRule>> {
+    async fn fetch_auth_rules(
+        &self,
+        _ctx: &RequestMonitoring,
+        _endpoint: EndpointId,
+        role_name: RoleName,
+    ) -> anyhow::Result<Vec<AuthRule>> {
         let mappings = JWKS_ROLE_MAP.load();
         let role_mappings = mappings
             .as_deref()
diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs
index 12a6e2f12a6c..16e8da605b7a 100644
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -303,6 +303,7 @@ impl NodeInfo {
 
     pub(crate) fn set_keys(&mut self, keys: &ComputeCredentialKeys) {
         match keys {
+            #[cfg(any(test, feature = "testing"))]
             ComputeCredentialKeys::Password(password) => self.config.password(password),
             ComputeCredentialKeys::AuthKeys(auth_keys) => self.config.auth_keys(*auth_keys),
             ComputeCredentialKeys::None => &mut self.config,
diff --git a/proxy/src/context.rs b/proxy/src/context.rs
index c013218ad91f..021659e175a1 100644
--- a/proxy/src/context.rs
+++ b/proxy/src/context.rs
@@ -79,6 +79,40 @@ pub(crate) enum AuthMethod {
     Cleartext,
 }
 
+impl Clone for RequestMonitoring {
+    fn clone(&self) -> Self {
+        let inner = self.0.try_lock().expect("should not deadlock");
+        let new = RequestMonitoringInner {
+            peer_addr: inner.peer_addr,
+            session_id: inner.session_id,
+            protocol: inner.protocol,
+            first_packet: inner.first_packet,
+            region: inner.region,
+            span: info_span!("background_task"),
+
+            project: inner.project,
+            branch: inner.branch,
+            endpoint_id: inner.endpoint_id.clone(),
+            dbname: inner.dbname.clone(),
+            user: inner.user.clone(),
+            application: inner.application.clone(),
+            error_kind: inner.error_kind,
+            auth_method: inner.auth_method.clone(),
+            success: inner.success,
+            rejected: inner.rejected,
+            cold_start_info: inner.cold_start_info,
+            pg_options: inner.pg_options.clone(),
+
+            sender: None,
+            disconnect_sender: None,
+            latency_timer: LatencyTimer::noop(inner.protocol),
+            disconnect_timestamp: inner.disconnect_timestamp,
+        };
+
+        Self(TryLock::new(new))
+    }
+}
+
 impl RequestMonitoring {
     pub fn new(
         session_id: Uuid,
diff --git a/proxy/src/metrics.rs 
b/proxy/src/metrics.rs index 2da7eac58060..c2567e083acd 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -397,6 +397,8 @@ pub struct LatencyTimer { protocol: Protocol, cold_start_info: ColdStartInfo, outcome: ConnectOutcome, + + skip_reporting: bool, } impl LatencyTimer { @@ -409,6 +411,20 @@ impl LatencyTimer { cold_start_info: ColdStartInfo::Unknown, // assume failed unless otherwise specified outcome: ConnectOutcome::Failed, + skip_reporting: false, + } + } + + pub(crate) fn noop(protocol: Protocol) -> Self { + Self { + start: time::Instant::now(), + stop: None, + accumulated: Accumulated::default(), + protocol, + cold_start_info: ColdStartInfo::Unknown, + // assume failed unless otherwise specified + outcome: ConnectOutcome::Failed, + skip_reporting: true, } } @@ -443,6 +459,10 @@ pub enum ConnectOutcome { impl Drop for LatencyTimer { fn drop(&mut self) { + if self.skip_reporting { + return; + } + let duration = self .stop .unwrap_or_else(time::Instant::now) diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index d163878528b3..aa236907db19 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -27,7 +27,7 @@ use crate::{ Host, }; -use super::conn_pool::{poll_client, AuthData, Client, ConnInfo, GlobalConnPool}; +use super::conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool}; pub(crate) struct PoolingBackend { pub(crate) pool: Arc>, @@ -274,13 +274,6 @@ impl ConnectMechanism for TokioMechanism { .dbname(&self.conn_info.dbname) .connect_timeout(timeout); - match &self.conn_info.auth { - AuthData::Jwt(_) => {} - AuthData::Password(pw) => { - config.password(pw); - } - } - let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let res = config.connect(tokio_postgres::NoTls).await; drop(pause); diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 6c32d5df0e89..a850ecd2be45 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -29,11 +29,16 @@ use tracing::{info, info_span, Instrument}; use super::backend::HttpConnError; +#[derive(Debug, Clone)] +pub(crate) struct ConnInfoWithAuth { + pub(crate) conn_info: ConnInfo, + pub(crate) auth: AuthData, +} + #[derive(Debug, Clone)] pub(crate) struct ConnInfo { pub(crate) user_info: ComputeUserInfo, pub(crate) dbname: DbName, - pub(crate) auth: AuthData, } #[derive(Debug, Clone)] @@ -787,7 +792,6 @@ mod tests { options: NeonOptions::default(), }, dbname: "dbname".into(), - auth: AuthData::Password("password".as_bytes().into()), }; let ep_pool = Arc::downgrade( &pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key().unwrap()), @@ -845,7 +849,6 @@ mod tests { options: NeonOptions::default(), }, dbname: "dbname".into(), - auth: AuthData::Password("password".as_bytes().into()), }; let ep_pool = Arc::downgrade( &pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key().unwrap()), diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 06e540d149ac..7c78439a0a0f 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -60,6 +60,7 @@ use super::backend::PoolingBackend; use super::conn_pool::AuthData; use super::conn_pool::Client; use super::conn_pool::ConnInfo; +use super::conn_pool::ConnInfoWithAuth; use super::http_util::json_response; use super::json::json_to_pg_text; use super::json::pg_text_row_to_json; @@ -148,7 +149,7 @@ fn get_conn_info( ctx: &RequestMonitoring, headers: &HeaderMap, tls: 
Option<&TlsConfig>,
-) -> Result {
+) -> Result {
     // HTTP only uses cleartext (for now and likely always)
     ctx.set_auth_method(crate::context::AuthMethod::Cleartext);

@@ -235,11 +236,8 @@ fn get_conn_info(
         options: options.unwrap_or_default(),
     };

-    Ok(ConnInfo {
-        user_info,
-        dbname,
-        auth,
-    })
+    let conn_info = ConnInfo { user_info, dbname };
+    Ok(ConnInfoWithAuth { conn_info, auth })
 }

 // TODO: return different http error codes
@@ -523,7 +521,10 @@ async fn handle_inner(
     // TLS config should be there.
     let conn_info = get_conn_info(ctx, headers, config.tls_config.as_ref())?;

-    info!(user = conn_info.user_info.user.as_str(), "credentials");
+    info!(
+        user = conn_info.conn_info.user_info.user.as_str(),
+        "credentials"
+    );

     // Allow connection pooling only if explicitly requested
     // or if we have decided that http pool is no longer opt-in
@@ -568,20 +569,20 @@ async fn handle_inner(
                 .authenticate_with_password(
                     ctx,
                     &config.authentication_config,
-                    &conn_info.user_info,
+                    &conn_info.conn_info.user_info,
                     pw,
                 )
                 .await?
         }
         AuthData::Jwt(jwt) => {
             backend
-                .authenticate_with_jwt(ctx, &conn_info.user_info, jwt)
+                .authenticate_with_jwt(ctx, &conn_info.conn_info.user_info, jwt)
                 .await?
         }
     };

     let client = backend
-        .connect_to_compute(ctx, conn_info, keys, !allow_pool)
+        .connect_to_compute(ctx, conn_info.conn_info, keys, !allow_pool)
         .await?;
     // not strictly necessary to mark success here,
     // but it's just insurance for if we forget it somewhere else
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index cbbb162cc64c..fc83cf3f7c6a 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -3863,9 +3863,6 @@ def static_proxy(
     dbname = vanilla_pg.default_options["dbname"]
     auth_endpoint = f"postgres://proxy:password@{host}:{port}/{dbname}"

-    # require password for 'http_auth' user
-    vanilla_pg.edit_hba([f"host {dbname} http_auth {host} password"])
-
     # For simplicity, we use the same user for both `--auth-endpoint` and `safe_psql`
     vanilla_pg.start()
     vanilla_pg.safe_psql("create user proxy with login superuser password 'password'")

From 1708743e786cde41eb1bba51d4e5267895d8227d Mon Sep 17 00:00:00 2001
From: Yuchen Liang <70461588+yliang412@users.noreply.github.com>
Date: Thu, 19 Sep 2024 12:27:10 -0400
Subject: [PATCH 15/77] pageserver: wait for lsn lease duration after
 transition into AttachedSingle (#9024)

Part of #7497, closes https://github.com/neondatabase/neon/issues/8890.

## Problem

Since leases are in-memory objects, we need to take special care of
them after pageserver restarts and while doing a live migration.

The approach we took for pageserver restart is to wait for at least the
lease duration before doing the first GC. We want to do the same for
live migration.

Since we do not do any GC when a tenant is in `AttachedStale` or
`AttachedMulti` mode, only the transition from `AttachedMulti` to
`AttachedSingle` requires this treatment.

## Summary of changes

- Added an `lsn_lease_deadline` field in `GcBlock::reasons`: the tenant is
temporarily blocked from GC until we reach the deadline. This
information does not persist to S3.
- In `GcBlock::start`, skip the GC iteration if we are blocked by the
lsn lease deadline.
- In `TenantManager::upsert_location`, set the lsn_lease_deadline to
`Instant::now() + lsn_lease_length` so the granted leases have a chance
to be renewed before we run GC for the first time after transitioning
from AttachedMulti to AttachedSingle.
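
As a minimal illustration of the mechanism (a sketch with illustrative
names, not the exact implementation; the real code is in the `GcBlock`
diff below):

```rust
use std::time::{Duration, Instant};

/// Sketch: GC stays transiently blocked until the lease deadline passes.
struct LeaseGcGate {
    lsn_lease_deadline: Option<Instant>,
}

impl LeaseGcGate {
    /// On restart or on the AttachedMulti -> AttachedSingle transition,
    /// push the deadline out by one lease length so every previously
    /// granted lease can be renewed before the first GC run.
    fn set_lsn_lease_deadline(&mut self, lsn_lease_length: Duration) {
        self.lsn_lease_deadline = Some(Instant::now() + lsn_lease_length);
    }

    /// GC iterations are skipped while the deadline lies in the future.
    fn is_blocked_by_lsn_lease_deadline(&self) -> bool {
        self.lsn_lease_deadline
            .map(|deadline| Instant::now() < deadline)
            .unwrap_or(false)
    }
}

fn main() {
    let mut gate = LeaseGcGate { lsn_lease_deadline: None };
    assert!(!gate.is_blocked_by_lsn_lease_deadline());
    gate.set_lsn_lease_deadline(Duration::from_secs(600));
    assert!(gate.is_blocked_by_lsn_lease_deadline()); // blocked for the next 10 minutes
}
```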
Signed-off-by: Yuchen Liang Co-authored-by: Joonas Koivunen --- pageserver/src/tenant/gc_block.rs | 81 ++++++++++++++----- pageserver/src/tenant/mgr.rs | 6 ++ pageserver/src/tenant/tasks.rs | 18 +---- test_runner/regress/test_branch_and_gc.py | 1 + test_runner/regress/test_branch_behind.py | 4 +- test_runner/regress/test_branching.py | 2 +- test_runner/regress/test_compaction.py | 1 + test_runner/regress/test_hot_standby.py | 2 +- test_runner/regress/test_layer_eviction.py | 1 + .../regress/test_pageserver_generations.py | 1 + test_runner/regress/test_remote_storage.py | 2 + test_runner/regress/test_sharding.py | 1 + .../regress/test_storage_controller.py | 2 +- test_runner/regress/test_storage_scrubber.py | 1 + test_runner/regress/test_tenant_detach.py | 4 +- .../regress/test_timeline_gc_blocking.py | 5 +- 16 files changed, 90 insertions(+), 42 deletions(-) diff --git a/pageserver/src/tenant/gc_block.rs b/pageserver/src/tenant/gc_block.rs index 8b41ba174669..1271d25b7659 100644 --- a/pageserver/src/tenant/gc_block.rs +++ b/pageserver/src/tenant/gc_block.rs @@ -1,11 +1,29 @@ -use std::collections::HashMap; +use std::{collections::HashMap, time::Duration}; +use super::remote_timeline_client::index::GcBlockingReason; +use tokio::time::Instant; use utils::id::TimelineId; -use super::remote_timeline_client::index::GcBlockingReason; +type TimelinesBlocked = HashMap>; -type Storage = HashMap>; +#[derive(Default)] +struct Storage { + timelines_blocked: TimelinesBlocked, + /// The deadline before which we are blocked from GC so that + /// leases have a chance to be renewed. + lsn_lease_deadline: Option, +} + +impl Storage { + fn is_blocked_by_lsn_lease_deadline(&self) -> bool { + self.lsn_lease_deadline + .map(|d| Instant::now() < d) + .unwrap_or(false) + } +} +/// GcBlock provides persistent (per-timeline) gc blocking and facilitates transient time based gc +/// blocking. #[derive(Default)] pub(crate) struct GcBlock { /// The timelines which have current reasons to block gc. @@ -13,6 +31,12 @@ pub(crate) struct GcBlock { /// LOCK ORDER: this is held locked while scheduling the next index_part update. This is done /// to keep the this field up to date with RemoteTimelineClient `upload_queue.dirty`. reasons: std::sync::Mutex, + + /// GC background task or manually run `Tenant::gc_iteration` holds a lock on this. + /// + /// Do not add any more features taking and forbidding taking this lock. It should be + /// `tokio::sync::Notify`, but that is rarely used. On the other side, [`GcBlock::insert`] + /// synchronizes with gc attempts by locking and unlocking this mutex. blocking: tokio::sync::Mutex<()>, } @@ -42,6 +66,20 @@ impl GcBlock { } } + /// Sets a deadline before which we cannot proceed to GC due to lsn lease. + /// + /// We do this as the leases mapping are not persisted to disk. By delaying GC by lease + /// length, we guarantee that all the leases we granted before will have a chance to renew + /// when we run GC for the first time after restart / transition from AttachedMulti to AttachedSingle. + pub(super) fn set_lsn_lease_deadline(&self, lsn_lease_length: Duration) { + let deadline = Instant::now() + lsn_lease_length; + let mut g = self.reasons.lock().unwrap(); + g.lsn_lease_deadline = Some(deadline); + } + + /// Describe the current gc blocking reasons. + /// + /// TODO: make this json serializable. 
pub(crate) fn summary(&self) -> Option { let g = self.reasons.lock().unwrap(); @@ -64,7 +102,7 @@ impl GcBlock { ) -> anyhow::Result { let (added, uploaded) = { let mut g = self.reasons.lock().unwrap(); - let set = g.entry(timeline.timeline_id).or_default(); + let set = g.timelines_blocked.entry(timeline.timeline_id).or_default(); let added = set.insert(reason); // LOCK ORDER: intentionally hold the lock, see self.reasons. @@ -95,7 +133,7 @@ impl GcBlock { let (remaining_blocks, uploaded) = { let mut g = self.reasons.lock().unwrap(); - match g.entry(timeline.timeline_id) { + match g.timelines_blocked.entry(timeline.timeline_id) { Entry::Occupied(mut oe) => { let set = oe.get_mut(); set.remove(reason); @@ -109,7 +147,7 @@ impl GcBlock { } } - let remaining_blocks = g.len(); + let remaining_blocks = g.timelines_blocked.len(); // LOCK ORDER: intentionally hold the lock while scheduling; see self.reasons let uploaded = timeline @@ -134,11 +172,11 @@ impl GcBlock { pub(crate) fn before_delete(&self, timeline: &super::Timeline) { let unblocked = { let mut g = self.reasons.lock().unwrap(); - if g.is_empty() { + if g.timelines_blocked.is_empty() { return; } - g.remove(&timeline.timeline_id); + g.timelines_blocked.remove(&timeline.timeline_id); BlockingReasons::clean_and_summarize(g).is_none() }; @@ -149,10 +187,11 @@ impl GcBlock { } /// Initialize with the non-deleted timelines of this tenant. - pub(crate) fn set_scanned(&self, scanned: Storage) { + pub(crate) fn set_scanned(&self, scanned: TimelinesBlocked) { let mut g = self.reasons.lock().unwrap(); - assert!(g.is_empty()); - g.extend(scanned.into_iter().filter(|(_, v)| !v.is_empty())); + assert!(g.timelines_blocked.is_empty()); + g.timelines_blocked + .extend(scanned.into_iter().filter(|(_, v)| !v.is_empty())); if let Some(reasons) = BlockingReasons::clean_and_summarize(g) { tracing::info!(summary=?reasons, "initialized with gc blocked"); @@ -166,6 +205,7 @@ pub(super) struct Guard<'a> { #[derive(Debug)] pub(crate) struct BlockingReasons { + tenant_blocked_by_lsn_lease_deadline: bool, timelines: usize, reasons: enumset::EnumSet, } @@ -174,8 +214,8 @@ impl std::fmt::Display for BlockingReasons { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( f, - "{} timelines block for {:?}", - self.timelines, self.reasons + "tenant_blocked_by_lsn_lease_deadline: {}, {} timelines block for {:?}", + self.tenant_blocked_by_lsn_lease_deadline, self.timelines, self.reasons ) } } @@ -183,13 +223,15 @@ impl std::fmt::Display for BlockingReasons { impl BlockingReasons { fn clean_and_summarize(mut g: std::sync::MutexGuard<'_, Storage>) -> Option { let mut reasons = enumset::EnumSet::empty(); - g.retain(|_key, value| { + g.timelines_blocked.retain(|_key, value| { reasons = reasons.union(*value); !value.is_empty() }); - if !g.is_empty() { + let blocked_by_lsn_lease_deadline = g.is_blocked_by_lsn_lease_deadline(); + if !g.timelines_blocked.is_empty() || blocked_by_lsn_lease_deadline { Some(BlockingReasons { - timelines: g.len(), + tenant_blocked_by_lsn_lease_deadline: blocked_by_lsn_lease_deadline, + timelines: g.timelines_blocked.len(), reasons, }) } else { @@ -198,14 +240,17 @@ impl BlockingReasons { } fn summarize(g: &std::sync::MutexGuard<'_, Storage>) -> Option { - if g.is_empty() { + let blocked_by_lsn_lease_deadline = g.is_blocked_by_lsn_lease_deadline(); + if g.timelines_blocked.is_empty() && !blocked_by_lsn_lease_deadline { None } else { let reasons = g + .timelines_blocked .values() .fold(enumset::EnumSet::empty(), |acc, next| 
acc.union(*next)); Some(BlockingReasons { - timelines: g.len(), + tenant_blocked_by_lsn_lease_deadline: blocked_by_lsn_lease_deadline, + timelines: g.timelines_blocked.len(), reasons, }) } diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 2104f415319e..1e7c1e10a5e7 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -949,6 +949,12 @@ impl TenantManager { (LocationMode::Attached(attach_conf), Some(TenantSlot::Attached(tenant))) => { match attach_conf.generation.cmp(&tenant.generation) { Ordering::Equal => { + if attach_conf.attach_mode == AttachmentMode::Single { + tenant + .gc_block + .set_lsn_lease_deadline(tenant.get_lsn_lease_length()); + } + // A transition from Attached to Attached in the same generation, we may // take our fast path and just provide the updated configuration // to the tenant. diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 478e9bb4f074..57f0123d8fa3 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -346,6 +346,7 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download); let mut first = true; + tenant.gc_block.set_lsn_lease_deadline(tenant.get_lsn_lease_length()); loop { tokio::select! { _ = cancel.cancelled() => { @@ -363,7 +364,6 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { first = false; let delays = async { - delay_by_lease_length(tenant.get_lsn_lease_length(), &cancel).await?; random_init_delay(period, &cancel).await?; Ok::<_, Cancelled>(()) }; @@ -538,28 +538,12 @@ pub(crate) async fn random_init_delay( let mut rng = rand::thread_rng(); rng.gen_range(Duration::ZERO..=period) }; - match tokio::time::timeout(d, cancel.cancelled()).await { Ok(_) => Err(Cancelled), Err(_) => Ok(()), } } -/// Delays GC by defaul lease length at restart. -/// -/// We do this as the leases mapping are not persisted to disk. By delaying GC by default -/// length, we gurantees that all the leases we granted before the restart will expire -/// when we run GC for the first time after the restart. 
-pub(crate) async fn delay_by_lease_length( - length: Duration, - cancel: &CancellationToken, -) -> Result<(), Cancelled> { - match tokio::time::timeout(length, cancel.cancelled()).await { - Ok(_) => Err(Cancelled), - Err(_) => Ok(()), - } -} - struct Iteration { started_at: Instant, period: Duration, diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py index f2e3855c123e..d7c4cf059a4e 100644 --- a/test_runner/regress/test_branch_and_gc.py +++ b/test_runner/regress/test_branch_and_gc.py @@ -142,6 +142,7 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv): "image_creation_threshold": "1", # set PITR interval to be small, so we can do GC "pitr_interval": "0 s", + "lsn_lease_length": "0s", } ) diff --git a/test_runner/regress/test_branch_behind.py b/test_runner/regress/test_branch_behind.py index 0a5336f5a246..2bf7041cf14b 100644 --- a/test_runner/regress/test_branch_behind.py +++ b/test_runner/regress/test_branch_behind.py @@ -11,7 +11,9 @@ # def test_branch_behind(neon_env_builder: NeonEnvBuilder): # Disable pitr, because here we want to test branch creation after GC - env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"}) + env = neon_env_builder.init_start( + initial_tenant_conf={"pitr_interval": "0 sec", "lsn_lease_length": "0s"} + ) error_regexes = [ ".*invalid branch start lsn.*", diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py index 1729e2fc9887..3d5c34a5958b 100644 --- a/test_runner/regress/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -419,7 +419,7 @@ def start_creating_timeline(): def test_branching_while_stuck_find_gc_cutoffs(neon_env_builder: NeonEnvBuilder): - env = neon_env_builder.init_start() + env = neon_env_builder.init_start(initial_tenant_conf={"lsn_lease_length": "0s"}) client = env.pageserver.http_client() diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index be787e064262..cb34551b53fc 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -240,6 +240,7 @@ def test_uploads_and_deletions( "image_creation_threshold": "1", "image_layer_creation_check_threshold": "0", "compaction_algorithm": json.dumps({"kind": compaction_algorithm.value}), + "lsn_lease_length": "0s", } env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf) diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py index d94704012fad..35e0c0decb26 100644 --- a/test_runner/regress/test_hot_standby.py +++ b/test_runner/regress/test_hot_standby.py @@ -222,7 +222,7 @@ def pgbench_accounts_initialized(ep): # Without hs feedback enabled we'd see 'User query might have needed to see row # versions that must be removed.' errors. 
def test_hot_standby_feedback(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): - env = neon_env_builder.init_start() + env = neon_env_builder.init_start(initial_tenant_conf={"lsn_lease_length": "0s"}) agressive_vacuum_conf = [ "log_autovacuum_min_duration = 0", "autovacuum_naptime = 10s", diff --git a/test_runner/regress/test_layer_eviction.py b/test_runner/regress/test_layer_eviction.py index 193149ea0388..97093ea535c2 100644 --- a/test_runner/regress/test_layer_eviction.py +++ b/test_runner/regress/test_layer_eviction.py @@ -173,6 +173,7 @@ def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder): # "image_creation_threshold": set at runtime "compaction_target_size": f"{128 * (1024**2)}", # make it so that we only have 1 partition => image coverage for delta layers => enables gc of delta layers "image_layer_creation_check_threshold": "0", # always check if a new image layer can be created + "lsn_lease_length": "0s", } def tenant_update_config(changes): diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index c92371343233..519994f77446 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -53,6 +53,7 @@ # create image layers eagerly, so that GC can remove some layers "image_creation_threshold": "1", "image_layer_creation_check_threshold": "0", + "lsn_lease_length": "0s", } diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 2e5260ca781a..0a57fc960563 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -244,6 +244,7 @@ def test_remote_storage_upload_queue_retries( # create image layers eagerly, so that GC can remove some layers "image_creation_threshold": "1", "image_layer_creation_check_threshold": "0", + "lsn_lease_length": "0s", } ) @@ -391,6 +392,7 @@ def test_remote_timeline_client_calls_started_metric( # disable background compaction and GC. We invoke it manually when we want it to happen. 
"gc_period": "0s", "compaction_period": "0s", + "lsn_lease_length": "0s", } ) diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 4a84dca399a3..1eb33b2d39ca 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -200,6 +200,7 @@ def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder, failpoint: # Disable automatic creation of image layers, as we will create them explicitly when we want them "image_creation_threshold": 9999, "image_layer_creation_check_threshold": 0, + "lsn_lease_length": "0s", } neon_env_builder.storage_controller_config = { diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index dc90a6e9a087..4106efd4f9cc 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -485,7 +485,7 @@ def handler(request: Request): httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler) # Start running - env = neon_env_builder.init_start() + env = neon_env_builder.init_start(initial_tenant_conf={"lsn_lease_length": "0s"}) # Initial notification from tenant creation assert len(notifications) == 1 diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index 848e214c5e46..b6c19f03f6ab 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -204,6 +204,7 @@ def test_scrubber_physical_gc_ancestors( # No PITR, so that as soon as child shards generate an image layer, it covers ancestor deltas # and makes them GC'able "pitr_interval": "0s", + "lsn_lease_length": "0s", }, ) diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index b165588636c7..e7c6d5a4c382 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -266,13 +266,13 @@ async def reattach_while_busy( def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): - env = neon_env_builder.init_start() + env = neon_env_builder.init_start(initial_tenant_conf={"lsn_lease_length": "0s"}) pageserver_http = env.pageserver.http_client() env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS) # create new nenant - tenant_id, timeline_id = env.neon_cli.create_tenant() + tenant_id, timeline_id = env.initial_tenant, env.initial_timeline # assert tenant exists on disk assert env.pageserver.tenant_dir(tenant_id).exists() diff --git a/test_runner/regress/test_timeline_gc_blocking.py b/test_runner/regress/test_timeline_gc_blocking.py index ddfe9b911fd8..765c72cf2a2f 100644 --- a/test_runner/regress/test_timeline_gc_blocking.py +++ b/test_runner/regress/test_timeline_gc_blocking.py @@ -45,7 +45,10 @@ def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder, sharded: bool tenant_after = http.tenant_status(env.initial_tenant) assert tenant_before != tenant_after gc_blocking = tenant_after["gc_blocking"] - assert gc_blocking == "BlockingReasons { timelines: 1, reasons: EnumSet(Manual) }" + assert ( + gc_blocking + == "BlockingReasons { tenant_blocked_by_lsn_lease_deadline: false, timelines: 1, reasons: EnumSet(Manual) }" + ) wait_for_another_gc_round() pss.assert_log_contains(gc_skipped_line) From d0cbfda15c916ee75066919940d0c3da714c5b95 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." 
<4198311+skyzh@users.noreply.github.com> Date: Thu, 19 Sep 2024 16:29:28 -0400 Subject: [PATCH 16/77] refactor(pageserver): check layer map valid in one place (#9051) We have 3 places where we implement layer map checks. ## Summary of changes Now we have a single check function being called in all places. --------- Signed-off-by: Alex Chi Z --- pageserver/src/tenant.rs | 31 +++++++++++ pageserver/src/tenant/checks.rs | 55 +++++++++++++++++++ pageserver/src/tenant/timeline.rs | 33 ++---------- pageserver/src/tenant/timeline/compaction.rs | 21 +++----- storage_scrubber/src/checks.rs | 56 ++------------------ 5 files changed, 101 insertions(+), 95 deletions(-) create mode 100644 pageserver/src/tenant/checks.rs diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 14cb6f508db2..d699d560751e 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -140,6 +140,7 @@ pub mod metadata; pub mod remote_timeline_client; pub mod storage_layer; +pub mod checks; pub mod config; pub mod mgr; pub mod secondary; @@ -1573,6 +1574,9 @@ impl Tenant { image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>, end_lsn: Lsn, ) -> anyhow::Result> { + use checks::check_valid_layermap; + use itertools::Itertools; + let tline = self .create_test_timeline(new_timeline_id, initdb_lsn, pg_version, ctx) .await?; @@ -1587,6 +1591,18 @@ impl Tenant { .force_create_image_layer(lsn, images, Some(initdb_lsn), ctx) .await?; } + let layer_names = tline + .layers + .read() + .await + .layer_map() + .unwrap() + .iter_historic_layers() + .map(|layer| layer.layer_name()) + .collect_vec(); + if let Some(err) = check_valid_layermap(&layer_names) { + bail!("invalid layermap: {err}"); + } Ok(tline) } @@ -3197,6 +3213,9 @@ impl Tenant { image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>, end_lsn: Lsn, ) -> anyhow::Result> { + use checks::check_valid_layermap; + use itertools::Itertools; + let tline = self .branch_timeline_test(src_timeline, dst_id, ancestor_lsn, ctx) .await?; @@ -3217,6 +3236,18 @@ impl Tenant { .force_create_image_layer(lsn, images, Some(ancestor_lsn), ctx) .await?; } + let layer_names = tline + .layers + .read() + .await + .layer_map() + .unwrap() + .iter_historic_layers() + .map(|layer| layer.layer_name()) + .collect_vec(); + if let Some(err) = check_valid_layermap(&layer_names) { + bail!("invalid layermap: {err}"); + } Ok(tline) } diff --git a/pageserver/src/tenant/checks.rs b/pageserver/src/tenant/checks.rs new file mode 100644 index 000000000000..8eaa8a001c48 --- /dev/null +++ b/pageserver/src/tenant/checks.rs @@ -0,0 +1,55 @@ +use std::collections::BTreeSet; + +use itertools::Itertools; + +use super::storage_layer::LayerName; + +/// Checks whether a layer map is valid (i.e., is a valid result of the current compaction algorithm if nothing goes wrong). +/// The function checks if we can split the LSN range of a delta layer only at the LSNs of the delta layers. For example, +/// +/// ```plain +/// | | | | +/// | 1 | | 2 | | 3 | +/// | | | | | | +/// ``` +/// +/// This is not a valid layer map because the LSN range of layer 1 intersects with the LSN range of layer 2. 1 and 2 should have +/// the same LSN range. +/// +/// The exception is that when layer 2 only contains a single key, it could be split over the LSN range. For example, +/// +/// ```plain +/// | | | 2 | | | +/// | 1 | |-------| | 3 | +/// | | | 4 | | | +/// +/// If layer 2 and 4 contain the same single key, this is also a valid layer map. 
+pub fn check_valid_layermap(metadata: &[LayerName]) -> Option { + let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?) + let mut all_delta_layers = Vec::new(); + for name in metadata { + if let LayerName::Delta(layer) = name { + if layer.key_range.start.next() != layer.key_range.end { + all_delta_layers.push(layer.clone()); + } + } + } + for layer in &all_delta_layers { + let lsn_range = &layer.lsn_range; + lsn_split_point.insert(lsn_range.start); + lsn_split_point.insert(lsn_range.end); + } + for layer in &all_delta_layers { + let lsn_range = layer.lsn_range.clone(); + let intersects = lsn_split_point.range(lsn_range).collect_vec(); + if intersects.len() > 1 { + let err = format!( + "layer violates the layer map LSN split assumption: layer {} intersects with LSN [{}]", + layer, + intersects.into_iter().map(|lsn| lsn.to_string()).join(", ") + ); + return Some(err); + } + } + None +} diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index a06cea2c6656..f08f5caf95f7 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -5378,7 +5378,8 @@ impl Timeline { /// Force create an image layer and place it into the layer map. /// /// DO NOT use this function directly. Use [`Tenant::branch_timeline_test_with_layers`] - /// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are placed into the layer map in one run. + /// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are + /// placed into the layer map in one run AND be validated. #[cfg(test)] pub(super) async fn force_create_image_layer( self: &Arc, @@ -5424,7 +5425,8 @@ impl Timeline { /// Force create a delta layer and place it into the layer map. /// /// DO NOT use this function directly. Use [`Tenant::branch_timeline_test_with_layers`] - /// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are placed into the layer map in one run. + /// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are + /// placed into the layer map in one run AND be validated. #[cfg(test)] pub(super) async fn force_create_delta_layer( self: &Arc, @@ -5450,33 +5452,6 @@ impl Timeline { if let Some(check_start_lsn) = check_start_lsn { assert!(deltas.lsn_range.start >= check_start_lsn); } - // check if the delta layer does not violate the LSN invariant, the legacy compaction should always produce a batch of - // layers of the same start/end LSN, and so should the force inserted layer - { - /// Checks if a overlaps with b, assume a/b = [start, end). 
- pub fn overlaps_with(a: &Range, b: &Range) -> bool { - !(a.end <= b.start || b.end <= a.start) - } - - if deltas.key_range.start.next() != deltas.key_range.end { - let guard = self.layers.read().await; - let mut invalid_layers = - guard.layer_map()?.iter_historic_layers().filter(|layer| { - layer.is_delta() - && overlaps_with(&layer.lsn_range, &deltas.lsn_range) - && layer.lsn_range != deltas.lsn_range - // skip single-key layer files - && layer.key_range.start.next() != layer.key_range.end - }); - if let Some(layer) = invalid_layers.next() { - // If a delta layer overlaps with another delta layer AND their LSN range is not the same, panic - panic!( - "inserted layer violates delta layer LSN invariant: current_lsn_range={}..{}, conflict_lsn_range={}..{}", - deltas.lsn_range.start, deltas.lsn_range.end, layer.lsn_range.start, layer.lsn_range.end - ); - } - } - } let mut delta_layer_writer = DeltaLayerWriter::new( self.conf, self.timeline_id, diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index d1f06e3480d7..d1567b6b39f3 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -29,6 +29,7 @@ use utils::id::TimelineId; use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; use crate::page_cache; +use crate::tenant::checks::check_valid_layermap; use crate::tenant::remote_timeline_client::WaitCompletionError; use crate::tenant::storage_layer::merge_iterator::MergeIterator; use crate::tenant::storage_layer::split_writer::{ @@ -1788,20 +1789,12 @@ impl Timeline { stat.visit_image_layer(desc.file_size()); } } - for layer in &layer_selection { - let desc = layer.layer_desc(); - let key_range = &desc.key_range; - if desc.is_delta() && key_range.start.next() != key_range.end { - let lsn_range = desc.lsn_range.clone(); - let intersects = lsn_split_point.range(lsn_range).collect_vec(); - if intersects.len() > 1 { - bail!( - "cannot run gc-compaction because it violates the layer map LSN split assumption: layer {} intersects with LSN [{}]", - desc.key(), - intersects.into_iter().map(|lsn| lsn.to_string()).join(", ") - ); - } - } + let layer_names: Vec = layer_selection + .iter() + .map(|layer| layer.layer_desc().layer_name()) + .collect_vec(); + if let Some(err) = check_valid_layermap(&layer_names) { + bail!("cannot run gc-compaction because {}", err); } // The maximum LSN we are processing in this compaction loop let end_lsn = layer_selection diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index 15dfb101b53e..de6918b3da26 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -1,7 +1,8 @@ -use std::collections::{BTreeSet, HashMap, HashSet}; +use std::collections::{HashMap, HashSet}; use anyhow::Context; use itertools::Itertools; +use pageserver::tenant::checks::check_valid_layermap; use pageserver::tenant::layer_map::LayerMap; use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; use pageserver_api::shard::ShardIndex; @@ -48,56 +49,6 @@ impl TimelineAnalysis { } } -/// Checks whether a layer map is valid (i.e., is a valid result of the current compaction algorithm if nothing goes wrong). -/// The function checks if we can split the LSN range of a delta layer only at the LSNs of the delta layers. 
For example, -/// -/// ```plain -/// | | | | -/// | 1 | | 2 | | 3 | -/// | | | | | | -/// ``` -/// -/// This is not a valid layer map because the LSN range of layer 1 intersects with the LSN range of layer 2. 1 and 2 should have -/// the same LSN range. -/// -/// The exception is that when layer 2 only contains a single key, it could be split over the LSN range. For example, -/// -/// ```plain -/// | | | 2 | | | -/// | 1 | |-------| | 3 | -/// | | | 4 | | | -/// -/// If layer 2 and 4 contain the same single key, this is also a valid layer map. -fn check_valid_layermap(metadata: &HashMap) -> Option { - let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?) - let mut all_delta_layers = Vec::new(); - for (name, _) in metadata.iter() { - if let LayerName::Delta(layer) = name { - if layer.key_range.start.next() != layer.key_range.end { - all_delta_layers.push(layer.clone()); - } - } - } - for layer in &all_delta_layers { - let lsn_range = &layer.lsn_range; - lsn_split_point.insert(lsn_range.start); - lsn_split_point.insert(lsn_range.end); - } - for layer in &all_delta_layers { - let lsn_range = layer.lsn_range.clone(); - let intersects = lsn_split_point.range(lsn_range).collect_vec(); - if intersects.len() > 1 { - let err = format!( - "layer violates the layer map LSN split assumption: layer {} intersects with LSN [{}]", - layer, - intersects.into_iter().map(|lsn| lsn.to_string()).join(", ") - ); - return Some(err); - } - } - None -} - pub(crate) async fn branch_cleanup_and_check_errors( remote_client: &GenericRemoteStorage, id: &TenantShardTimelineId, @@ -177,7 +128,8 @@ pub(crate) async fn branch_cleanup_and_check_errors( } } - if let Some(err) = check_valid_layermap(&index_part.layer_metadata) { + let layer_names = index_part.layer_metadata.keys().cloned().collect_vec(); + if let Some(err) = check_valid_layermap(&layer_names) { result.errors.push(format!( "index_part.json contains invalid layer map structure: {err}" )); From f2c08195f068ec4445347641cdde67eb170e60cc Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 17 Sep 2024 11:32:49 +0300 Subject: [PATCH 17/77] Bump vendor/postgres. Includes PRs: - ERROR out instead of segfaulting when walsender slots are full. - logical worker: respond to publisher even under dense stream. 
--- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/postgres-v17 | 2 +- vendor/revisions.json | 8 ++++---- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index a317b9b5b969..87cb68f899db 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit a317b9b5b96978b49e78986697f3dd80d06f99a7 +Subproject commit 87cb68f899db434cd6f1908cf0ac8fdeafdd88c1 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 6f6d77fb5960..72b904c0b3ac 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 6f6d77fb5960602fcd3fd130aca9f99ecb1619c9 +Subproject commit 72b904c0b3ac43bd74d1e8e6d772e2c476ae25b1 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 0baa7346dfd4..3ec6e2496f64 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 0baa7346dfd42d61912eeca554c9bb0a190f0a1e +Subproject commit 3ec6e2496f64c6fec35c67cb82efd6490a6a4738 diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index 9156d63ce253..5bbb9bd93dd8 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit 9156d63ce253bed9d1f76355ceec610e444eaffa +Subproject commit 5bbb9bd93dd805e90bd8af15d00080363d18ec68 diff --git a/vendor/revisions.json b/vendor/revisions.json index c2c34962bb3c..6289a53670b6 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ "17rc1", - "9156d63ce253bed9d1f76355ceec610e444eaffa" + "5bbb9bd93dd805e90bd8af15d00080363d18ec68" ], "v16": [ "16.4", - "0baa7346dfd42d61912eeca554c9bb0a190f0a1e" + "3ec6e2496f64c6fec35c67cb82efd6490a6a4738" ], "v15": [ "15.8", - "6f6d77fb5960602fcd3fd130aca9f99ecb1619c9" + "72b904c0b3ac43bd74d1e8e6d772e2c476ae25b1" ], "v14": [ "14.13", - "a317b9b5b96978b49e78986697f3dd80d06f99a7" + "87cb68f899db434cd6f1908cf0ac8fdeafdd88c1" ] } From 3104f0f250e99dea03817c8ee6fd4022844db6ea Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 20 Sep 2024 12:00:05 +0100 Subject: [PATCH 18/77] Safekeeper: fix OpenAPI spec (#9066) ## Problem Safekeeper's OpenAPI spec is incorrect: ``` Semantic error at paths./v1/tenant/{tenant_id}/timeline/{timeline_id}.get.responses.404.content.application/json.schema.$ref $refs must reference a valid location in the document Jump to line 126 ``` Checked on https://editor.swagger.io ## Summary of changes - Add `NotFoundError` - Add `description` and `license` fields to make Cloud OpenAPI spec linter happy --- safekeeper/src/http/openapi_spec.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/safekeeper/src/http/openapi_spec.yaml b/safekeeper/src/http/openapi_spec.yaml index 70999853c2aa..3f1407534568 100644 --- a/safekeeper/src/http/openapi_spec.yaml +++ b/safekeeper/src/http/openapi_spec.yaml @@ -1,7 +1,11 @@ openapi: "3.0.2" info: title: Safekeeper control API + description: Neon Safekeeper API version: "1.0" + license: + name: "Apache" + url: https://github.com/neondatabase/neon/blob/main/LICENSE servers: @@ -386,6 +390,12 @@ components: msg: type: string + NotFoundError: + type: object + properties: + msg: + type: string + responses: # From c45b56e0bb34b70c770eb23b6d13156a0d4f9913 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 20 Sep 2024 15:55:50 +0200 Subject: [PATCH 19/77] pageserver: add counters for started smgr/getpage requests (#9069) After this PR ``` curl localhost:9898/metrics | grep smgr_ | grep start ``` ``` 
pageserver_smgr_query_started_count{shard_id="0000",smgr_query_type="get_page_at_lsn",tenant_id="...",timeline_id="..."} 0 pageserver_smgr_query_started_global_count{smgr_query_type="get_db_size"} 0 pageserver_smgr_query_started_global_count{smgr_query_type="get_page_at_lsn"} 0 pageserver_smgr_query_started_global_count{smgr_query_type="get_rel_exists"} 0 pageserver_smgr_query_started_global_count{smgr_query_type="get_rel_size"} 0 pageserver_smgr_query_started_global_count{smgr_query_type="get_slru_segment"} 0 ``` We instantiate the per-tenant counter only for `get_page_at_lsn`. --- pageserver/src/metrics.rs | 102 +++++++++++++++++++++++++------- test_runner/fixtures/metrics.py | 2 + 2 files changed, 81 insertions(+), 23 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 72229d80be8a..abd814f92864 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1177,10 +1177,10 @@ pub(crate) mod virtual_file_io_engine { } struct GlobalAndPerTimelineHistogramTimer<'a, 'c> { - global_metric: &'a Histogram, + global_latency_histo: &'a Histogram, // Optional because not all op types are tracked per-timeline - timeline_metric: Option<&'a Histogram>, + per_timeline_latency_histo: Option<&'a Histogram>, ctx: &'c RequestContext, start: std::time::Instant, @@ -1212,9 +1212,10 @@ impl<'a, 'c> Drop for GlobalAndPerTimelineHistogramTimer<'a, 'c> { elapsed } }; - self.global_metric.observe(ex_throttled.as_secs_f64()); - if let Some(timeline_metric) = self.timeline_metric { - timeline_metric.observe(ex_throttled.as_secs_f64()); + self.global_latency_histo + .observe(ex_throttled.as_secs_f64()); + if let Some(per_timeline_getpage_histo) = self.per_timeline_latency_histo { + per_timeline_getpage_histo.observe(ex_throttled.as_secs_f64()); } } } @@ -1240,10 +1241,32 @@ pub enum SmgrQueryType { #[derive(Debug)] pub(crate) struct SmgrQueryTimePerTimeline { - global_metrics: [Histogram; SmgrQueryType::COUNT], - per_timeline_getpage: Histogram, + global_started: [IntCounter; SmgrQueryType::COUNT], + global_latency: [Histogram; SmgrQueryType::COUNT], + per_timeline_getpage_started: IntCounter, + per_timeline_getpage_latency: Histogram, } +static SMGR_QUERY_STARTED_GLOBAL: Lazy = Lazy::new(|| { + register_int_counter_vec!( + // it's a counter, but, name is prepared to extend it to a histogram of queue depth + "pageserver_smgr_query_started_global_count", + "Number of smgr queries started, aggregated by query type.", + &["smgr_query_type"], + ) + .expect("failed to define a metric") +}); + +static SMGR_QUERY_STARTED_PER_TENANT_TIMELINE: Lazy = Lazy::new(|| { + register_int_counter_vec!( + // it's a counter, but, name is prepared to extend it to a histogram of queue depth + "pageserver_smgr_query_started_count", + "Number of smgr queries started, aggregated by query type and tenant/timeline.", + &["smgr_query_type", "tenant_id", "shard_id", "timeline_id"], + ) + .expect("failed to define a metric") +}); + static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy = Lazy::new(|| { register_histogram_vec!( "pageserver_smgr_query_seconds", @@ -1319,14 +1342,28 @@ impl SmgrQueryTimePerTimeline { let tenant_id = tenant_shard_id.tenant_id.to_string(); let shard_slug = format!("{}", tenant_shard_id.shard_slug()); let timeline_id = timeline_id.to_string(); - let global_metrics = std::array::from_fn(|i| { + let global_started = std::array::from_fn(|i| { + let op = SmgrQueryType::from_repr(i).unwrap(); + SMGR_QUERY_STARTED_GLOBAL + .get_metric_with_label_values(&[op.into()]) + .unwrap() + 
}); + let global_latency = std::array::from_fn(|i| { let op = SmgrQueryType::from_repr(i).unwrap(); SMGR_QUERY_TIME_GLOBAL .get_metric_with_label_values(&[op.into()]) .unwrap() }); - let per_timeline_getpage = SMGR_QUERY_TIME_PER_TENANT_TIMELINE + let per_timeline_getpage_started = SMGR_QUERY_STARTED_PER_TENANT_TIMELINE + .get_metric_with_label_values(&[ + SmgrQueryType::GetPageAtLsn.into(), + &tenant_id, + &shard_slug, + &timeline_id, + ]) + .unwrap(); + let per_timeline_getpage_latency = SMGR_QUERY_TIME_PER_TENANT_TIMELINE .get_metric_with_label_values(&[ SmgrQueryType::GetPageAtLsn.into(), &tenant_id, @@ -1334,9 +1371,12 @@ impl SmgrQueryTimePerTimeline { &timeline_id, ]) .unwrap(); + Self { - global_metrics, - per_timeline_getpage, + global_started, + global_latency, + per_timeline_getpage_latency, + per_timeline_getpage_started, } } pub(crate) fn start_timer<'c: 'a, 'a>( @@ -1344,8 +1384,11 @@ impl SmgrQueryTimePerTimeline { op: SmgrQueryType, ctx: &'c RequestContext, ) -> Option { - let global_metric = &self.global_metrics[op as usize]; let start = Instant::now(); + + self.global_started[op as usize].inc(); + + // We subtract time spent throttled from the observed latency. match ctx.micros_spent_throttled.open() { Ok(()) => (), Err(error) => { @@ -1364,15 +1407,16 @@ impl SmgrQueryTimePerTimeline { } } - let timeline_metric = if matches!(op, SmgrQueryType::GetPageAtLsn) { - Some(&self.per_timeline_getpage) + let per_timeline_latency_histo = if matches!(op, SmgrQueryType::GetPageAtLsn) { + self.per_timeline_getpage_started.inc(); + Some(&self.per_timeline_getpage_latency) } else { None }; Some(GlobalAndPerTimelineHistogramTimer { - global_metric, - timeline_metric, + global_latency_histo: &self.global_latency[op as usize], + per_timeline_latency_histo, ctx, start, op, @@ -1423,9 +1467,12 @@ mod smgr_query_time_tests { let get_counts = || { let global: u64 = ops .iter() - .map(|op| metrics.global_metrics[*op as usize].get_sample_count()) + .map(|op| metrics.global_latency[*op as usize].get_sample_count()) .sum(); - (global, metrics.per_timeline_getpage.get_sample_count()) + ( + global, + metrics.per_timeline_getpage_latency.get_sample_count(), + ) }; let (pre_global, pre_per_tenant_timeline) = get_counts(); @@ -2576,6 +2623,12 @@ impl TimelineMetrics { let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, shard_id, timeline_id]); } + let _ = SMGR_QUERY_STARTED_PER_TENANT_TIMELINE.remove_label_values(&[ + SmgrQueryType::GetPageAtLsn.into(), + tenant_id, + shard_id, + timeline_id, + ]); let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[ SmgrQueryType::GetPageAtLsn.into(), tenant_id, @@ -3227,11 +3280,14 @@ pub fn preinitialize_metrics() { } // countervecs - [&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT] - .into_iter() - .for_each(|c| { - Lazy::force(c); - }); + [ + &BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT, + &SMGR_QUERY_STARTED_GLOBAL, + ] + .into_iter() + .for_each(|c| { + Lazy::force(c); + }); // gauges WALRECEIVER_ACTIVE_MANAGERS.get(); diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index cda70be8dab6..d2db40897e25 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -132,6 +132,7 @@ def histogram(prefix_without_trailing_underscore: str) -> List[str]: *histogram("pageserver_wait_lsn_seconds"), *histogram("pageserver_remote_operation_seconds"), *histogram("pageserver_io_operations_seconds"), + "pageserver_smgr_query_started_global_count_total", "pageserver_tenant_states_count", 
"pageserver_circuit_breaker_broken_total", "pageserver_circuit_breaker_unbroken_total", @@ -146,6 +147,7 @@ def histogram(prefix_without_trailing_underscore: str) -> List[str]: "pageserver_smgr_query_seconds_bucket", "pageserver_smgr_query_seconds_count", "pageserver_smgr_query_seconds_sum", + "pageserver_smgr_query_started_count_total", "pageserver_archive_size", "pageserver_pitr_history_size", "pageserver_layer_bytes", From 797aa4ffaaacba20a1b27344d28d1c1a0c287e8c Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 20 Sep 2024 17:22:58 +0300 Subject: [PATCH 20/77] Skip running clippy in --release mode. (#9073) It's pretty expensive to run, and there is very little difference between debug and release builds that could lead to different clippy warnings. This is extracted from PR #8912. That PR wandered off into various improvements we could make, but we seem to have consensus on this part at least. --- .github/workflows/build_and_test.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index a210c962cba1..c1ec3f207b68 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -159,6 +159,10 @@ jobs: # This will catch compiler & clippy warnings in all feature combinations. # TODO: use cargo hack for build and test as well, but, that's quite expensive. # NB: keep clippy args in sync with ./run_clippy.sh + # + # The only difference between "clippy --debug" and "clippy --release" is that in --release mode, + # #[cfg(debug_assertions)] blocks are not built. It's not worth building everything for second + # time just for that, so skip "clippy --release". - run: | CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")" if [ "$CLIPPY_COMMON_ARGS" = "" ]; then @@ -168,8 +172,6 @@ jobs: echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV - name: Run cargo clippy (debug) run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS - - name: Run cargo clippy (release) - run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS - name: Check documentation generation run: cargo doc --workspace --no-deps --document-private-items From 6b9323027085c771ad71686b39f43f19f1a702f6 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Fri, 20 Sep 2024 10:37:28 -0400 Subject: [PATCH 21/77] fix(pageserver): receive body error now 500 (#9052) close https://github.com/neondatabase/neon/issues/8903 In https://github.com/neondatabase/neon/issues/8903 we observed JSON decoding error to have the following error message in the log: ``` Error processing HTTP request: Resource temporarily unavailable: 3956 (pageserver-6.ap-southeast-1.aws.neon.tech) error receiving body: error decoding response body ``` This is hard to understand. In this patch, we make the error message more reasonable. ## Summary of changes * receive body error is now an internal server error, passthrough the `reqwest::Error` (only decoding error) as `anyhow::Error`. * instead of formatting the error using `to_string`, we use the alternative `anyhow::Error` formatting, so that it prints out the cause of the error (i.e., what exactly cannot serde decode). I would expect seeing something like `error receiving body: error decoding response body: XXX field not found` after this patch, though I didn't set up a testing environment to observe the exact behavior. 
---------

Signed-off-by: Alex Chi Z
---
 libs/utils/src/http/error.rs      |  2 +-
 storage_controller/src/service.rs | 14 +++++++++++---
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs
index 3d863a6518d0..5e05e4e713d0 100644
--- a/libs/utils/src/http/error.rs
+++ b/libs/utils/src/http/error.rs
@@ -82,7 +82,7 @@ impl ApiError {
                 StatusCode::INTERNAL_SERVER_ERROR,
             ),
             ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
-                err.to_string(),
+                format!("{err:#}"), // use alternative formatting so that we give the cause without backtrace
                 StatusCode::INTERNAL_SERVER_ERROR,
             ),
         }
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index be3efaf688a8..957f633feb41 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -3,6 +3,7 @@ use std::{
     borrow::Cow,
     cmp::Ordering,
     collections::{BTreeMap, HashMap, HashSet},
+    error::Error,
     ops::Deref,
     path::PathBuf,
     str::FromStr,
@@ -218,9 +219,16 @@ fn passthrough_api_error(node: &Node, e: mgmt_api::Error) -> ApiError {
                 format!("{node} error receiving error body: {str}").into(),
             )
         }
-        mgmt_api::Error::ReceiveBody(str) => {
-            // Presume errors receiving body are connectivity/availability issues
-            ApiError::ResourceUnavailable(format!("{node} error receiving body: {str}").into())
+        mgmt_api::Error::ReceiveBody(err) if err.is_decode() => {
+            // Return 500 for decoding errors.
+            ApiError::InternalServerError(anyhow::Error::from(err).context("error decoding body"))
+        }
+        mgmt_api::Error::ReceiveBody(err) => {
+            // Presume errors receiving body are connectivity/availability issues except for decoding errors
+            let src_str = err.source().map(|e| e.to_string()).unwrap_or_default();
+            ApiError::ResourceUnavailable(
+                format!("{node} error receiving error body: {err} {}", src_str).into(),
+            )
         }
         mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, msg) => {
             ApiError::NotFound(anyhow::anyhow!(format!("{node}: {msg}")).into())

From e675a21346812b109eddc86c5729ec83e25f845d Mon Sep 17 00:00:00 2001
From: Conrad Ludgate
Date: Fri, 20 Sep 2024 16:09:39 +0100
Subject: [PATCH 22/77] utils: leaky bucket should only report throttled if
 the notify queue is blocked on sleep (#9072)

## Problem

It seems that the pageserver might be too eager in reporting throttled
tasks.

## Summary of changes

Introduce a sleep counter. If the sleep counter increases, then the
acquire task was throttled.
---
 libs/utils/src/leaky_bucket.rs | 34 ++++++++++++++++++++++++++++------
 1 file changed, 28 insertions(+), 6 deletions(-)

diff --git a/libs/utils/src/leaky_bucket.rs b/libs/utils/src/leaky_bucket.rs
index a120dc0ac566..0cc58738c09c 100644
--- a/libs/utils/src/leaky_bucket.rs
+++ b/libs/utils/src/leaky_bucket.rs
@@ -21,7 +21,13 @@
 //!
 //! Another explaination can be found here:

-use std::{sync::Mutex, time::Duration};
+use std::{
+    sync::{
+        atomic::{AtomicU64, Ordering},
+        Mutex,
+    },
+    time::Duration,
+};

 use tokio::{sync::Notify, time::Instant};

@@ -128,6 +134,7 @@ impl LeakyBucketState {

 pub struct RateLimiter {
     pub config: LeakyBucketConfig,
+    pub sleep_counter: AtomicU64,
     pub state: Mutex,

     /// a queue to provide this fair ordering.
pub queue: Notify, @@ -144,6 +151,7 @@ impl Drop for Requeue<'_> { impl RateLimiter { pub fn with_initial_tokens(config: LeakyBucketConfig, initial_tokens: f64) -> Self { RateLimiter { + sleep_counter: AtomicU64::new(0), state: Mutex::new(LeakyBucketState::with_initial_tokens( &config, initial_tokens, @@ -163,15 +171,16 @@ impl RateLimiter { /// returns true if we did throttle pub async fn acquire(&self, count: usize) -> bool { - let mut throttled = false; - let start = tokio::time::Instant::now(); + let start_count = self.sleep_counter.load(Ordering::Acquire); + let mut end_count = start_count; + // wait until we are the first in the queue let mut notified = std::pin::pin!(self.queue.notified()); if !notified.as_mut().enable() { - throttled = true; notified.await; + end_count = self.sleep_counter.load(Ordering::Acquire); } // notify the next waiter in the queue when we are done. @@ -184,9 +193,22 @@ impl RateLimiter { .unwrap() .add_tokens(&self.config, start, count as f64); match res { - Ok(()) => return throttled, + Ok(()) => return end_count > start_count, Err(ready_at) => { - throttled = true; + struct Increment<'a>(&'a AtomicU64); + + impl Drop for Increment<'_> { + fn drop(&mut self) { + self.0.fetch_add(1, Ordering::AcqRel); + } + } + + // increment the counter after we finish sleeping (or cancel this task). + // this ensures that tasks that have already started the acquire will observe + // the new sleep count when they are allowed to resume on the notify. + let _inc = Increment(&self.sleep_counter); + end_count += 1; + tokio::time::sleep_until(ready_at).await; } } From 6014f15157b3a47f122b814b3cf109e9d1851abd Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 20 Sep 2024 17:07:09 +0100 Subject: [PATCH 23/77] pageserver: suppress noisy "layer became visible" logs (#9064) ## Problem When layer visibility was added, an info log was included for the situation where actual access to a layer disagrees with the visibility calculation. This situation is safe, but I was interested in seeing when it happens. The log is pretty high volume, so this PR refines it to fire less often. ## Summary of changes - For cases where accessing non-visible layers is normal, don't log at all. - Extend a unit test to increase confidence that the updates to visibility on access are working as expected - During compaction, only call the visibility calculation routine if some image layers were created: previously, frequent calls resulted in the visibility of layers getting reset every time we passed through create_image_layers. 
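
In sketch form, the suppression policy described above looks roughly like
this (simplified stand-in types, not the exact implementation; the real
match lives in `Layer::record_access` in the diff below):

```rust
// Task kinds for which reading a non-visible (covered) layer is normal and
// should stay quiet; anything else logs when a layer flips to Visible.
enum TaskKind {
    CalculateSyntheticSize,
    GarbageCollector,
    MgmtRequest,
    PageRequestHandler, // stand-in for "everything else"
}

/// Returns whether an access that made a covered layer visible should be logged.
fn log_visibility_flip(task_kind: TaskKind) -> bool {
    !matches!(
        task_kind,
        TaskKind::CalculateSyntheticSize | TaskKind::GarbageCollector | TaskKind::MgmtRequest
    )
}

fn main() {
    assert!(!log_visibility_flip(TaskKind::GarbageCollector));
    assert!(log_visibility_flip(TaskKind::PageRequestHandler));
}
```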
--- pageserver/src/tenant/storage_layer/layer.rs | 29 +++++++++++++++---- .../src/tenant/storage_layer/layer/tests.rs | 9 ++++++ pageserver/src/tenant/timeline.rs | 4 ++- 3 files changed, 36 insertions(+), 6 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index b15cd4da39e9..f0e2ca5c8332 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -439,11 +439,30 @@ impl Layer { fn record_access(&self, ctx: &RequestContext) { if self.0.access_stats.record_access(ctx) { - // Visibility was modified to Visible - tracing::info!( - "Layer {} became visible as a result of access", - self.0.desc.key() - ); + // Visibility was modified to Visible: maybe log about this + match ctx.task_kind() { + TaskKind::CalculateSyntheticSize + | TaskKind::GarbageCollector + | TaskKind::MgmtRequest => { + // This situation is expected in code paths do binary searches of the LSN space to resolve + // an LSN to a timestamp, which happens during GC, during GC cutoff calculations in synthetic size, + // and on-demand for certain HTTP API requests. + } + _ => { + // In all other contexts, it is unusual to do I/O involving layers which are not visible at + // some branch tip, so we log the fact that we are accessing something that the visibility + // calculation thought should not be visible. + // + // This case is legal in brief time windows: for example an in-flight getpage request can hold on to a layer object + // which was covered by a concurrent compaction. + tracing::info!( + "Layer {} became visible as a result of access", + self.0.desc.key() + ); + } + } + + // Update the timeline's visible bytes count if let Some(tl) = self.0.timeline.upgrade() { tl.metrics .visible_physical_size_gauge diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index 0b9bde4f57e1..9de70f14eebb 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -1025,6 +1025,15 @@ fn access_stats() { assert_eq!(access_stats.latest_activity(), lowres_time(atime)); access_stats.set_visibility(LayerVisibilityHint::Visible); assert_eq!(access_stats.latest_activity(), lowres_time(atime)); + + // Recording access implicitly makes layer visible, if it wasn't already + let atime = UNIX_EPOCH + Duration::from_secs(2200000000); + access_stats.set_visibility(LayerVisibilityHint::Covered); + assert_eq!(access_stats.visibility(), LayerVisibilityHint::Covered); + assert!(access_stats.record_access_at(atime)); + access_stats.set_visibility(LayerVisibilityHint::Visible); + assert!(!access_stats.record_access_at(atime)); + access_stats.set_visibility(LayerVisibilityHint::Visible); } #[test] diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index f08f5caf95f7..114a6dd4684e 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4316,7 +4316,9 @@ impl Timeline { timer.stop_and_record(); // Creating image layers may have caused some previously visible layers to be covered - self.update_layer_visibility().await?; + if !image_layers.is_empty() { + self.update_layer_visibility().await?; + } Ok(image_layers) } From ec5dce04ebfa51b727dfc9bc04ebb1e68aef6434 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 20 Sep 2024 18:48:26 +0200 Subject: [PATCH 24/77] pageserver: throttling: per-tenant metrics + more metrics to help understand throttle 
queue depth (#9077) --- pageserver/src/metrics.rs | 190 ++++++++++++++++++++++++++---- pageserver/src/tenant.rs | 4 +- pageserver/src/tenant/tasks.rs | 68 ++++++----- pageserver/src/tenant/throttle.rs | 34 ++++-- pageserver/src/tenant/timeline.rs | 10 +- test_runner/fixtures/metrics.py | 13 ++ 6 files changed, 246 insertions(+), 73 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index abd814f92864..078d12f9342a 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -2645,6 +2645,8 @@ pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) { let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]); } + tenant_throttling::remove_tenant_metrics(tenant_shard_id); + // we leave the BROKEN_TENANTS_SET entry if any } @@ -3108,41 +3110,180 @@ pub mod tokio_epoll_uring { pub(crate) mod tenant_throttling { use metrics::{register_int_counter_vec, IntCounter}; use once_cell::sync::Lazy; + use utils::shard::TenantShardId; use crate::tenant::{self, throttle::Metric}; + struct GlobalAndPerTenantIntCounter { + global: IntCounter, + per_tenant: IntCounter, + } + + impl GlobalAndPerTenantIntCounter { + #[inline(always)] + pub(crate) fn inc(&self) { + self.inc_by(1) + } + #[inline(always)] + pub(crate) fn inc_by(&self, n: u64) { + self.global.inc_by(n); + self.per_tenant.inc_by(n); + } + } + pub(crate) struct TimelineGet { - wait_time: IntCounter, - count: IntCounter, + count_accounted_start: GlobalAndPerTenantIntCounter, + count_accounted_finish: GlobalAndPerTenantIntCounter, + wait_time: GlobalAndPerTenantIntCounter, + count_throttled: GlobalAndPerTenantIntCounter, } - pub(crate) static TIMELINE_GET: Lazy = Lazy::new(|| { - static WAIT_USECS: Lazy = Lazy::new(|| { - register_int_counter_vec!( + static COUNT_ACCOUNTED_START: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_tenant_throttling_count_accounted_start_global", + "Count of tenant throttling starts, by kind of throttle.", + &["kind"] + ) + .unwrap() + }); + static COUNT_ACCOUNTED_START_PER_TENANT: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_tenant_throttling_count_accounted_start", + "Count of tenant throttling starts, by kind of throttle.", + &["kind", "tenant_id", "shard_id"] + ) + .unwrap() + }); + static COUNT_ACCOUNTED_FINISH: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_tenant_throttling_count_accounted_finish_global", + "Count of tenant throttling finishes, by kind of throttle.", + &["kind"] + ) + .unwrap() + }); + static COUNT_ACCOUNTED_FINISH_PER_TENANT: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_tenant_throttling_count_accounted_finish", + "Count of tenant throttling finishes, by kind of throttle.", + &["kind", "tenant_id", "shard_id"] + ) + .unwrap() + }); + static WAIT_USECS: Lazy = Lazy::new(|| { + register_int_counter_vec!( "pageserver_tenant_throttling_wait_usecs_sum_global", - "Sum of microseconds that tenants spent waiting for a tenant throttle of a given kind.", + "Sum of microseconds that spent waiting throttle by kind of throttle.", &["kind"] ) - .unwrap() - }); + .unwrap() + }); + static WAIT_USECS_PER_TENANT: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_tenant_throttling_wait_usecs_sum", + "Sum of microseconds that spent waiting throttle by kind of throttle.", + &["kind", "tenant_id", "shard_id"] + ) + .unwrap() + }); - static WAIT_COUNT: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "pageserver_tenant_throttling_count_global", - "Count of tenant 
throttlings, by kind of throttle.", - &["kind"] - ) - .unwrap() - }); + static WAIT_COUNT: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_tenant_throttling_count_global", + "Count of tenant throttlings, by kind of throttle.", + &["kind"] + ) + .unwrap() + }); + static WAIT_COUNT_PER_TENANT: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_tenant_throttling_count", + "Count of tenant throttlings, by kind of throttle.", + &["kind", "tenant_id", "shard_id"] + ) + .unwrap() + }); - let kind = "timeline_get"; - TimelineGet { - wait_time: WAIT_USECS.with_label_values(&[kind]), - count: WAIT_COUNT.with_label_values(&[kind]), + const KIND: &str = "timeline_get"; + + impl TimelineGet { + pub(crate) fn new(tenant_shard_id: &TenantShardId) -> Self { + TimelineGet { + count_accounted_start: { + GlobalAndPerTenantIntCounter { + global: COUNT_ACCOUNTED_START.with_label_values(&[KIND]), + per_tenant: COUNT_ACCOUNTED_START_PER_TENANT.with_label_values(&[ + KIND, + &tenant_shard_id.tenant_id.to_string(), + &tenant_shard_id.shard_slug().to_string(), + ]), + } + }, + count_accounted_finish: { + GlobalAndPerTenantIntCounter { + global: COUNT_ACCOUNTED_FINISH.with_label_values(&[KIND]), + per_tenant: COUNT_ACCOUNTED_FINISH_PER_TENANT.with_label_values(&[ + KIND, + &tenant_shard_id.tenant_id.to_string(), + &tenant_shard_id.shard_slug().to_string(), + ]), + } + }, + wait_time: { + GlobalAndPerTenantIntCounter { + global: WAIT_USECS.with_label_values(&[KIND]), + per_tenant: WAIT_USECS_PER_TENANT.with_label_values(&[ + KIND, + &tenant_shard_id.tenant_id.to_string(), + &tenant_shard_id.shard_slug().to_string(), + ]), + } + }, + count_throttled: { + GlobalAndPerTenantIntCounter { + global: WAIT_COUNT.with_label_values(&[KIND]), + per_tenant: WAIT_COUNT_PER_TENANT.with_label_values(&[ + KIND, + &tenant_shard_id.tenant_id.to_string(), + &tenant_shard_id.shard_slug().to_string(), + ]), + } + }, + } } - }); + } + + pub(crate) fn preinitialize_global_metrics() { + Lazy::force(&COUNT_ACCOUNTED_START); + Lazy::force(&COUNT_ACCOUNTED_FINISH); + Lazy::force(&WAIT_USECS); + Lazy::force(&WAIT_COUNT); + } + + pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) { + for m in &[ + &COUNT_ACCOUNTED_START_PER_TENANT, + &COUNT_ACCOUNTED_FINISH_PER_TENANT, + &WAIT_USECS_PER_TENANT, + &WAIT_COUNT_PER_TENANT, + ] { + let _ = m.remove_label_values(&[ + KIND, + &tenant_shard_id.tenant_id.to_string(), + &tenant_shard_id.shard_slug().to_string(), + ]); + } + } - impl Metric for &'static TimelineGet { + impl Metric for TimelineGet { + #[inline(always)] + fn accounting_start(&self) { + self.count_accounted_start.inc(); + } + #[inline(always)] + fn accounting_finish(&self) { + self.count_accounted_finish.inc(); + } #[inline(always)] fn observe_throttling( &self, @@ -3150,7 +3291,7 @@ pub(crate) mod tenant_throttling { ) { let val = u64::try_from(wait_time.as_micros()).unwrap(); self.wait_time.inc_by(val); - self.count.inc(); + self.count_throttled.inc(); } } } @@ -3309,7 +3450,8 @@ pub fn preinitialize_metrics() { // Custom Lazy::force(&RECONSTRUCT_TIME); - Lazy::force(&tenant_throttling::TIMELINE_GET); Lazy::force(&BASEBACKUP_QUERY_TIME); Lazy::force(&COMPUTE_COMMANDS_COUNTERS); + + tenant_throttling::preinitialize_global_metrics(); } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index d699d560751e..e328cd2044ee 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -302,7 +302,7 @@ pub struct Tenant { /// Throttle applied at the top of [`Timeline::get`]. 
/// All [`Tenant::timelines`] of a given [`Tenant`] instance share the same [`throttle::Throttle`] instance. pub(crate) timeline_get_throttle: - Arc>, + Arc>, /// An ongoing timeline detach concurrency limiter. /// @@ -2831,7 +2831,7 @@ impl Tenant { gate: Gate::default(), timeline_get_throttle: Arc::new(throttle::Throttle::new( Tenant::get_timeline_get_throttle_config(conf, &attached_conf.tenant_conf), - &crate::metrics::tenant_throttling::TIMELINE_GET, + crate::metrics::tenant_throttling::TimelineGet::new(&tenant_shard_id), )), tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)), ongoing_timeline_detach: std::sync::Mutex::default(), diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 57f0123d8fa3..341febb30ab9 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -163,8 +163,6 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { // How many errors we have seen consequtively let mut error_run_count = 0; - let mut last_throttle_flag_reset_at = Instant::now(); - TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); async { let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download); @@ -191,8 +189,6 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { } } - - let sleep_duration; if period == Duration::ZERO { #[cfg(not(feature = "testing"))] @@ -207,12 +203,18 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { }; // Run compaction - let IterationResult { output, elapsed } = iteration.run(tenant.compaction_iteration(&cancel, &ctx)).await; + let IterationResult { output, elapsed } = iteration + .run(tenant.compaction_iteration(&cancel, &ctx)) + .await; match output { Ok(has_pending_task) => { error_run_count = 0; // schedule the next compaction immediately in case there is a pending compaction task - sleep_duration = if has_pending_task { Duration::ZERO } else { period }; + sleep_duration = if has_pending_task { + Duration::ZERO + } else { + period + }; } Err(e) => { let wait_duration = backoff::exponential_backoff_duration_seconds( @@ -233,38 +235,20 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { } // the duration is recorded by performance tests by enabling debug in this function - tracing::debug!(elapsed_ms=elapsed.as_millis(), "compaction iteration complete"); + tracing::debug!( + elapsed_ms = elapsed.as_millis(), + "compaction iteration complete" + ); }; - // Perhaps we did no work and the walredo process has been idle for some time: // give it a chance to shut down to avoid leaving walredo process running indefinitely. + // TODO: move this to a separate task (housekeeping loop) that isn't affected by the back-off, + // so we get some upper bound guarantee on when walredo quiesce / this throttling reporting here happens. if let Some(walredo_mgr) = &tenant.walredo_mgr { walredo_mgr.maybe_quiesce(period * 10); } - // TODO: move this (and walredo quiesce) to a separate task that isn't affected by the back-off, - // so we get some upper bound guarantee on when walredo quiesce / this throttling reporting here happens. 
- info_span!(parent: None, "timeline_get_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| { - let now = Instant::now(); - let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now); - let Stats { count_accounted, count_throttled, sum_throttled_usecs } = tenant.timeline_get_throttle.reset_stats(); - if count_throttled == 0 { - return; - } - let allowed_rps = tenant.timeline_get_throttle.steady_rps(); - let delta = now - prev; - info!( - n_seconds=%format_args!("{:.3}", - delta.as_secs_f64()), - count_accounted, - count_throttled, - sum_throttled_usecs, - allowed_rps=%format_args!("{allowed_rps:.0}"), - "shard was throttled in the last n_seconds" - ); - }); - // Sleep if tokio::time::timeout(sleep_duration, cancel.cancelled()) .await @@ -437,6 +421,7 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { async fn ingest_housekeeping_loop(tenant: Arc, cancel: CancellationToken) { TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); async { + let mut last_throttle_flag_reset_at = Instant::now(); loop { tokio::select! { _ = cancel.cancelled() => { @@ -483,6 +468,29 @@ async fn ingest_housekeeping_loop(tenant: Arc, cancel: CancellationToken kind: BackgroundLoopKind::IngestHouseKeeping, }; iteration.run(tenant.ingest_housekeeping()).await; + + // TODO: rename the background loop kind to something more generic, like, tenant housekeeping. + // Or just spawn another background loop for this throttle, it's not like it's super costly. + info_span!(parent: None, "timeline_get_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| { + let now = Instant::now(); + let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now); + let Stats { count_accounted_start, count_accounted_finish, count_throttled, sum_throttled_usecs} = tenant.timeline_get_throttle.reset_stats(); + if count_throttled == 0 { + return; + } + let allowed_rps = tenant.timeline_get_throttle.steady_rps(); + let delta = now - prev; + info!( + n_seconds=%format_args!("{:.3}", + delta.as_secs_f64()), + count_accounted = count_accounted_finish, // don't break existing log scraping + count_throttled, + sum_throttled_usecs, + count_accounted_start, // log after pre-existing fields to not break existing log scraping + allowed_rps=%format_args!("{allowed_rps:.0}"), + "shard was throttled in the last n_seconds" + ); + }); } } .await; diff --git a/pageserver/src/tenant/throttle.rs b/pageserver/src/tenant/throttle.rs index f222e708e16b..6a8095390177 100644 --- a/pageserver/src/tenant/throttle.rs +++ b/pageserver/src/tenant/throttle.rs @@ -24,8 +24,10 @@ use crate::{context::RequestContext, task_mgr::TaskKind}; pub struct Throttle { inner: ArcSwap, metric: M, - /// will be turned into [`Stats::count_accounted`] - count_accounted: AtomicU64, + /// will be turned into [`Stats::count_accounted_start`] + count_accounted_start: AtomicU64, + /// will be turned into [`Stats::count_accounted_finish`] + count_accounted_finish: AtomicU64, /// will be turned into [`Stats::count_throttled`] count_throttled: AtomicU64, /// will be turned into [`Stats::sum_throttled_usecs`] @@ -43,17 +45,21 @@ pub struct Observation { pub wait_time: Duration, } pub trait Metric { + fn accounting_start(&self); + fn accounting_finish(&self); fn observe_throttling(&self, observation: &Observation); } /// See [`Throttle::reset_stats`]. 
pub struct Stats { - // Number of requests that were subject to throttling, i.e., requests of the configured [`Config::task_kinds`]. - pub count_accounted: u64, - // Subset of the `accounted` requests that were actually throttled. - // Note that the numbers are stored as two independent atomics, so, there might be a slight drift. + /// Number of requests that started [`Throttle::throttle`] calls. + pub count_accounted_start: u64, + /// Number of requests that finished [`Throttle::throttle`] calls. + pub count_accounted_finish: u64, + /// Subset of the `accounted` requests that were actually throttled. + /// Note that the numbers are stored as two independent atomics, so, there might be a slight drift. pub count_throttled: u64, - // Sum of microseconds that throttled requests spent waiting for throttling. + /// Sum of microseconds that throttled requests spent waiting for throttling. pub sum_throttled_usecs: u64, } @@ -65,7 +71,8 @@ where Self { inner: ArcSwap::new(Arc::new(Self::new_inner(config))), metric, - count_accounted: AtomicU64::new(0), + count_accounted_start: AtomicU64::new(0), + count_accounted_finish: AtomicU64::new(0), count_throttled: AtomicU64::new(0), sum_throttled_usecs: AtomicU64::new(0), } @@ -117,11 +124,13 @@ where /// This method allows retrieving & resetting that flag. /// Useful for periodic reporting. pub fn reset_stats(&self) -> Stats { - let count_accounted = self.count_accounted.swap(0, Ordering::Relaxed); + let count_accounted_start = self.count_accounted_start.swap(0, Ordering::Relaxed); + let count_accounted_finish = self.count_accounted_finish.swap(0, Ordering::Relaxed); let count_throttled = self.count_throttled.swap(0, Ordering::Relaxed); let sum_throttled_usecs = self.sum_throttled_usecs.swap(0, Ordering::Relaxed); Stats { - count_accounted, + count_accounted_start, + count_accounted_finish, count_throttled, sum_throttled_usecs, } @@ -139,9 +148,12 @@ where }; let start = std::time::Instant::now(); + self.metric.accounting_start(); + self.count_accounted_start.fetch_add(1, Ordering::Relaxed); let did_throttle = inner.rate_limiter.acquire(key_count).await; + self.count_accounted_finish.fetch_add(1, Ordering::Relaxed); + self.metric.accounting_finish(); - self.count_accounted.fetch_add(1, Ordering::Relaxed); if did_throttle { self.count_throttled.fetch_add(1, Ordering::Relaxed); let now = Instant::now(); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 114a6dd4684e..c98efd5f7184 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -196,9 +196,8 @@ fn drop_wlock(rlock: tokio::sync::RwLockWriteGuard<'_, T>) { /// The outward-facing resources required to build a Timeline pub struct TimelineResources { pub remote_client: RemoteTimelineClient, - pub timeline_get_throttle: Arc< - crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>, - >, + pub timeline_get_throttle: + Arc>, pub l0_flush_global_state: l0_flush::L0FlushGlobalState, } @@ -406,9 +405,8 @@ pub struct Timeline { gc_lock: tokio::sync::Mutex<()>, /// Cloned from [`super::Tenant::timeline_get_throttle`] on construction. 
- timeline_get_throttle: Arc< - crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>, - >, + timeline_get_throttle: + Arc>, /// Keep aux directory cache to avoid it's reconstruction on each update pub(crate) aux_files: tokio::sync::Mutex, diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index d2db40897e25..005dc6cb0d6c 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -102,6 +102,11 @@ def histogram(prefix_without_trailing_underscore: str) -> List[str]: return [f"{prefix_without_trailing_underscore}_{x}" for x in ["bucket", "count", "sum"]] +def counter(name: str) -> str: + # the prometheus_client package appends _total to all counters client-side + return f"{name}_total" + + PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] = ( "pageserver_remote_timeline_client_calls_started_total", "pageserver_remote_timeline_client_calls_finished_total", @@ -136,6 +141,10 @@ def histogram(prefix_without_trailing_underscore: str) -> List[str]: "pageserver_tenant_states_count", "pageserver_circuit_breaker_broken_total", "pageserver_circuit_breaker_unbroken_total", + counter("pageserver_tenant_throttling_count_accounted_start_global"), + counter("pageserver_tenant_throttling_count_accounted_finish_global"), + counter("pageserver_tenant_throttling_wait_usecs_sum_global"), + counter("pageserver_tenant_throttling_count_global"), ) PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = ( @@ -159,6 +168,10 @@ def histogram(prefix_without_trailing_underscore: str) -> List[str]: "pageserver_evictions_with_low_residence_duration_total", "pageserver_aux_file_estimated_size", "pageserver_valid_lsn_lease_count", + counter("pageserver_tenant_throttling_count_accounted_start"), + counter("pageserver_tenant_throttling_count_accounted_finish"), + counter("pageserver_tenant_throttling_wait_usecs_sum"), + counter("pageserver_tenant_throttling_count"), *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS, # "pageserver_directory_entries_count", -- only used if above a certain threshold # "pageserver_broken_tenants_count" -- used only for broken From f03f7b38680f68245f2613c5b033ef25e634b73b Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Fri, 20 Sep 2024 18:24:40 +0100 Subject: [PATCH 25/77] Bump vendor/postgres to include extension path fix (#9076) This is a pre requisite for https://github.com/neondatabase/neon/pull/8681 --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/postgres-v17 | 2 +- vendor/revisions.json | 8 ++++---- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 87cb68f899db..f9c51c12438b 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 87cb68f899db434cd6f1908cf0ac8fdeafdd88c1 +Subproject commit f9c51c12438b20049b6905eb4e43d321defd6ff2 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 72b904c0b3ac..1dbd6f316416 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 72b904c0b3ac43bd74d1e8e6d772e2c476ae25b1 +Subproject commit 1dbd6f316416c8360bbd4f3d6db956cf70937cf0 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 3ec6e2496f64..d009084a745c 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 3ec6e2496f64c6fec35c67cb82efd6490a6a4738 +Subproject commit d009084a745cb4d5e6de222c778b2a562c8b2767 diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index 
5bbb9bd93dd8..dadd6fe208bb 160000
--- a/vendor/postgres-v17
+++ b/vendor/postgres-v17
@@ -1 +1 @@
-Subproject commit 5bbb9bd93dd805e90bd8af15d00080363d18ec68
+Subproject commit dadd6fe208bb906cc0a48980f2ab4e13c47ba3ad
diff --git a/vendor/revisions.json b/vendor/revisions.json
index 6289a53670b6..c93393970ffa 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,18 +1,18 @@
 {
     "v17": [
         "17rc1",
-        "5bbb9bd93dd805e90bd8af15d00080363d18ec68"
+        "dadd6fe208bb906cc0a48980f2ab4e13c47ba3ad"
     ],
     "v16": [
         "16.4",
-        "3ec6e2496f64c6fec35c67cb82efd6490a6a4738"
+        "d009084a745cb4d5e6de222c778b2a562c8b2767"
     ],
     "v15": [
         "15.8",
-        "72b904c0b3ac43bd74d1e8e6d772e2c476ae25b1"
+        "1dbd6f316416c8360bbd4f3d6db956cf70937cf0"
     ],
     "v14": [
         "14.13",
-        "87cb68f899db434cd6f1908cf0ac8fdeafdd88c1"
+        "f9c51c12438b20049b6905eb4e43d321defd6ff2"
     ]
 }

From 9a32aa828d8f2b4ee5f84f81bb5cb3f6012bfeb5 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas
Date: Sat, 21 Sep 2024 04:00:38 +0300
Subject: [PATCH 26/77] Fix init of WAL page header at startup (#8914)

If the primary is started at an LSN within the first page of a 16 MB WAL segment, the "long XLOG page header" at the beginning of the segment was not initialized correctly. That has gone unnoticed, because under normal circumstances, nothing looks at the page header. The WAL that is streamed to the safekeepers starts at the new record's LSN, not at the beginning of the page, so that bogus page header didn't propagate elsewhere, and a primary server doesn't normally read the WAL it has written, which is good because the contents of the page would be bogus anyway, as it wouldn't contain any of the records before the LSN where the new record is written.

Except that in the following cases a primary does read its own WAL:

1. When there are two-phase transactions in prepared state at checkpoint. The checkpointer reads the two-phase state from the XLOG_XACT_PREPARE record, and writes it to a file in pg_twophase/.

2. Logical decoding reads the WAL starting from the replication slot's restart LSN.

This PR fixes the problem with two-phase transactions. For that, it's sufficient to initialize the page header correctly. The checkpointer only needs to read XLOG_XACT_PREPARE records that were generated after the server startup, so it's still OK that older WAL is missing / bogus. I have not investigated whether we have a problem with logical decoding, however. Let's deal with that separately.

Special thanks to @Lzjing-1997, who independently found the same bug and opened a PR to fix it, although I did not use that PR.
---
 test_runner/regress/test_twophase.py | 31 +++++++++++++++++++++++-----
 vendor/postgres-v14                  |  2 +-
 vendor/postgres-v15                  |  2 +-
 vendor/postgres-v16                  |  2 +-
 vendor/postgres-v17                  |  2 +-
 vendor/revisions.json                |  8 ++++----
 6 files changed, 34 insertions(+), 13 deletions(-)

diff --git a/test_runner/regress/test_twophase.py b/test_runner/regress/test_twophase.py
index ebe65e7c29dc..75fab78d6e6c 100644
--- a/test_runner/regress/test_twophase.py
+++ b/test_runner/regress/test_twophase.py
@@ -8,6 +8,7 @@
     PgBin,
     fork_at_current_lsn,
     import_timeline_from_vanilla_postgres,
+    wait_for_wal_insert_lsn,
 )

@@ -22,11 +23,6 @@ def twophase_test_on_timeline(env: NeonEnv):
     conn = endpoint.connect()
     cur = conn.cursor()

-    # FIXME: Switch to the next WAL segment, to work around the bug fixed in
-    # https://github.com/neondatabase/neon/pull/8914. When that is merged, this can be
-    # removed.
- cur.execute("select pg_switch_wal()") - cur.execute("CREATE TABLE foo (t text)") # Prepare a transaction that will insert a row @@ -140,3 +136,28 @@ def test_twophase_nonzero_epoch( vanilla_pg.stop() # don't need the original server anymore twophase_test_on_timeline(env) + + +def test_twophase_at_wal_segment_start(neon_simple_env: NeonEnv): + """ + Same as 'test_twophase' test, but the server is started at an LSN at the beginning + of a WAL segment. We had a bug where we didn't initialize the "long XLOG page header" + at the beginning of the segment correctly, which was detected when the checkpointer + tried to read the XLOG_XACT_PREPARE record from the WAL, if that record was on the + very first page of a WAL segment and the server was started up at that first page. + """ + env = neon_simple_env + timeline_id = env.neon_cli.create_branch("test_twophase", "main") + + endpoint = env.endpoints.create_start( + "test_twophase", config_lines=["max_prepared_transactions=5"] + ) + endpoint.safe_psql("SELECT pg_switch_wal()") + + # to avoid hitting https://github.com/neondatabase/neon/issues/9079, wait for the + # WAL to reach the pageserver. + wait_for_wal_insert_lsn(env, endpoint, env.initial_tenant, timeline_id) + + endpoint.stop_and_destroy() + + twophase_test_on_timeline(env) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index f9c51c12438b..a38d15f3233a 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit f9c51c12438b20049b6905eb4e43d321defd6ff2 +Subproject commit a38d15f3233a4c07f2bf3335fcbd874dd1f4e386 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 1dbd6f316416..16c3c6b64f14 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 1dbd6f316416c8360bbd4f3d6db956cf70937cf0 +Subproject commit 16c3c6b64f1420a367a2a9b2510f20d94f791af8 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index d009084a745c..1d7081a3b076 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit d009084a745cb4d5e6de222c778b2a562c8b2767 +Subproject commit 1d7081a3b076ddf5086e0b118d4329820e6a7427 diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index dadd6fe208bb..2cf120e7393c 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit dadd6fe208bb906cc0a48980f2ab4e13c47ba3ad +Subproject commit 2cf120e7393ca5f537c6a38b457585576dc035fc diff --git a/vendor/revisions.json b/vendor/revisions.json index c93393970ffa..9f6512d03e85 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ "17rc1", - "dadd6fe208bb906cc0a48980f2ab4e13c47ba3ad" + "2cf120e7393ca5f537c6a38b457585576dc035fc" ], "v16": [ "16.4", - "d009084a745cb4d5e6de222c778b2a562c8b2767" + "1d7081a3b076ddf5086e0b118d4329820e6a7427" ], "v15": [ "15.8", - "1dbd6f316416c8360bbd4f3d6db956cf70937cf0" + "16c3c6b64f1420a367a2a9b2510f20d94f791af8" ], "v14": [ "14.13", - "f9c51c12438b20049b6905eb4e43d321defd6ff2" + "a38d15f3233a4c07f2bf3335fcbd874dd1f4e386" ] } From a3800dcb0cbe44678a4d823f324b951ca3a0d4f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Sat, 21 Sep 2024 14:36:41 +0200 Subject: [PATCH 27/77] Move load_timeline_metadata into separate function (#9080) Moves the per-timeline code to load timeline metadata into a new dedicated function called `load_timeline_metadata`. The old `load_timeline_metadata` becomes `load_timelines_metadata`. 
Split out of #8907 Part of #8088 --- pageserver/src/tenant.rs | 74 +++++++++++++++++++++------------------- 1 file changed, 38 insertions(+), 36 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index e328cd2044ee..be69f3d67f5d 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -18,7 +18,6 @@ use camino::Utf8Path; use camino::Utf8PathBuf; use enumset::EnumSet; use futures::stream::FuturesUnordered; -use futures::FutureExt; use futures::StreamExt; use pageserver_api::models; use pageserver_api::models::AuxFilePolicy; @@ -34,6 +33,7 @@ use remote_storage::GenericRemoteStorage; use remote_storage::TimeoutOrCancel; use std::collections::BTreeMap; use std::fmt; +use std::future::Future; use std::sync::Weak; use std::time::SystemTime; use storage_broker::BrokerClientChannel; @@ -1031,13 +1031,9 @@ impl Tenant { } Ok(TenantPreload { - timelines: Self::load_timeline_metadata( - self, - remote_timeline_ids, - remote_storage, - cancel, - ) - .await?, + timelines: self + .load_timelines_metadata(remote_timeline_ids, remote_storage, cancel) + .await?, }) } @@ -1303,7 +1299,7 @@ impl Tenant { .await } - async fn load_timeline_metadata( + async fn load_timelines_metadata( self: &Arc, timeline_ids: HashSet, remote_storage: &GenericRemoteStorage, @@ -1311,33 +1307,10 @@ impl Tenant { ) -> anyhow::Result> { let mut part_downloads = JoinSet::new(); for timeline_id in timeline_ids { - let client = RemoteTimelineClient::new( - remote_storage.clone(), - self.deletion_queue_client.clone(), - self.conf, - self.tenant_shard_id, - timeline_id, - self.generation, - ); let cancel_clone = cancel.clone(); part_downloads.spawn( - async move { - debug!("starting index part download"); - - let index_part = client.download_index_file(&cancel_clone).await; - - debug!("finished index part download"); - - Result::<_, anyhow::Error>::Ok(TimelinePreload { - client, - timeline_id, - index_part, - }) - } - .map(move |res| { - res.with_context(|| format!("download index part for timeline {timeline_id}")) - }) - .instrument(info_span!("download_index_part", %timeline_id)), + self.load_timeline_metadata(timeline_id, remote_storage.clone(), cancel_clone) + .instrument(info_span!("download_index_part", %timeline_id)), ); } @@ -1348,8 +1321,7 @@ impl Tenant { next = part_downloads.join_next() => { match next { Some(result) => { - let preload_result = result.context("join preload task")?; - let preload = preload_result?; + let preload = result.context("join preload task")?; timeline_preloads.insert(preload.timeline_id, preload); }, None => { @@ -1366,6 +1338,36 @@ impl Tenant { Ok(timeline_preloads) } + fn load_timeline_metadata( + self: &Arc, + timeline_id: TimelineId, + remote_storage: GenericRemoteStorage, + cancel: CancellationToken, + ) -> impl Future { + let client = RemoteTimelineClient::new( + remote_storage.clone(), + self.deletion_queue_client.clone(), + self.conf, + self.tenant_shard_id, + timeline_id, + self.generation, + ); + async move { + debug_assert_current_span_has_tenant_and_timeline_id(); + debug!("starting index part download"); + + let index_part = client.download_index_file(&cancel).await; + + debug!("finished index part download"); + + TimelinePreload { + client, + timeline_id, + index_part, + } + } + } + pub(crate) async fn apply_timeline_archival_config( &self, timeline_id: TimelineId, From c9b2ec9ff1937e8a9465f2b4abb4d1a91a059ea7 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sun, 22 Sep 2024 21:46:53 +0300 Subject: [PATCH 28/77] Check submodule 
forward progress (#8949)

We frequently mess up our submodule references. This adds one safeguard: it checks that the submodule references are only updated "forwards", not to some older commit or to a commit that is not a descendant of the previous one.

As a next step, I'm thinking that we should automate things so that when you merge a PR to the 'neon' repository that updates the submodule references, the REL_*_STABLE_neon branches are automatically updated to match the submodule references. That way, you never need to manually merge PRs in the postgres repository; it's all triggered from commits in the 'neon' repository. But that's not included here.
---
 .github/workflows/build_and_test.yml | 54 ++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index c1ec3f207b68..6617ca42bb54 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -120,6 +120,59 @@ jobs:
       - name: Run mypy to check types
         run: poetry run mypy .

+  # Check that the vendor/postgres-* submodules point to the
+  # corresponding REL_*_STABLE_neon branches.
+  check-submodules:
+    runs-on: ubuntu-22.04
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+
+      - uses: dorny/paths-filter@v3
+        id: check-if-submodules-changed
+        with:
+          filters: |
+            vendor:
+              - 'vendor/**'
+
+      - name: Check vendor/postgres-v14 submodule reference
+        if: steps.check-if-submodules-changed.outputs.vendor == 'true'
+        uses: jtmullen/submodule-branch-check-action@v1
+        with:
+          path: "vendor/postgres-v14"
+          fetch_depth: "50"
+          sub_fetch_depth: "50"
+          pass_if_unchanged: true
+
+      - name: Check vendor/postgres-v15 submodule reference
+        if: steps.check-if-submodules-changed.outputs.vendor == 'true'
+        uses: jtmullen/submodule-branch-check-action@v1
+        with:
+          path: "vendor/postgres-v15"
+          fetch_depth: "50"
+          sub_fetch_depth: "50"
+          pass_if_unchanged: true
+
+      - name: Check vendor/postgres-v16 submodule reference
+        if: steps.check-if-submodules-changed.outputs.vendor == 'true'
+        uses: jtmullen/submodule-branch-check-action@v1
+        with:
+          path: "vendor/postgres-v16"
+          fetch_depth: "50"
+          sub_fetch_depth: "50"
+          pass_if_unchanged: true
+
+      - name: Check vendor/postgres-v17 submodule reference
+        if: steps.check-if-submodules-changed.outputs.vendor == 'true'
+        uses: jtmullen/submodule-branch-check-action@v1
+        with:
+          path: "vendor/postgres-v17"
+          fetch_depth: "50"
+          sub_fetch_depth: "50"
+          pass_if_unchanged: true
+
   check-codestyle-rust:
     needs: [ check-permissions, build-build-tools-image ]
     strategy:
@@ -1154,6 +1207,7 @@ jobs:
   # Usually we do `needs: [...]`
   needs:
   - build-and-test-locally
+  - check-submodules
   - check-codestyle-python
   - check-codestyle-rust
   - promote-images

From ecd615ab6d45354d781e01f9247da2378f27b91c Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas
Date: Sat, 21 Sep 2024 00:46:56 +0300
Subject: [PATCH 29/77] Update "hostname" crate

We were already building v0.4.0 as an indirect dependency, so this avoids having to build two different versions of it.
--- Cargo.lock | 21 ++------------------- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 20 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 136f07956f6f..7d3b8f2a04c6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2411,17 +2411,6 @@ dependencies = [ "digest", ] -[[package]] -name = "hostname" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c731c3e10504cc8ed35cfe2f1db4c9274c3d35fa486e3b31df46f068ef3e867" -dependencies = [ - "libc", - "match_cfg", - "winapi", -] - [[package]] name = "hostname" version = "0.4.0" @@ -2974,12 +2963,6 @@ dependencies = [ "hashbrown 0.14.5", ] -[[package]] -name = "match_cfg" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ffbee8634e0d45d258acb448e7eaab3fce7a0a467395d4d9f228e3c1f01fb2e4" - [[package]] name = "matchers" version = "0.1.0" @@ -4350,7 +4333,7 @@ dependencies = [ "hashlink", "hex", "hmac", - "hostname 0.3.1", + "hostname", "http 1.1.0", "http-body-util", "humantime", @@ -5400,7 +5383,7 @@ version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eba8870c5dba2bfd9db25c75574a11429f6b95957b0a78ac02e2970dd7a5249a" dependencies = [ - "hostname 0.4.0", + "hostname", "libc", "os_info", "rustc_version", diff --git a/Cargo.toml b/Cargo.toml index fd1d4e016cc4..1871be7f2387 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -95,7 +95,7 @@ hdrhistogram = "7.5.2" hex = "0.4" hex-literal = "0.4" hmac = "0.12.1" -hostname = "0.3.1" +hostname = "0.4" http = {version = "1.1.0", features = ["std"]} http-types = { version = "2", default-features = false } humantime = "2.1" From 913af442195313af7b43559da1afbe896f0886c4 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sat, 21 Sep 2024 00:47:00 +0300 Subject: [PATCH 30/77] Update "memoffset" crate To eliminate one version of it from our dependency tree. --- Cargo.lock | 11 +---------- Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7d3b8f2a04c6..bd162f09dc5c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3055,15 +3055,6 @@ dependencies = [ "autocfg", ] -[[package]] -name = "memoffset" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d61c719bcfbcf5d62b3a09efa6088de8c54bc0bfcd3ea7ae39fcc186108b8de1" -dependencies = [ - "autocfg", -] - [[package]] name = "memoffset" version = "0.9.0" @@ -4123,7 +4114,7 @@ dependencies = [ "crc32c", "env_logger", "log", - "memoffset 0.8.0", + "memoffset 0.9.0", "once_cell", "postgres", "regex", diff --git a/Cargo.toml b/Cargo.toml index 1871be7f2387..b7f06b229616 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -113,7 +113,7 @@ libc = "0.2" md5 = "0.7.0" measured = { version = "0.0.22", features=["lasso"] } measured-process = { version = "0.0.22" } -memoffset = "0.8" +memoffset = "0.9" nix = { version = "0.27", features = ["dir", "fs", "process", "socket", "signal", "poll"] } notify = "6.0.0" num_cpus = "1.15" From 9f653893b9b57236fa184b08594c1f70c7222537 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sat, 21 Sep 2024 00:47:02 +0300 Subject: [PATCH 31/77] Update a few dependencies, removing some indirect dependencies cargo update ciborium iana-time-zone lazy_static schannel uuid cargo update hyper@0.14 cargo update --precise 2.9.7 ureq It might be worthwhile just update all our dependencies at some point, but this is aimed at pruning the dependency tree, to make the build a little faster. 
That's also why I didn't update ureq to the latest version: that would've added a dependency to yet another version of rustls. --- Cargo.lock | 275 ++++++++++---------------------------- workspace_hack/Cargo.toml | 2 + 2 files changed, 73 insertions(+), 204 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bd162f09dc5c..e4dbd8b33398 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -255,12 +255,6 @@ dependencies = [ "syn 2.0.52", ] -[[package]] -name = "atomic" -version = "0.5.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c59bdb34bc650a32731b31bd8f0829cc15d24a708ee31559e0bb34f2bc320cba" - [[package]] name = "atomic-take" version = "1.1.0" @@ -295,8 +289,8 @@ dependencies = [ "fastrand 2.0.0", "hex", "http 0.2.9", - "hyper 0.14.26", - "ring 0.17.6", + "hyper 0.14.30", + "ring", "time", "tokio", "tracing", @@ -486,7 +480,7 @@ dependencies = [ "once_cell", "p256 0.11.1", "percent-encoding", - "ring 0.17.6", + "ring", "sha2", "subtle", "time", @@ -593,7 +587,7 @@ dependencies = [ "http 0.2.9", "http-body 0.4.5", "http-body 1.0.0", - "hyper 0.14.26", + "hyper 0.14.30", "hyper-rustls 0.24.0", "once_cell", "pin-project-lite", @@ -684,7 +678,7 @@ dependencies = [ "futures-util", "http 0.2.9", "http-body 0.4.5", - "hyper 0.14.26", + "hyper 0.14.30", "itoa", "matchit 0.7.0", "memchr", @@ -1089,9 +1083,9 @@ dependencies = [ [[package]] name = "ciborium" -version = "0.2.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "effd91f6c78e5a4ace8a5d3c0b6bfaec9e2baaef55f3efc00e45fb2e477ee926" +checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" dependencies = [ "ciborium-io", "ciborium-ll", @@ -1100,18 +1094,18 @@ dependencies = [ [[package]] name = "ciborium-io" -version = "0.2.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cdf919175532b369853f5d5e20b26b43112613fd6fe7aee757e35f7a44642656" +checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" [[package]] name = "ciborium-ll" -version = "0.2.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "defaa24ecc093c77630e6c15e17c51f5e187bf35ee514f4e2d67baaa96dae22b" +checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" dependencies = [ "ciborium-io", - "half 1.8.2", + "half", ] [[package]] @@ -1224,7 +1218,7 @@ dependencies = [ "compute_api", "flate2", "futures", - "hyper 0.14.26", + "hyper 0.14.30", "nix 0.27.1", "notify", "num_cpus", @@ -1330,7 +1324,7 @@ dependencies = [ "git-version", "humantime", "humantime-serde", - "hyper 0.14.26", + "hyper 0.14.30", "nix 0.27.1", "once_cell", "pageserver_api", @@ -2304,12 +2298,6 @@ dependencies = [ "tracing", ] -[[package]] -name = "half" -version = "1.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" - [[package]] name = "half" version = "2.4.1" @@ -2419,7 +2407,7 @@ checksum = "f9c7c7c8ac16c798734b8a24560c1362120597c40d5e1459f09498f8f6c8f2ba" dependencies = [ "cfg-if", "libc", - "windows 0.52.0", + "windows", ] [[package]] @@ -2528,9 +2516,9 @@ dependencies = [ [[package]] name = "hyper" -version = "0.14.26" +version = "0.14.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab302d72a6f11a3b910431ff93aae7e773078c769f0a3ef15fb9ec692ed147d4" +checksum = "a152ddd61dfaec7273fe8419ab357f33aee0d914c5f4efbf0d96fa749eea5ec9" dependencies = [ 
"bytes", "futures-channel", @@ -2543,7 +2531,7 @@ dependencies = [ "httpdate", "itoa", "pin-project-lite", - "socket2 0.4.9", + "socket2", "tokio", "tower-service", "tracing", @@ -2578,7 +2566,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0646026eb1b3eea4cd9ba47912ea5ce9cc07713d105b1a14698f4e6433d348b7" dependencies = [ "http 0.2.9", - "hyper 0.14.26", + "hyper 0.14.30", "log", "rustls 0.21.11", "rustls-native-certs 0.6.2", @@ -2609,7 +2597,7 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1" dependencies = [ - "hyper 0.14.26", + "hyper 0.14.30", "pin-project-lite", "tokio", "tokio-io-timeout", @@ -2628,7 +2616,7 @@ dependencies = [ "http-body 1.0.0", "hyper 1.2.0", "pin-project-lite", - "socket2 0.5.5", + "socket2", "tokio", "tower", "tower-service", @@ -2637,16 +2625,16 @@ dependencies = [ [[package]] name = "iana-time-zone" -version = "0.1.56" +version = "0.1.61" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0722cd7114b7de04316e7ea5456a0bbb20e4adb46fd27a3697adb812cff0f37c" +checksum = "235e081f3925a06703c2d0117ea8b91f042756fd6e7a6e5d901e8ca1a996b220" dependencies = [ "android_system_properties", "core-foundation-sys", "iana-time-zone-haiku", "js-sys", "wasm-bindgen", - "windows 0.48.0", + "windows-core", ] [[package]] @@ -2859,7 +2847,7 @@ dependencies = [ "base64 0.21.1", "js-sys", "pem", - "ring 0.17.6", + "ring", "serde", "serde_json", "simple_asn1", @@ -2897,11 +2885,11 @@ dependencies = [ [[package]] name = "lazy_static" -version = "1.4.0" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" dependencies = [ - "spin 0.5.2", + "spin", ] [[package]] @@ -3634,7 +3622,7 @@ dependencies = [ "hex-literal", "humantime", "humantime-serde", - "hyper 0.14.26", + "hyper 0.14.30", "indoc", "itertools 0.10.5", "md5", @@ -3827,7 +3815,7 @@ dependencies = [ "ahash", "bytes", "chrono", - "half 2.4.1", + "half", "hashbrown 0.14.5", "num", "num-bigint", @@ -4329,7 +4317,7 @@ dependencies = [ "http-body-util", "humantime", "humantime-serde", - "hyper 0.14.26", + "hyper 0.14.30", "hyper 1.2.0", "hyper-util", "indexmap 2.0.1", @@ -4374,7 +4362,7 @@ dependencies = [ "signature 2.2.0", "smallvec", "smol_str", - "socket2 0.5.5", + "socket2", "subtle", "thiserror", "tikv-jemalloc-ctl", @@ -4552,7 +4540,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "48406db8ac1f3cbc7dcdb56ec355343817958a356ff430259bb07baf7607e1e1" dependencies = [ "pem", - "ring 0.17.6", + "ring", "time", "yasna", ] @@ -4576,7 +4564,7 @@ dependencies = [ "rustls-pki-types", "ryu", "sha1_smol", - "socket2 0.5.5", + "socket2", "tokio", "tokio-rustls 0.25.0", "tokio-util", @@ -4688,7 +4676,7 @@ dependencies = [ "futures-util", "http-types", "humantime-serde", - "hyper 0.14.26", + "hyper 0.14.30", "itertools 0.10.5", "metrics", "once_cell", @@ -4721,7 +4709,7 @@ dependencies = [ "h2 0.3.26", "http 0.2.9", "http-body 0.4.5", - "hyper 0.14.26", + "hyper 0.14.30", "hyper-rustls 0.24.0", "ipnet", "js-sys", @@ -4879,21 +4867,6 @@ dependencies = [ "subtle", ] -[[package]] -name = "ring" -version = "0.16.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3053cf52e236a3ed746dfc745aa9cacf1b791d846bdaf412f60a8d7d6e17c8fc" 
-dependencies = [ - "cc", - "libc", - "once_cell", - "spin 0.5.2", - "untrusted 0.7.1", - "web-sys", - "winapi", -] - [[package]] name = "ring" version = "0.17.6" @@ -4903,8 +4876,8 @@ dependencies = [ "cc", "getrandom 0.2.11", "libc", - "spin 0.9.8", - "untrusted 0.9.0", + "spin", + "untrusted", "windows-sys 0.48.0", ] @@ -4924,7 +4897,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "496c1d3718081c45ba9c31fbfc07417900aa96f4070ff90dc29961836b7a9945" dependencies = [ "http 0.2.9", - "hyper 0.14.26", + "hyper 0.14.30", "lazy_static", "percent-encoding", "regex", @@ -5048,7 +5021,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7fecbfb7b1444f477b345853b1fce097a2c6fb637b2bfb87e6bc5db0f043fae4" dependencies = [ "log", - "ring 0.17.6", + "ring", "rustls-webpki 0.101.7", "sct", ] @@ -5060,7 +5033,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bf4ef73721ac7bcd79b2b315da7779d8fc09718c6b3d2d1b2d94850eb8c18432" dependencies = [ "log", - "ring 0.17.6", + "ring", "rustls-pki-types", "rustls-webpki 0.102.2", "subtle", @@ -5117,24 +5090,14 @@ version = "1.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5ede67b28608b4c60685c7d54122d4400d90f62b40caee7700e700380a390fa8" -[[package]] -name = "rustls-webpki" -version = "0.100.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e98ff011474fa39949b7e5c0428f9b4937eda7da7848bbb947786b7be0b27dab" -dependencies = [ - "ring 0.16.20", - "untrusted 0.7.1", -] - [[package]] name = "rustls-webpki" version = "0.101.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b6275d1ee7a1cd780b64aca7726599a1dbc893b1e64144529e55c3c2f745765" dependencies = [ - "ring 0.17.6", - "untrusted 0.9.0", + "ring", + "untrusted", ] [[package]] @@ -5143,9 +5106,9 @@ version = "0.102.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "faaa0a62740bedb9b2ef5afa303da42764c012f743917351dc9a237ea1663610" dependencies = [ - "ring 0.17.6", + "ring", "rustls-pki-types", - "untrusted 0.9.0", + "untrusted", ] [[package]] @@ -5179,7 +5142,7 @@ dependencies = [ "git-version", "hex", "humantime", - "hyper 0.14.26", + "hyper 0.14.30", "metrics", "once_cell", "parking_lot 0.12.1", @@ -5236,11 +5199,11 @@ dependencies = [ [[package]] name = "schannel" -version = "0.1.21" +version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "713cfb06c7059f3588fb8044c0fad1d09e3c01d225e25b9220dbfdcf16dbb1b3" +checksum = "fbc91545643bcf3a0bbb6569265615222618bdf33ce4ffbbd13c4bbd4c093534" dependencies = [ - "windows-sys 0.42.0", + "windows-sys 0.52.0", ] [[package]] @@ -5264,8 +5227,8 @@ version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "da046153aa2352493d6cb7da4b6e5c0c057d8a1d0a9aa8560baffdd945acd414" dependencies = [ - "ring 0.17.6", - "untrusted 0.9.0", + "ring", + "untrusted", ] [[package]] @@ -5686,16 +5649,6 @@ dependencies = [ "serde", ] -[[package]] -name = "socket2" -version = "0.4.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64a4a911eed85daf18834cfaa86a79b7d266ff93ff5ba14005426219480ed662" -dependencies = [ - "libc", - "winapi", -] - [[package]] name = "socket2" version = "0.5.5" @@ -5706,12 +5659,6 @@ dependencies = [ "windows-sys 0.48.0", ] -[[package]] -name = "spin" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" - [[package]] name = "spin" version = "0.9.8" @@ -5757,7 +5704,7 @@ dependencies = [ "futures-util", "git-version", "humantime", - "hyper 0.14.26", + "hyper 0.14.30", "metrics", "once_cell", "parking_lot 0.12.1", @@ -5786,7 +5733,7 @@ dependencies = [ "git-version", "hex", "humantime", - "hyper 0.14.26", + "hyper 0.14.30", "itertools 0.10.5", "lasso", "measured", @@ -6202,7 +6149,7 @@ dependencies = [ "num_cpus", "pin-project-lite", "signal-hook-registry", - "socket2 0.5.5", + "socket2", "tokio-macros", "windows-sys 0.48.0", ] @@ -6262,7 +6209,7 @@ dependencies = [ "pin-project-lite", "postgres-protocol", "postgres-types", - "socket2 0.5.5", + "socket2", "tokio", "tokio-util", ] @@ -6274,7 +6221,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ea13f22eda7127c827983bdaf0d7fff9df21c8817bab02815ac277a21143677" dependencies = [ "futures", - "ring 0.17.6", + "ring", "rustls 0.22.4", "tokio", "tokio-postgres", @@ -6408,7 +6355,7 @@ dependencies = [ "h2 0.3.26", "http 0.2.9", "http-body 0.4.5", - "hyper 0.14.26", + "hyper 0.14.30", "hyper-timeout", "percent-encoding", "pin-project", @@ -6585,7 +6532,7 @@ dependencies = [ name = "tracing-utils" version = "0.1.0" dependencies = [ - "hyper 0.14.26", + "hyper 0.14.30", "opentelemetry", "opentelemetry-otlp", "opentelemetry-semantic-conventions", @@ -6688,12 +6635,6 @@ version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f962df74c8c05a667b5ee8bcf162993134c104e96440b663c8daa176dc772d8c" -[[package]] -name = "untrusted" -version = "0.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" - [[package]] name = "untrusted" version = "0.9.0" @@ -6702,17 +6643,18 @@ checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" [[package]] name = "ureq" -version = "2.7.1" +version = "2.9.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b11c96ac7ee530603dcdf68ed1557050f374ce55a5a07193ebf8cbc9f8927e9" +checksum = "d11a831e3c0b56e438a28308e7c810799e3c118417f342d30ecec080105395cd" dependencies = [ - "base64 0.21.1", + "base64 0.22.1", "log", "once_cell", - "rustls 0.21.11", - "rustls-webpki 0.100.2", + "rustls 0.22.4", + "rustls-pki-types", + "rustls-webpki 0.102.2", "url", - "webpki-roots 0.23.1", + "webpki-roots 0.26.1", ] [[package]] @@ -6776,7 +6718,7 @@ dependencies = [ "hex", "hex-literal", "humantime", - "hyper 0.14.26", + "hyper 0.14.30", "jsonwebtoken", "metrics", "nix 0.27.1", @@ -6811,11 +6753,10 @@ dependencies = [ [[package]] name = "uuid" -version = "1.6.1" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e395fcf16a7a3d8127ec99782007af141946b4795001f876d54fb0d55978560" +checksum = "81dfa00651efa65069b0b6b651f4aaa31ba9e3c3ce0137aaad053604ee7e0314" dependencies = [ - "atomic", "getrandom 0.2.11", "serde", ] @@ -7049,15 +6990,6 @@ dependencies = [ "wasm-bindgen", ] -[[package]] -name = "webpki-roots" -version = "0.23.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b03058f88386e5ff5310d9111d53f48b17d732b401aeb83a8d5190f2ac459338" -dependencies = [ - "rustls-webpki 0.100.2", -] - [[package]] name = "webpki-roots" version = "0.25.2" @@ -7126,15 +7058,6 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" -[[package]] -name = "windows" -version = "0.48.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f" -dependencies = [ - "windows-targets 0.48.0", -] - [[package]] name = "windows" version = "0.52.0" @@ -7154,21 +7077,6 @@ dependencies = [ "windows-targets 0.52.4", ] -[[package]] -name = "windows-sys" -version = "0.42.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7" -dependencies = [ - "windows_aarch64_gnullvm 0.42.2", - "windows_aarch64_msvc 0.42.2", - "windows_i686_gnu 0.42.2", - "windows_i686_msvc 0.42.2", - "windows_x86_64_gnu 0.42.2", - "windows_x86_64_gnullvm 0.42.2", - "windows_x86_64_msvc 0.42.2", -] - [[package]] name = "windows-sys" version = "0.48.0" @@ -7217,12 +7125,6 @@ dependencies = [ "windows_x86_64_msvc 0.52.4", ] -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8" - [[package]] name = "windows_aarch64_gnullvm" version = "0.48.0" @@ -7235,12 +7137,6 @@ version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9" -[[package]] -name = "windows_aarch64_msvc" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43" - [[package]] name = "windows_aarch64_msvc" version = "0.48.0" @@ -7253,12 +7149,6 @@ version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675" -[[package]] -name = "windows_i686_gnu" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f" - [[package]] name = "windows_i686_gnu" version = "0.48.0" @@ -7271,12 +7161,6 @@ version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3" -[[package]] -name = "windows_i686_msvc" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060" - [[package]] name = "windows_i686_msvc" version = "0.48.0" @@ -7289,12 +7173,6 @@ version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02" -[[package]] -name = "windows_x86_64_gnu" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36" - [[package]] name = "windows_x86_64_gnu" version = "0.48.0" @@ -7307,12 +7185,6 @@ version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03" -[[package]] -name = "windows_x86_64_gnullvm" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3" - [[package]] name = "windows_x86_64_gnullvm" version = "0.48.0" @@ 
-7325,12 +7197,6 @@ version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177" -[[package]] -name = "windows_x86_64_msvc" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0" - [[package]] name = "windows_x86_64_msvc" version = "0.48.0" @@ -7407,10 +7273,11 @@ dependencies = [ "futures-util", "generic-array", "getrandom 0.2.11", + "half", "hashbrown 0.14.5", "hex", "hmac", - "hyper 0.14.26", + "hyper 0.14.30", "indexmap 1.9.3", "itertools 0.10.5", "itertools 0.12.1", @@ -7478,7 +7345,7 @@ dependencies = [ "der 0.7.8", "hex", "pem", - "ring 0.17.6", + "ring", "signature 2.2.0", "spki 0.7.3", "thiserror", diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 662916d42ca1..e6d21e9434ef 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -45,6 +45,7 @@ futures-io = { version = "0.3" } futures-util = { version = "0.3", features = ["channel", "io", "sink"] } generic-array = { version = "0.14", default-features = false, features = ["more_lengths", "zeroize"] } getrandom = { version = "0.2", default-features = false, features = ["std"] } +half = { version = "2", default-features = false, features = ["num-traits"] } hashbrown = { version = "0.14", features = ["raw"] } hex = { version = "0.4", features = ["serde"] } hmac = { version = "0.12", default-features = false, features = ["reset"] } @@ -106,6 +107,7 @@ cc = { version = "1", default-features = false, features = ["parallel"] } chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] } either = { version = "1" } getrandom = { version = "0.2", default-features = false, features = ["std"] } +half = { version = "2", default-features = false, features = ["num-traits"] } hashbrown = { version = "0.14", features = ["raw"] } indexmap = { version = "1", default-features = false, features = ["std"] } itertools-5ef9efb8ec2df382 = { package = "itertools", version = "0.12", default-features = false, features = ["use_std"] } From e16e82749f52f623c092b2ed0dd205f50dd8cdb5 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sat, 21 Sep 2024 00:47:05 +0300 Subject: [PATCH 32/77] Remove unused crates from workspace Cargo.toml These were not referenced in any of the other Cargo.toml files in the workspace. They were not being built because of that, so there was little harm in having them listed, but let's be tidy. 
--- Cargo.toml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index b7f06b229616..a788dcf3cbc2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -76,8 +76,6 @@ clap = { version = "4.0", features = ["derive"] } comfy-table = "7.1" const_format = "0.2" crc32c = "0.6" -crossbeam-deque = "0.8.5" -crossbeam-utils = "0.8.5" dashmap = { version = "5.5.0", features = ["raw-api"] } either = "1.8" enum-map = "2.4.2" @@ -104,7 +102,6 @@ hyper = "0.14" tokio-tungstenite = "0.20.0" indexmap = "2" indoc = "2" -inotify = "0.10.2" ipnet = "2.9.0" itertools = "0.10" jsonwebtoken = "9" @@ -142,7 +139,6 @@ rpds = "0.13" rustc-hash = "1.1.0" rustls = "0.22" rustls-pemfile = "2" -rustls-split = "0.3" scopeguard = "1.1" sysinfo = "0.29.2" sd-notify = "0.4.1" @@ -164,7 +160,6 @@ strum_macros = "0.26" svg_fmt = "0.4.3" sync_wrapper = "0.1.2" tar = "0.4" -task-local-extensions = "0.1.4" test-context = "0.3" thiserror = "1.0" tikv-jemallocator = "0.5" From 5432155b0d161a332d6d8ec2933a875d9959e558 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Mon, 23 Sep 2024 10:05:02 +0100 Subject: [PATCH 33/77] storcon: update compute hook state on detach (#9045) ## Problem Previously, the storage controller may send compute notifications containing stale pageservers (i.e. pageserver serving the shard was detached). This happened because detaches did not update the compute hook state. ## Summary of Changes Update compute hook state on shard detach. Fixes #8928 --- storage_controller/src/compute_hook.rs | 61 ++++++++++++++++++++++++++ storage_controller/src/reconciler.rs | 10 +++++ 2 files changed, 71 insertions(+) diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index c46539485c1f..bafae1f5515b 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -71,6 +71,37 @@ impl ComputeHookTenant { } } + fn is_sharded(&self) -> bool { + matches!(self, ComputeHookTenant::Sharded(_)) + } + + /// Clear compute hook state for the specified shard. + /// Only valid for [`ComputeHookTenant::Sharded`] instances. + fn remove_shard(&mut self, tenant_shard_id: TenantShardId, stripe_size: ShardStripeSize) { + match self { + ComputeHookTenant::Sharded(sharded) => { + if sharded.stripe_size != stripe_size + || sharded.shard_count != tenant_shard_id.shard_count + { + tracing::warn!("Shard split detected while handling detach") + } + + let shard_idx = sharded.shards.iter().position(|(shard_number, _node_id)| { + *shard_number == tenant_shard_id.shard_number + }); + + if let Some(shard_idx) = shard_idx { + sharded.shards.remove(shard_idx); + } else { + tracing::warn!("Shard not found while handling detach") + } + } + ComputeHookTenant::Unsharded(_) => { + unreachable!("Detach of unsharded tenants is handled externally"); + } + } + } + /// Set one shard's location. If stripe size or shard count have changed, Self is reset /// and drops existing content. fn update( @@ -614,6 +645,36 @@ impl ComputeHook { self.notify_execute(maybe_send_result, tenant_shard_id, cancel) .await } + + /// Reflect a detach for a particular shard in the compute hook state. + /// + /// The goal is to avoid sending compute notifications with stale information (i.e. + /// including detach pageservers). 
+    #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
+    pub(super) fn handle_detach(
+        &self,
+        tenant_shard_id: TenantShardId,
+        stripe_size: ShardStripeSize,
+    ) {
+        use std::collections::hash_map::Entry;
+
+        let mut state_locked = self.state.lock().unwrap();
+        match state_locked.entry(tenant_shard_id.tenant_id) {
+            Entry::Vacant(_) => {
+                tracing::warn!("Compute hook tenant not found for detach");
+            }
+            Entry::Occupied(mut e) => {
+                let sharded = e.get().is_sharded();
+                if !sharded {
+                    e.remove();
+                } else {
+                    e.get_mut().remove_shard(tenant_shard_id, stripe_size);
+                }
+
+                tracing::debug!("Compute hook handled shard detach");
+            }
+        }
+    }
 }
 
 #[cfg(test)]
diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs
index 83b7b2b4f2ac..750bcd7c0138 100644
--- a/storage_controller/src/reconciler.rs
+++ b/storage_controller/src/reconciler.rs
@@ -820,6 +820,16 @@ impl Reconciler {
             self.location_config(&node, conf, None, false).await?;
         }
 
+        // The condition below identifies a detach. We must have no attached intent and
+        // must have been attached to something previously. Pass this information to
+        // the [`ComputeHook`] such that it can update its tenant-wide state.
+        if self.intent.attached.is_none() && !self.detach.is_empty() {
+            // TODO: Consider notifying control plane about detaches. This would avoid situations
+            // where the compute tries to start up with a stale set of pageservers.
+            self.compute_hook
+                .handle_detach(self.tenant_shard_id, self.shard.stripe_size);
+        }
+
         failpoint_support::sleep_millis_async!("sleep-on-reconcile-epilogue");
 
         Ok(())

From 59b4c2eaf956eb17d6360cfa94c05c830f1b535a Mon Sep 17 00:00:00 2001
From: Christian Schwarz
Date: Mon, 23 Sep 2024 12:19:37 +0200
Subject: [PATCH 34/77] walredo: add a ping method (#8952)

Not used in production, but in benchmarks, to demonstrate minimal RTT.
(It would be nice to not have to copy the 8KiB of zeroes, but that
would require larger protocol changes).

Found this useful in investigation
https://github.com/neondatabase/neon/pull/8952.

---
 pageserver/benches/bench_walredo.rs        | 136 +++++++++++++--------
 pageserver/src/walredo.rs                  |  30 +++++
 pageserver/src/walredo/process.rs          |  21 ++++
 pageserver/src/walredo/process/protocol.rs |   5 +
 pgxn/neon_walredo/walredoproc.c            |  36 ++++++
 5 files changed, 176 insertions(+), 52 deletions(-)

diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs
index edc09d0bf22a..45936cb3fafa 100644
--- a/pageserver/benches/bench_walredo.rs
+++ b/pageserver/benches/bench_walredo.rs
@@ -1,7 +1,7 @@
 //! Quantify a single walredo manager's throughput under N concurrent callers.
 //!
 //! The benchmark implementation ([`bench_impl`]) is parametrized by
-//! - `redo_work` => [`Request::short_request`] or [`Request::medium_request`]
+//! - `redo_work` => an async closure that takes a `PostgresRedoManager` and performs one redo
 //! - `n_redos` => number of times the benchmark shall execute the `redo_work`
 //! - `nclients` => number of clients (more on this shortly).
 //!
@@ -10,7 +10,7 @@
 //! Each task executes the `redo_work` `n_redos/nclients` times.
 //!
 //! We exercise the following combinations:
-//! - `redo_work = short / medium``
+//! - `redo_work = ping / short / medium``
 //! - `nclients = [1, 2, 4, 8, 16, 32, 64, 128]`
 //!
 //! We let `criterion` determine the `n_redos` using `iter_custom`.
@@ -27,33 +27,43 @@
 //!
 //! # Reference Numbers
 //!
-//! 2024-04-15 on i3en.3xlarge
+//! 
2024-09-18 on im4gn.2xlarge //! //! ```text -//! short/1 time: [24.584 µs 24.737 µs 24.922 µs] -//! short/2 time: [33.479 µs 33.660 µs 33.888 µs] -//! short/4 time: [42.713 µs 43.046 µs 43.440 µs] -//! short/8 time: [71.814 µs 72.478 µs 73.240 µs] -//! short/16 time: [132.73 µs 134.45 µs 136.22 µs] -//! short/32 time: [258.31 µs 260.73 µs 263.27 µs] -//! short/64 time: [511.61 µs 514.44 µs 517.51 µs] -//! short/128 time: [992.64 µs 998.23 µs 1.0042 ms] -//! medium/1 time: [110.11 µs 110.50 µs 110.96 µs] -//! medium/2 time: [153.06 µs 153.85 µs 154.99 µs] -//! medium/4 time: [317.51 µs 319.92 µs 322.85 µs] -//! medium/8 time: [638.30 µs 644.68 µs 652.12 µs] -//! medium/16 time: [1.2651 ms 1.2773 ms 1.2914 ms] -//! medium/32 time: [2.5117 ms 2.5410 ms 2.5720 ms] -//! medium/64 time: [4.8088 ms 4.8555 ms 4.9047 ms] -//! medium/128 time: [8.8311 ms 8.9849 ms 9.1263 ms] +//! ping/1 time: [21.789 µs 21.918 µs 22.078 µs] +//! ping/2 time: [27.686 µs 27.812 µs 27.970 µs] +//! ping/4 time: [35.468 µs 35.671 µs 35.926 µs] +//! ping/8 time: [59.682 µs 59.987 µs 60.363 µs] +//! ping/16 time: [101.79 µs 102.37 µs 103.08 µs] +//! ping/32 time: [184.18 µs 185.15 µs 186.36 µs] +//! ping/64 time: [349.86 µs 351.45 µs 353.47 µs] +//! ping/128 time: [684.53 µs 687.98 µs 692.17 µs] +//! short/1 time: [31.833 µs 32.126 µs 32.428 µs] +//! short/2 time: [35.558 µs 35.756 µs 35.992 µs] +//! short/4 time: [44.850 µs 45.138 µs 45.484 µs] +//! short/8 time: [65.985 µs 66.379 µs 66.853 µs] +//! short/16 time: [127.06 µs 127.90 µs 128.87 µs] +//! short/32 time: [252.98 µs 254.70 µs 256.73 µs] +//! short/64 time: [497.13 µs 499.86 µs 503.26 µs] +//! short/128 time: [987.46 µs 993.45 µs 1.0004 ms] +//! medium/1 time: [137.91 µs 138.55 µs 139.35 µs] +//! medium/2 time: [192.00 µs 192.91 µs 194.07 µs] +//! medium/4 time: [389.62 µs 391.55 µs 394.01 µs] +//! medium/8 time: [776.80 µs 780.33 µs 784.77 µs] +//! medium/16 time: [1.5323 ms 1.5383 ms 1.5459 ms] +//! medium/32 time: [3.0120 ms 3.0226 ms 3.0350 ms] +//! medium/64 time: [5.7405 ms 5.7787 ms 5.8166 ms] +//! medium/128 time: [10.412 ms 10.574 ms 10.718 ms] //! ``` use anyhow::Context; use bytes::{Buf, Bytes}; use criterion::{BenchmarkId, Criterion}; +use once_cell::sync::Lazy; use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager}; use pageserver_api::{key::Key, shard::TenantShardId}; use std::{ + future::Future, sync::Arc, time::{Duration, Instant}, }; @@ -61,40 +71,59 @@ use tokio::{sync::Barrier, task::JoinSet}; use utils::{id::TenantId, lsn::Lsn}; fn bench(c: &mut Criterion) { - { - let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; - for nclients in nclients { - let mut group = c.benchmark_group("short"); - group.bench_with_input( - BenchmarkId::from_parameter(nclients), - &nclients, - |b, nclients| { - let redo_work = Arc::new(Request::short_input()); - b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients)); - }, - ); - } - } - { - let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; - for nclients in nclients { - let mut group = c.benchmark_group("medium"); - group.bench_with_input( - BenchmarkId::from_parameter(nclients), - &nclients, - |b, nclients| { - let redo_work = Arc::new(Request::medium_input()); - b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients)); - }, - ); - } + macro_rules! 
bench_group { + ($name:expr, $redo_work:expr) => {{ + let name: &str = $name; + let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; + for nclients in nclients { + let mut group = c.benchmark_group(name); + group.bench_with_input( + BenchmarkId::from_parameter(nclients), + &nclients, + |b, nclients| { + b.iter_custom(|iters| bench_impl($redo_work, iters, *nclients)); + }, + ); + } + }}; } + // + // benchmark the protocol implementation + // + let pg_version = 14; + bench_group!( + "ping", + Arc::new(move |mgr: Arc| async move { + let _: () = mgr.ping(pg_version).await.unwrap(); + }) + ); + // + // benchmarks with actual record redo + // + let make_redo_work = |req: &'static Request| { + Arc::new(move |mgr: Arc| async move { + let page = req.execute(&mgr).await.unwrap(); + assert_eq!(page.remaining(), 8192); + }) + }; + bench_group!("short", { + static REQUEST: Lazy = Lazy::new(Request::short_input); + make_redo_work(&REQUEST) + }); + bench_group!("medium", { + static REQUEST: Lazy = Lazy::new(Request::medium_input); + make_redo_work(&REQUEST) + }); } criterion::criterion_group!(benches, bench); criterion::criterion_main!(benches); // Returns the sum of each client's wall-clock time spent executing their share of the n_redos. -fn bench_impl(redo_work: Arc, n_redos: u64, nclients: u64) -> Duration { +fn bench_impl(redo_work: Arc, n_redos: u64, nclients: u64) -> Duration +where + F: Fn(Arc) -> Fut + Send + Sync + 'static, + Fut: Future + Send + 'static, +{ let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap(); let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf()); @@ -135,17 +164,20 @@ fn bench_impl(redo_work: Arc, n_redos: u64, nclients: u64) -> Duration }) } -async fn client( +async fn client( mgr: Arc, start: Arc, - redo_work: Arc, + redo_work: Arc, n_redos: u64, -) -> Duration { +) -> Duration +where + F: Fn(Arc) -> Fut + Send + Sync + 'static, + Fut: Future + Send + 'static, +{ start.wait().await; let start = Instant::now(); for _ in 0..n_redos { - let page = redo_work.execute(&mgr).await.unwrap(); - assert_eq!(page.remaining(), 8192); + redo_work(Arc::clone(&mgr)).await; // The real pageserver will rarely if ever do 2 walredos in a row without // yielding to the executor. tokio::task::yield_now().await; diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 0fe7def8b05d..a1c9fc5651b1 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -205,6 +205,22 @@ impl PostgresRedoManager { } } + /// Do a ping request-response roundtrip. + /// + /// Not used in production, but by Rust benchmarks. + /// + /// # Cancel-Safety + /// + /// This method is cancellation-safe. + pub async fn ping(&self, pg_version: u32) -> Result<(), Error> { + self.do_with_walredo_process(pg_version, |proc| async move { + proc.ping(Duration::from_secs(1)) + .await + .map_err(Error::Other) + }) + .await + } + pub fn status(&self) -> WalRedoManagerStatus { WalRedoManagerStatus { last_redo_at: { @@ -297,6 +313,9 @@ impl PostgresRedoManager { } } + /// # Cancel-Safety + /// + /// This method is cancel-safe iff `closure` is cancel-safe. 
async fn do_with_walredo_process< F: FnOnce(Arc) -> Fut, Fut: Future>, @@ -537,6 +556,17 @@ mod tests { use tracing::Instrument; use utils::{id::TenantId, lsn::Lsn}; + #[tokio::test] + async fn test_ping() { + let h = RedoHarness::new().unwrap(); + + h.manager + .ping(14) + .instrument(h.span()) + .await + .expect("ping should work"); + } + #[tokio::test] async fn short_v14_redo() { let expected = std::fs::read("test_data/short_v14_redo.page").unwrap(); diff --git a/pageserver/src/walredo/process.rs b/pageserver/src/walredo/process.rs index 9140d4f6aa83..f3197e68b51f 100644 --- a/pageserver/src/walredo/process.rs +++ b/pageserver/src/walredo/process.rs @@ -6,6 +6,7 @@ use self::no_leak_child::NoLeakChild; use crate::{ config::PageServerConf, metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER}, + page_cache::PAGE_SZ, span::debug_assert_current_span_has_tenant_id, walrecord::NeonWalRecord, }; @@ -237,6 +238,26 @@ impl WalRedoProcess { res } + /// Do a ping request-response roundtrip. + /// + /// Not used in production, but by Rust benchmarks. + pub(crate) async fn ping(&self, timeout: Duration) -> anyhow::Result<()> { + let mut writebuf: Vec = Vec::with_capacity(4); + protocol::build_ping_msg(&mut writebuf); + let Ok(res) = tokio::time::timeout(timeout, self.apply_wal_records0(&writebuf)).await + else { + anyhow::bail!("WAL redo ping timed out"); + }; + let response = res?; + if response.len() != PAGE_SZ { + anyhow::bail!( + "WAL redo ping response should respond with page-sized response: {}", + response.len() + ); + } + Ok(()) + } + /// # Cancel-Safety /// /// When not polled to completion (e.g. because in `tokio::select!` another diff --git a/pageserver/src/walredo/process/protocol.rs b/pageserver/src/walredo/process/protocol.rs index b703344cc8a9..de3ca8741b2e 100644 --- a/pageserver/src/walredo/process/protocol.rs +++ b/pageserver/src/walredo/process/protocol.rs @@ -55,3 +55,8 @@ pub(crate) fn build_get_page_msg(tag: BufferTag, buf: &mut Vec) { tag.ser_into(buf) .expect("serialize BufferTag should always succeed"); } + +pub(crate) fn build_ping_msg(buf: &mut Vec) { + buf.put_u8(b'H'); + buf.put_u32(4); +} diff --git a/pgxn/neon_walredo/walredoproc.c b/pgxn/neon_walredo/walredoproc.c index 219ca852073d..f98aa1cbe7ec 100644 --- a/pgxn/neon_walredo/walredoproc.c +++ b/pgxn/neon_walredo/walredoproc.c @@ -24,6 +24,7 @@ * PushPage ('P'): Copy a page image (in the payload) to buffer cache * ApplyRecord ('A'): Apply a WAL record (in the payload) * GetPage ('G'): Return a page image from buffer cache. + * Ping ('H'): Return the input message. * * Currently, you only get a response to GetPage requests; the response is * simply a 8k page, without any headers. Errors are logged to stderr. @@ -133,6 +134,7 @@ static void ApplyRecord(StringInfo input_message); static void apply_error_callback(void *arg); static bool redo_block_filter(XLogReaderState *record, uint8 block_id); static void GetPage(StringInfo input_message); +static void Ping(StringInfo input_message); static ssize_t buffered_read(void *buf, size_t count); static void CreateFakeSharedMemoryAndSemaphores(); @@ -394,6 +396,10 @@ WalRedoMain(int argc, char *argv[]) GetPage(&input_message); break; + case 'H': /* Ping */ + Ping(&input_message); + break; + /* * EOF means we're done. Perform normal shutdown. 
*/ @@ -1057,6 +1063,36 @@ GetPage(StringInfo input_message) } +static void +Ping(StringInfo input_message) +{ + int tot_written; + /* Response: the input message */ + tot_written = 0; + do { + ssize_t rc; + /* We don't need alignment, but it's bad practice to use char[BLCKSZ] */ +#if PG_VERSION_NUM >= 160000 + static const PGIOAlignedBlock response; +#else + static const PGAlignedBlock response; +#endif + rc = write(STDOUT_FILENO, &response.data[tot_written], BLCKSZ - tot_written); + if (rc < 0) { + /* If interrupted by signal, just retry */ + if (errno == EINTR) + continue; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to stdout: %m"))); + } + tot_written += rc; + } while (tot_written < BLCKSZ); + + elog(TRACE, "Page sent back for ping"); +} + + /* Buffer used by buffered_read() */ static char stdin_buf[16 * 1024]; static size_t stdin_len = 0; /* # of bytes in buffer */ From 4d5add9ca03462f14b6e63df55e6da6ed32c3d4d Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 23 Sep 2024 15:05:22 +0200 Subject: [PATCH 35/77] compact_level0_phase1: remove final traces of value access mode config (#8935) refs https://github.com/neondatabase/neon/issues/8184 stacked atop https://github.com/neondatabase/neon/pull/8934 This PR changes from ignoring the config field to rejecting configs that contain it. PR https://github.com/neondatabase/infra/pull/1903 removes the field usage from `pageserver.toml`. It rolls into prod sooner or in the same release as this PR. --- libs/pageserver_api/src/config.rs | 4 ---- pageserver/src/config.rs | 11 ----------- 2 files changed, 15 deletions(-) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 61e32bc9ab67..95310fdbac2b 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -104,9 +104,6 @@ pub struct ConfigToml { pub image_compression: ImageCompressionAlgorithm, pub ephemeral_bytes_per_memory_kb: usize, pub l0_flush: Option, - #[serde(skip_serializing)] - // TODO(https://github.com/neondatabase/neon/issues/8184): remove after this field is removed from all pageserver.toml's - pub compact_level0_phase1_value_access: serde::de::IgnoredAny, pub virtual_file_direct_io: crate::models::virtual_file::DirectIoMode, pub io_buffer_alignment: usize, } @@ -384,7 +381,6 @@ impl Default for ConfigToml { image_compression: (DEFAULT_IMAGE_COMPRESSION), ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), l0_flush: None, - compact_level0_phase1_value_access: Default::default(), virtual_file_direct_io: crate::models::virtual_file::DirectIoMode::default(), io_buffer_alignment: DEFAULT_IO_BUFFER_ALIGNMENT, diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 8567c6aa52f1..e15f1c791b78 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -324,7 +324,6 @@ impl PageServerConf { max_vectored_read_bytes, image_compression, ephemeral_bytes_per_memory_kb, - compact_level0_phase1_value_access: _, l0_flush, virtual_file_direct_io, concurrent_tenant_warmup, @@ -535,16 +534,6 @@ mod tests { .expect("parse_and_validate"); } - #[test] - fn test_compactl0_phase1_access_mode_is_ignored_silently() { - let input = indoc::indoc! 
{r#" - [compact_level0_phase1_value_access] - mode = "streaming-kmerge" - validate = "key-lsn-value" - "#}; - toml_edit::de::from_str::(input).unwrap(); - } - /// If there's a typo in the pageserver config, we'd rather catch that typo /// and fail pageserver startup than silently ignoring the typo, leaving whoever /// made it in the believe that their config change is effective. From f446e08fb8ac68d5957b239d7f11c8f99536c960 Mon Sep 17 00:00:00 2001 From: Nikita Kalyanov <44959448+nikitakalyanov@users.noreply.github.com> Date: Mon, 23 Sep 2024 16:53:06 +0300 Subject: [PATCH 36/77] change HTTP method to comply with spec (#9100) There is discrepancy with the spec, it has PUT --- pageserver/client/src/mgmt_api.rs | 2 +- pageserver/src/http/routes.rs | 2 +- storage_controller/src/http.rs | 2 +- storage_controller/src/pageserver_client.rs | 2 +- test_runner/fixtures/pageserver/http.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index a68f45a6d9db..2d95ac42e607 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -432,7 +432,7 @@ impl Client { self.mgmt_api_endpoint ); - self.request(Method::POST, &uri, req) + self.request(Method::PUT, &uri, req) .await? .json() .await diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index d645f3b7b647..6a10d4fb1c3e 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -2955,7 +2955,7 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/preserve_initdb_archive", |r| api_handler(r, timeline_preserve_initdb_handler), ) - .post( + .put( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/archival_config", |r| api_handler(r, timeline_archival_config_handler), ) diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 1745bf557525..95e4a469ac95 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -1849,7 +1849,7 @@ pub fn make_router( RequestName("v1_tenant_timeline"), ) }) - .post( + .put( "/v1/tenant/:tenant_id/timeline/:timeline_id/archival_config", |r| { tenant_service_handler( diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs index 961a1f78ddca..b19cbc4fa3cf 100644 --- a/storage_controller/src/pageserver_client.rs +++ b/storage_controller/src/pageserver_client.rs @@ -238,7 +238,7 @@ impl PageserverClient { ) -> Result<()> { measured_request!( "timeline_archival_config", - crate::metrics::Method::Post, + crate::metrics::Method::Put, &self.node_id_label, self.inner .timeline_archival_config(tenant_shard_id, timeline_id, req) diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 582f9c026439..0dd557c59f28 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -631,7 +631,7 @@ def timeline_archival_config( log.info( f"requesting timeline archival config {config} for tenant {tenant_id} and timeline {timeline_id}" ) - res = self.post( + res = self.put( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/archival_config", json=config, ) From 29699529dfdd4642d71e018047071a01dacb0cf0 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." 
<4198311+skyzh@users.noreply.github.com> Date: Mon, 23 Sep 2024 12:30:44 -0400 Subject: [PATCH 37/77] feat(pageserver): filter keys with gc-compaction (#9004) Part of https://github.com/neondatabase/neon/issues/8002 Close https://github.com/neondatabase/neon/issues/8920 Legacy compaction (as well as gc-compaction) rely on the GC process to remove unused layer files, but this relies on many factors (i.e., key partition) to ensure data in a dropped table can be eventually removed. In gc-compaction, we consider the keyspace information when doing the compaction process. If a key is not in the keyspace, we will skip that key and not include it in the final output. However, this is not easy to implement because gc-compaction considers branch points (i.e., retain_lsns) and the retained keyspaces could change across different LSNs. Therefore, for now, we only remove aux v1 keys in the compaction process. ## Summary of changes * Add `FilterIterator` to filter out keys. * Integrate `FilterIterator` with gc-compaction. * Add `collect_gc_compaction_keyspace` for a spec of keyspaces that can be retained during the gc-compaction process. --------- Signed-off-by: Alex Chi Z --- pageserver/src/pgdatadir_mapping.rs | 30 +++ pageserver/src/tenant/storage_layer.rs | 2 +- .../tenant/storage_layer/filter_iterator.rs | 205 ++++++++++++++++++ pageserver/src/tenant/timeline/compaction.rs | 9 +- 4 files changed, 244 insertions(+), 2 deletions(-) create mode 100644 pageserver/src/tenant/storage_layer/filter_iterator.rs diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 5f8766ca2c51..7aa313f03143 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -840,6 +840,36 @@ impl Timeline { Ok(total_size * BLCKSZ as u64) } + /// Get a KeySpace that covers all the Keys that are in use at AND below the given LSN. This is only used + /// for gc-compaction. + /// + /// gc-compaction cannot use the same `collect_keyspace` function as the legacy compaction because it + /// processes data at multiple LSNs and needs to be aware of the fact that some key ranges might need to + /// be kept only for a specific range of LSN. + /// + /// Consider the case that the user created branches at LSN 10 and 20, where the user created a table A at + /// LSN 10 and dropped that table at LSN 20. `collect_keyspace` at LSN 10 will return the key range + /// corresponding to that table, while LSN 20 won't. The keyspace info at a single LSN is not enough to + /// determine which keys to retain/drop for gc-compaction. + /// + /// For now, it only drops AUX-v1 keys. But in the future, the function will be extended to return the keyspace + /// to be retained for each of the branch LSN. + /// + /// The return value is (dense keyspace, sparse keyspace). + pub(crate) async fn collect_gc_compaction_keyspace( + &self, + ) -> Result<(KeySpace, SparseKeySpace), CollectKeySpaceError> { + let metadata_key_begin = Key::metadata_key_range().start; + let aux_v1_key = AUX_FILES_KEY; + let dense_keyspace = KeySpace { + ranges: vec![Key::MIN..aux_v1_key, aux_v1_key.next()..metadata_key_begin], + }; + Ok(( + dense_keyspace, + SparseKeySpace(KeySpace::single(Key::metadata_key_range())), + )) + } + /// /// Get a KeySpace that covers all the Keys that are in use at the given LSN. 
/// Anything that's not listed maybe removed from the underlying storage (from diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index cd252aa37132..99bd0ece5763 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -1,13 +1,13 @@ //! Common traits and structs for layers pub mod delta_layer; +pub mod filter_iterator; pub mod image_layer; pub mod inmemory_layer; pub(crate) mod layer; mod layer_desc; mod layer_name; pub mod merge_iterator; - pub mod split_writer; use crate::context::{AccessStatsBehavior, RequestContext}; diff --git a/pageserver/src/tenant/storage_layer/filter_iterator.rs b/pageserver/src/tenant/storage_layer/filter_iterator.rs new file mode 100644 index 000000000000..f45dd4b80198 --- /dev/null +++ b/pageserver/src/tenant/storage_layer/filter_iterator.rs @@ -0,0 +1,205 @@ +use std::ops::Range; + +use anyhow::bail; +use pageserver_api::{ + key::Key, + keyspace::{KeySpace, SparseKeySpace}, +}; +use utils::lsn::Lsn; + +use crate::repository::Value; + +use super::merge_iterator::MergeIterator; + +/// A filter iterator over merge iterators (and can be easily extended to other types of iterators). +/// +/// The iterator will skip any keys not included in the keyspace filter. In other words, the keyspace filter contains the keys +/// to be retained. +pub struct FilterIterator<'a> { + inner: MergeIterator<'a>, + retain_key_filters: Vec>, + current_filter_idx: usize, +} + +impl<'a> FilterIterator<'a> { + pub fn create( + inner: MergeIterator<'a>, + dense_keyspace: KeySpace, + sparse_keyspace: SparseKeySpace, + ) -> anyhow::Result { + let mut retain_key_filters = Vec::new(); + retain_key_filters.extend(dense_keyspace.ranges); + retain_key_filters.extend(sparse_keyspace.0.ranges); + retain_key_filters.sort_by(|a, b| a.start.cmp(&b.start)); + // Verify key filters are non-overlapping and sorted + for window in retain_key_filters.windows(2) { + if window[0].end > window[1].start { + bail!( + "Key filters are overlapping: {:?} and {:?}", + window[0], + window[1] + ); + } + } + Ok(Self { + inner, + retain_key_filters, + current_filter_idx: 0, + }) + } + + pub async fn next(&mut self) -> anyhow::Result> { + while let Some(item) = self.inner.next().await? { + while self.current_filter_idx < self.retain_key_filters.len() + && item.0 >= self.retain_key_filters[self.current_filter_idx].end + { + // [filter region] [filter region] [filter region] + // ^ item + // ^ current filter + self.current_filter_idx += 1; + // [filter region] [filter region] [filter region] + // ^ item + // ^ current filter + } + if self.current_filter_idx >= self.retain_key_filters.len() { + // We already exhausted all filters, so we should return now + // [filter region] [filter region] [filter region] + // ^ item + // ^ current filter (nothing) + return Ok(None); + } + if self.retain_key_filters[self.current_filter_idx].contains(&item.0) { + // [filter region] [filter region] [filter region] + // ^ item + // ^ current filter + return Ok(Some(item)); + } + // If the key is not contained in the key retaining filters, continue to the next item. 
+ // [filter region] [filter region] [filter region] + // ^ item + // ^ current filter + } + Ok(None) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use itertools::Itertools; + use pageserver_api::key::Key; + use utils::lsn::Lsn; + + use crate::{ + tenant::{ + harness::{TenantHarness, TIMELINE_ID}, + storage_layer::delta_layer::test::produce_delta_layer, + }, + DEFAULT_PG_VERSION, + }; + + async fn assert_filter_iter_equal( + filter_iter: &mut FilterIterator<'_>, + expect: &[(Key, Lsn, Value)], + ) { + let mut expect_iter = expect.iter(); + loop { + let o1 = filter_iter.next().await.unwrap(); + let o2 = expect_iter.next(); + assert_eq!(o1.is_some(), o2.is_some()); + if o1.is_none() && o2.is_none() { + break; + } + let (k1, l1, v1) = o1.unwrap(); + let (k2, l2, v2) = o2.unwrap(); + assert_eq!(&k1, k2); + assert_eq!(l1, *l2); + assert_eq!(&v1, v2); + } + } + + #[tokio::test] + async fn filter_keyspace_iterator() { + use crate::repository::Value; + use bytes::Bytes; + + let harness = TenantHarness::create("filter_iterator_filter_keyspace_iterator") + .await + .unwrap(); + let (tenant, ctx) = harness.load().await; + + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + fn get_key(id: u32) -> Key { + let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + const N: usize = 100; + let test_deltas1 = (0..N) + .map(|idx| { + ( + get_key(idx as u32), + Lsn(0x20 * ((idx as u64) % 10 + 1)), + Value::Image(Bytes::from(format!("img{idx:05}"))), + ) + }) + .collect_vec(); + let resident_layer_1 = produce_delta_layer(&tenant, &tline, test_deltas1.clone(), &ctx) + .await + .unwrap(); + + let merge_iter = MergeIterator::create( + &[resident_layer_1.get_as_delta(&ctx).await.unwrap()], + &[], + &ctx, + ); + + let mut filter_iter = FilterIterator::create( + merge_iter, + KeySpace { + ranges: vec![ + get_key(5)..get_key(10), + get_key(20)..get_key(30), + get_key(90)..get_key(110), + get_key(1000)..get_key(2000), + ], + }, + SparseKeySpace(KeySpace::default()), + ) + .unwrap(); + let mut result = Vec::new(); + result.extend(test_deltas1[5..10].iter().cloned()); + result.extend(test_deltas1[20..30].iter().cloned()); + result.extend(test_deltas1[90..100].iter().cloned()); + assert_filter_iter_equal(&mut filter_iter, &result).await; + + let merge_iter = MergeIterator::create( + &[resident_layer_1.get_as_delta(&ctx).await.unwrap()], + &[], + &ctx, + ); + + let mut filter_iter = FilterIterator::create( + merge_iter, + KeySpace { + ranges: vec![ + get_key(0)..get_key(10), + get_key(20)..get_key(30), + get_key(90)..get_key(95), + ], + }, + SparseKeySpace(KeySpace::default()), + ) + .unwrap(); + let mut result = Vec::new(); + result.extend(test_deltas1[0..10].iter().cloned()); + result.extend(test_deltas1[20..30].iter().cloned()); + result.extend(test_deltas1[90..95].iter().cloned()); + assert_filter_iter_equal(&mut filter_iter, &result).await; + } +} diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index d1567b6b39f3..6edc28a11b2e 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -31,6 +31,7 @@ use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder} use crate::page_cache; use crate::tenant::checks::check_valid_layermap; use crate::tenant::remote_timeline_client::WaitCompletionError; +use crate::tenant::storage_layer::filter_iterator::FilterIterator; 
 use crate::tenant::storage_layer::merge_iterator::MergeIterator;
 use crate::tenant::storage_layer::split_writer::{
     SplitDeltaLayerWriter, SplitImageLayerWriter, SplitWriterResult,
@@ -1772,6 +1773,7 @@ impl Timeline {
             gc_cutoff,
             lowest_retain_lsn
         );
+
         // Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs.
         // Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point.
         let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?)
@@ -1820,7 +1822,12 @@ impl Timeline {
                 image_layers.push(layer);
             }
         }
-        let mut merge_iter = MergeIterator::create(&delta_layers, &image_layers, ctx);
+        let (dense_ks, sparse_ks) = self.collect_gc_compaction_keyspace().await?;
+        let mut merge_iter = FilterIterator::create(
+            MergeIterator::create(&delta_layers, &image_layers, ctx),
+            dense_ks,
+            sparse_ks,
+        )?;
         // Step 2: Produce images+deltas. TODO: ensure newly-produced delta does not overlap with other deltas.
         // Data of the same key.
         let mut accumulated_values = Vec::new();

From df3996265f423a727482c46eefed9e8fd266af7d Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas
Date: Mon, 23 Sep 2024 21:10:22 +0300
Subject: [PATCH 38/77] test: Downgrade info message on removing empty directories (#9093)

It was pretty noisy. It changed from debug to info level in commit
78938d1b59, but I believe that was not on purpose.

---
 test_runner/fixtures/neon_fixtures.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index fc83cf3f7c6a..55c1423ed0d5 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -849,7 +849,7 @@ def cleanup_local_storage(self):
 
         for directory_to_clean in reversed(directories_to_clean):
             if not os.listdir(directory_to_clean):
-                log.info(f"Removing empty directory {directory_to_clean}")
+                log.debug(f"Removing empty directory {directory_to_clean}")
                 try:
                     directory_to_clean.rmdir()
                 except Exception as e:

From 263dfba6eeef448864dba151e2d8d34a418b9629 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas
Date: Mon, 23 Sep 2024 21:28:50 +0300
Subject: [PATCH 39/77] Add views for metrics about pageserver requests (#9008)

The metrics include a histogram of how long we need to wait for a
GetPage request, the number of reconnects, and the number of requests,
among other things. The metrics are not yet exported anywhere, but you
can query them manually.

Note: This does *not* bump the default version of the 'neon'
extension. We will do that later, as a separate PR. The reason is that
this allows us to roll back the compute image smoothly, if necessary.
Once the image that includes the new extension .so file with the new
functions has been rolled out, and we're confident that we don't need
to roll back the image anymore, we can change the default extension
version and actually start using the new functions and views.
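Until the default version is bumped, the new views can be tried out
manually by upgrading the extension in a session. A minimal sketch (the
'1.5' upgrade path is provided by the neon--1.4--1.5.sql script added
below in this patch):

```
ALTER EXTENSION neon UPDATE TO '1.5';
SELECT * FROM neon_perf_counters;
```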
This is what the view looks like: ``` postgres=# select * from neon_perf_counters ; metric | bucket_le | value ---------------------------------------+-----------+---------- getpage_wait_seconds_count | | 300 getpage_wait_seconds_sum | | 0.048506 getpage_wait_seconds_bucket | 2e-05 | 0 getpage_wait_seconds_bucket | 3e-05 | 0 getpage_wait_seconds_bucket | 6e-05 | 71 getpage_wait_seconds_bucket | 0.0001 | 124 getpage_wait_seconds_bucket | 0.0002 | 248 getpage_wait_seconds_bucket | 0.0003 | 279 getpage_wait_seconds_bucket | 0.0006 | 297 getpage_wait_seconds_bucket | 0.001 | 298 getpage_wait_seconds_bucket | 0.002 | 298 getpage_wait_seconds_bucket | 0.003 | 298 getpage_wait_seconds_bucket | 0.006 | 300 getpage_wait_seconds_bucket | 0.01 | 300 getpage_wait_seconds_bucket | 0.02 | 300 getpage_wait_seconds_bucket | 0.03 | 300 getpage_wait_seconds_bucket | 0.06 | 300 getpage_wait_seconds_bucket | 0.1 | 300 getpage_wait_seconds_bucket | 0.2 | 300 getpage_wait_seconds_bucket | 0.3 | 300 getpage_wait_seconds_bucket | 0.6 | 300 getpage_wait_seconds_bucket | 1 | 300 getpage_wait_seconds_bucket | 2 | 300 getpage_wait_seconds_bucket | 3 | 300 getpage_wait_seconds_bucket | 6 | 300 getpage_wait_seconds_bucket | 10 | 300 getpage_wait_seconds_bucket | 20 | 300 getpage_wait_seconds_bucket | 30 | 300 getpage_wait_seconds_bucket | 60 | 300 getpage_wait_seconds_bucket | 100 | 300 getpage_wait_seconds_bucket | Infinity | 300 getpage_prefetch_requests_total | | 69 getpage_sync_requests_total | | 231 getpage_prefetch_misses_total | | 0 getpage_prefetch_discards_total | | 0 pageserver_requests_sent_total | | 323 pageserver_requests_disconnects_total | | 0 pageserver_send_flushes_total | | 323 file_cache_hits_total | | 0 (39 rows) ``` --- pgxn/neon/Makefile | 4 +- pgxn/neon/libpagestore.c | 10 +- pgxn/neon/neon--1.4--1.5.sql | 39 +++ pgxn/neon/neon--1.5--1.4.sql | 4 + pgxn/neon/neon.control | 2 + pgxn/neon/neon_perf_counters.c | 261 ++++++++++++++++++++ pgxn/neon/neon_perf_counters.h | 111 +++++++++ pgxn/neon/neon_pgversioncompat.c | 44 ++++ pgxn/neon/neon_pgversioncompat.h | 6 + pgxn/neon/pagestore_smgr.c | 47 ++-- test_runner/regress/test_compute_metrics.py | 21 ++ test_runner/regress/test_neon_extension.py | 4 +- 12 files changed, 533 insertions(+), 20 deletions(-) create mode 100644 pgxn/neon/neon--1.4--1.5.sql create mode 100644 pgxn/neon/neon--1.5--1.4.sql create mode 100644 pgxn/neon/neon_perf_counters.c create mode 100644 pgxn/neon/neon_perf_counters.h create mode 100644 pgxn/neon/neon_pgversioncompat.c create mode 100644 test_runner/regress/test_compute_metrics.py diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index 3b755bb0420c..ddc8155ff33a 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -9,6 +9,8 @@ OBJS = \ hll.o \ libpagestore.o \ neon.o \ + neon_pgversioncompat.o \ + neon_perf_counters.o \ neon_utils.o \ neon_walreader.o \ pagestore_smgr.o \ @@ -23,7 +25,7 @@ SHLIB_LINK_INTERNAL = $(libpq) SHLIB_LINK = -lcurl EXTENSION = neon -DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql neon--1.3--1.4.sql neon--1.4--1.3.sql +DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql neon--1.3--1.4.sql neon--1.4--1.3.sql neon--1.4--1.5.sql neon--1.5--1.4.sql PGFILEDESC = "neon - cloud storage for PostgreSQL" EXTRA_CLEAN = \ diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index df7000acc0f7..07a19a71146f 100644 --- 
a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -30,6 +30,7 @@
 #include "utils/guc.h"
 
 #include "neon.h"
+#include "neon_perf_counters.h"
 #include "neon_utils.h"
 #include "pagestore_client.h"
 #include "walproposer.h"
@@ -331,6 +332,7 @@ CLEANUP_AND_DISCONNECT(PageServer *shard)
 	}
 	if (shard->conn)
 	{
+		MyNeonCounters->pageserver_disconnects_total++;
 		PQfinish(shard->conn);
 		shard->conn = NULL;
 	}
@@ -737,6 +739,8 @@ pageserver_send(shardno_t shard_no, NeonRequest *request)
 	PageServer *shard = &page_servers[shard_no];
 	PGconn	   *pageserver_conn;
 
+	MyNeonCounters->pageserver_requests_sent_total++;
+
 	/* If the connection was lost for some reason, reconnect */
 	if (shard->state == PS_Connected && PQstatus(shard->conn) == CONNECTION_BAD)
 	{
@@ -889,6 +893,7 @@ pageserver_flush(shardno_t shard_no)
 	}
 	else
 	{
+		MyNeonCounters->pageserver_send_flushes_total++;
 		if (PQflush(pageserver_conn))
 		{
 			char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
@@ -922,7 +927,7 @@ check_neon_id(char **newval, void **extra, GucSource source)
 static Size
 PagestoreShmemSize(void)
 {
-	return sizeof(PagestoreShmemState);
+	return add_size(sizeof(PagestoreShmemState), NeonPerfCountersShmemSize());
 }
 
 static bool
@@ -941,6 +946,9 @@ PagestoreShmemInit(void)
 		memset(&pagestore_shared->shard_map, 0, sizeof(ShardMap));
 		AssignPageserverConnstring(page_server_connstring, NULL);
 	}
+
+	NeonPerfCountersShmemInit();
+
 	LWLockRelease(AddinShmemInitLock);
 
 	return found;
 }
diff --git a/pgxn/neon/neon--1.4--1.5.sql b/pgxn/neon/neon--1.4--1.5.sql
new file mode 100644
index 000000000000..a1db7bf1b1c1
--- /dev/null
+++ b/pgxn/neon/neon--1.4--1.5.sql
@@ -0,0 +1,39 @@
+\echo Use "ALTER EXTENSION neon UPDATE TO '1.5'" to load this file. \quit
+
+
+CREATE FUNCTION get_backend_perf_counters()
+RETURNS SETOF RECORD
+AS 'MODULE_PATHNAME', 'neon_get_backend_perf_counters'
+LANGUAGE C PARALLEL SAFE;
+
+CREATE FUNCTION get_perf_counters()
+RETURNS SETOF RECORD
+AS 'MODULE_PATHNAME', 'neon_get_perf_counters'
+LANGUAGE C PARALLEL SAFE;
+
+-- Show various metrics, for each backend. Note that the values are not reset
+-- when a backend exits. When a new backend starts with the same backend ID, it
+-- will continue accumulating the values from where the old backend left off. If
+-- you are only interested in the changes from your own session, store the values
+-- at the beginning of the session somewhere, and subtract them on subsequent calls.
+--
+-- For histograms, 'bucket_le' is the upper bound of the histogram bucket.
+CREATE VIEW neon_backend_perf_counters AS
+  SELECT P.procno, P.pid, P.metric, P.bucket_le, P.value
+  FROM get_backend_perf_counters() AS P (
+    procno integer,
+    pid integer,
+    metric text,
+    bucket_le float8,
+    value float8
+  );
+
+-- Summary across all backends. (This could also be implemented with
+-- an aggregate query over the neon_backend_perf_counters view.)
+CREATE VIEW neon_perf_counters AS
+  SELECT P.metric, P.bucket_le, P.value
+  FROM get_perf_counters() AS P (
+    metric text,
+    bucket_le float8,
+    value float8
+  );
diff --git a/pgxn/neon/neon--1.5--1.4.sql b/pgxn/neon/neon--1.5--1.4.sql
new file mode 100644
index 000000000000..7939fd8aa9dc
--- /dev/null
+++ b/pgxn/neon/neon--1.5--1.4.sql
@@ -0,0 +1,4 @@
+DROP VIEW IF EXISTS neon_perf_counters;
+DROP VIEW IF EXISTS neon_backend_perf_counters;
+DROP FUNCTION IF EXISTS get_perf_counters();
+DROP FUNCTION IF EXISTS get_backend_perf_counters();
diff --git a/pgxn/neon/neon.control b/pgxn/neon/neon.control
index 03bdb9a0b41c..0b36bdbb655a 100644
--- a/pgxn/neon/neon.control
+++ b/pgxn/neon/neon.control
@@ -1,5 +1,7 @@
 # neon extension
 comment = 'cloud storage for PostgreSQL'
+# TODO: bump default version to 1.5, after we are certain that we don't
+# need to roll back the compute image
 default_version = '1.4'
 module_pathname = '$libdir/neon'
 relocatable = true
diff --git a/pgxn/neon/neon_perf_counters.c b/pgxn/neon/neon_perf_counters.c
new file mode 100644
index 000000000000..3e86d5b26276
--- /dev/null
+++ b/pgxn/neon/neon_perf_counters.c
@@ -0,0 +1,261 @@
+/*-------------------------------------------------------------------------
+ *
+ * neon_perf_counters.c
+ *      Collect statistics about Neon I/O
+ *
+ * Each backend has its own set of counters in shared memory.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <math.h>
+
+#include "funcapi.h"
+#include "miscadmin.h"
+#include "storage/proc.h"
+#include "storage/shmem.h"
+#include "utils/builtins.h"
+
+#include "neon_perf_counters.h"
+#include "neon_pgversioncompat.h"
+
+neon_per_backend_counters *neon_per_backend_counters_shared;
+
+Size
+NeonPerfCountersShmemSize(void)
+{
+	Size		size = 0;
+
+	size = add_size(size, mul_size(MaxBackends, sizeof(neon_per_backend_counters)));
+
+	return size;
+}
+
+bool
+NeonPerfCountersShmemInit(void)
+{
+	bool		found;
+
+	neon_per_backend_counters_shared =
+		ShmemInitStruct("Neon perf counters",
+						mul_size(MaxBackends,
+								 sizeof(neon_per_backend_counters)),
+						&found);
+	Assert(found == IsUnderPostmaster);
+	if (!found)
+	{
+		/* shared memory is initialized to zeros, so nothing to do here */
+	}
+	return found;
+}
+
+/*
+ * Count a GetPage wait operation.
+ */
+void
+inc_getpage_wait(uint64 latency_us)
+{
+	int			lo = 0;
+	int			hi = NUM_GETPAGE_WAIT_BUCKETS - 1;
+
+	/* Find the right bucket with binary search */
+	while (lo < hi)
+	{
+		int			mid = (lo + hi) / 2;
+
+		if (latency_us < getpage_wait_bucket_thresholds[mid])
+			hi = mid;
+		else
+			lo = mid + 1;
+	}
+	MyNeonCounters->getpage_wait_us_bucket[lo]++;
+	MyNeonCounters->getpage_wait_us_sum += latency_us;
+	MyNeonCounters->getpage_wait_us_count++;
+}
+
+/*
+ * Support functions for the views, neon_backend_perf_counters and
+ * neon_perf_counters.
+ */ + +typedef struct +{ + char *name; + bool is_bucket; + double bucket_le; + double value; +} metric_t; + +static metric_t * +neon_perf_counters_to_metrics(neon_per_backend_counters *counters) +{ +#define NUM_METRICS (2 + NUM_GETPAGE_WAIT_BUCKETS + 8) + metric_t *metrics = palloc((NUM_METRICS + 1) * sizeof(metric_t)); + uint64 bucket_accum; + int i = 0; + Datum getpage_wait_str; + + metrics[i].name = "getpage_wait_seconds_count"; + metrics[i].is_bucket = false; + metrics[i].value = (double) counters->getpage_wait_us_count; + i++; + metrics[i].name = "getpage_wait_seconds_sum"; + metrics[i].is_bucket = false; + metrics[i].value = ((double) counters->getpage_wait_us_sum) / 1000000.0; + i++; + + bucket_accum = 0; + for (int bucketno = 0; bucketno < NUM_GETPAGE_WAIT_BUCKETS; bucketno++) + { + uint64 threshold = getpage_wait_bucket_thresholds[bucketno]; + + bucket_accum += counters->getpage_wait_us_bucket[bucketno]; + + metrics[i].name = "getpage_wait_seconds_bucket"; + metrics[i].is_bucket = true; + metrics[i].bucket_le = (threshold == UINT64_MAX) ? INFINITY : ((double) threshold) / 1000000.0; + metrics[i].value = (double) bucket_accum; + i++; + } + metrics[i].name = "getpage_prefetch_requests_total"; + metrics[i].is_bucket = false; + metrics[i].value = (double) counters->getpage_prefetch_requests_total; + i++; + metrics[i].name = "getpage_sync_requests_total"; + metrics[i].is_bucket = false; + metrics[i].value = (double) counters->getpage_sync_requests_total; + i++; + metrics[i].name = "getpage_prefetch_misses_total"; + metrics[i].is_bucket = false; + metrics[i].value = (double) counters->getpage_prefetch_misses_total; + i++; + metrics[i].name = "getpage_prefetch_discards_total"; + metrics[i].is_bucket = false; + metrics[i].value = (double) counters->getpage_prefetch_discards_total; + i++; + metrics[i].name = "pageserver_requests_sent_total"; + metrics[i].is_bucket = false; + metrics[i].value = (double) counters->pageserver_requests_sent_total; + i++; + metrics[i].name = "pageserver_requests_disconnects_total"; + metrics[i].is_bucket = false; + metrics[i].value = (double) counters->pageserver_disconnects_total; + i++; + metrics[i].name = "pageserver_send_flushes_total"; + metrics[i].is_bucket = false; + metrics[i].value = (double) counters->pageserver_send_flushes_total; + i++; + metrics[i].name = "file_cache_hits_total"; + metrics[i].is_bucket = false; + metrics[i].value = (double) counters->file_cache_hits_total; + i++; + + Assert(i == NUM_METRICS); + + /* NULL entry marks end of array */ + metrics[i].name = NULL; + metrics[i].value = 0; + + return metrics; +} + +/* + * Write metric to three output Datums + */ +static void +metric_to_datums(metric_t *m, Datum *values, bool *nulls) +{ + values[0] = CStringGetTextDatum(m->name); + nulls[0] = false; + if (m->is_bucket) + { + values[1] = Float8GetDatum(m->bucket_le); + nulls[1] = false; + } + else + { + values[1] = (Datum) 0; + nulls[1] = true; + } + values[2] = Float8GetDatum(m->value); + nulls[2] = false; +} + +PG_FUNCTION_INFO_V1(neon_get_backend_perf_counters); +Datum +neon_get_backend_perf_counters(PG_FUNCTION_ARGS) +{ + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + Datum values[5]; + bool nulls[5]; + + /* We put all the tuples into a tuplestore in one go. 
*/ + InitMaterializedSRF(fcinfo, 0); + + for (int procno = 0; procno < MaxBackends; procno++) + { + PGPROC *proc = GetPGProcByNumber(procno); + int pid = proc->pid; + neon_per_backend_counters *counters = &neon_per_backend_counters_shared[procno]; + metric_t *metrics = neon_perf_counters_to_metrics(counters); + + values[0] = Int32GetDatum(procno); + nulls[0] = false; + values[1] = Int32GetDatum(pid); + nulls[1] = false; + + for (int i = 0; metrics[i].name != NULL; i++) + { + metric_to_datums(&metrics[i], &values[2], &nulls[2]); + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); + } + + pfree(metrics); + } + + return (Datum) 0; +} + +PG_FUNCTION_INFO_V1(neon_get_perf_counters); +Datum +neon_get_perf_counters(PG_FUNCTION_ARGS) +{ + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + Datum values[3]; + bool nulls[3]; + Datum getpage_wait_str; + neon_per_backend_counters totals = {0}; + metric_t *metrics; + + /* We put all the tuples into a tuplestore in one go. */ + InitMaterializedSRF(fcinfo, 0); + + /* Aggregate the counters across all backends */ + for (int procno = 0; procno < MaxBackends; procno++) + { + neon_per_backend_counters *counters = &neon_per_backend_counters_shared[procno]; + + totals.getpage_wait_us_count += counters->getpage_wait_us_count; + totals.getpage_wait_us_sum += counters->getpage_wait_us_sum; + for (int bucketno = 0; bucketno < NUM_GETPAGE_WAIT_BUCKETS; bucketno++) + totals.getpage_wait_us_bucket[bucketno] += counters->getpage_wait_us_bucket[bucketno]; + totals.getpage_prefetch_requests_total += counters->getpage_prefetch_requests_total; + totals.getpage_sync_requests_total += counters->getpage_sync_requests_total; + totals.getpage_prefetch_misses_total += counters->getpage_prefetch_misses_total; + totals.getpage_prefetch_discards_total += counters->getpage_prefetch_discards_total; + totals.pageserver_requests_sent_total += counters->pageserver_requests_sent_total; + totals.pageserver_disconnects_total += counters->pageserver_disconnects_total; + totals.pageserver_send_flushes_total += counters->pageserver_send_flushes_total; + totals.file_cache_hits_total += counters->file_cache_hits_total; + } + + metrics = neon_perf_counters_to_metrics(&totals); + for (int i = 0; metrics[i].name != NULL; i++) + { + metric_to_datums(&metrics[i], &values[0], &nulls[0]); + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); + } + pfree(metrics); + + return (Datum) 0; +} diff --git a/pgxn/neon/neon_perf_counters.h b/pgxn/neon/neon_perf_counters.h new file mode 100644 index 000000000000..ae35e8c3a515 --- /dev/null +++ b/pgxn/neon/neon_perf_counters.h @@ -0,0 +1,111 @@ +/*------------------------------------------------------------------------- + * + * neon_perf_counters.h + * Performance counters for neon storage requests + *------------------------------------------------------------------------- + */ + +#ifndef NEON_PERF_COUNTERS_H +#define NEON_PERF_COUNTERS_H + +#if PG_VERSION_NUM >= 170000 +#include "storage/procnumber.h" +#else +#include "storage/backendid.h" +#include "storage/proc.h" +#endif + +static const uint64 getpage_wait_bucket_thresholds[] = { + 20, 30, 60, 100, /* 0 - 100 us */ + 200, 300, 600, 1000, /* 100 us - 1 ms */ + 2000, 3000, 6000, 10000, /* 1 ms - 10 ms */ + 20000, 30000, 60000, 100000, /* 10 ms - 100 ms */ + 200000, 300000, 600000, 1000000, /* 100 ms - 1 s */ + 2000000, 3000000, 6000000, 10000000, /* 1 s - 10 s */ + 20000000, 30000000, 60000000, 100000000, /* 10 s - 100 s */ + UINT64_MAX, +}; +#define 
NUM_GETPAGE_WAIT_BUCKETS (lengthof(getpage_wait_bucket_thresholds))
+
+typedef struct
+{
+	/*
+	 * Histogram for how long an smgrread() request needs to wait for response
+	 * from pageserver. When prefetching is effective, these wait times can be
+	 * lower than the network latency to the pageserver, even zero, if the
+	 * page is already readily prefetched whenever we need to read a page.
+	 *
+	 * Note: we accumulate these in microseconds, because that's convenient in
+	 * the backend, but the 'neon_backend_perf_counters' view will convert
+	 * them to seconds, to make them more idiomatic as prometheus metrics.
+	 */
+	uint64		getpage_wait_us_count;
+	uint64		getpage_wait_us_sum;
+	uint64		getpage_wait_us_bucket[NUM_GETPAGE_WAIT_BUCKETS];
+
+	/*
+	 * Total number of speculative prefetch GetPage requests and synchronous
+	 * GetPage requests sent.
+	 */
+	uint64		getpage_prefetch_requests_total;
+	uint64		getpage_sync_requests_total;
+
+	/* XXX: It's not clear to me when these misses happen. */
+	uint64		getpage_prefetch_misses_total;
+
+	/*
+	 * Number of prefetched responses that were discarded because the
+	 * prefetched page was not needed or because it was concurrently fetched /
+	 * modified by another backend.
+	 */
+	uint64		getpage_prefetch_discards_total;
+
+	/*
+	 * Total number of requests sent to the pageserver.
+	 * (getpage_prefetch_requests_total and getpage_sync_requests_total count
+	 * only GetPage requests; this counts all request types.)
+	 */
+	uint64		pageserver_requests_sent_total;
+
+	/*
+	 * Number of times the connection to the pageserver was lost and the
+	 * backend had to reconnect. Note that this doesn't count the first
+	 * connection in each backend, only reconnects.
+	 */
+	uint64		pageserver_disconnects_total;
+
+	/*
+	 * Number of network flushes to the pageserver. Synchronous requests are
+	 * flushed immediately, but when prefetching requests are sent in batches,
+	 * this can be smaller than pageserver_requests_sent_total.
+	 */
+	uint64		pageserver_send_flushes_total;
+
+	/*
+	 * Number of requests satisfied from the LFC.
+	 *
+	 * This is redundant with the server-wide file_cache_hits, but this gives
+	 * per-backend granularity, and it's handy to have this in the same place
+	 * as counters for requests that went to the pageserver. Maybe move all
+	 * the LFC stats to this struct in the future?
+ */ + uint64 file_cache_hits_total; + +} neon_per_backend_counters; + +/* Pointer to the shared memory array of neon_per_backend_counters structs */ +extern neon_per_backend_counters *neon_per_backend_counters_shared; + +#if PG_VERSION_NUM >= 170000 +#define MyNeonCounters (&neon_per_backend_counters_shared[MyProcNumber]) +#else +#define MyNeonCounters (&neon_per_backend_counters_shared[MyProc->pgprocno]) +#endif + +extern void inc_getpage_wait(uint64 latency); + +extern Size NeonPerfCountersShmemSize(void); +extern bool NeonPerfCountersShmemInit(void); + + +#endif /* NEON_PERF_COUNTERS_H */ diff --git a/pgxn/neon/neon_pgversioncompat.c b/pgxn/neon/neon_pgversioncompat.c new file mode 100644 index 000000000000..a0dbddde4b91 --- /dev/null +++ b/pgxn/neon/neon_pgversioncompat.c @@ -0,0 +1,44 @@ +/* + * Support functions for the compatibility macros in neon_pgversioncompat.h + */ +#include "postgres.h" + +#include "funcapi.h" +#include "miscadmin.h" +#include "utils/tuplestore.h" + +#include "neon_pgversioncompat.h" + +#if PG_MAJORVERSION_NUM < 15 +void +InitMaterializedSRF(FunctionCallInfo fcinfo, bits32 flags) +{ + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + Tuplestorestate *tupstore; + MemoryContext old_context, + per_query_ctx; + TupleDesc stored_tupdesc; + + /* check to see if caller supports returning a tuplestore */ + if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("set-valued function called in context that cannot accept a set"))); + + /* + * Store the tuplestore and the tuple descriptor in ReturnSetInfo. This + * must be done in the per-query memory context. + */ + per_query_ctx = rsinfo->econtext->ecxt_per_query_memory; + old_context = MemoryContextSwitchTo(per_query_ctx); + + if (get_call_result_type(fcinfo, NULL, &stored_tupdesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + tupstore = tuplestore_begin_heap(false, false, work_mem); + rsinfo->returnMode = SFRM_Materialize; + rsinfo->setResult = tupstore; + rsinfo->setDesc = stored_tupdesc; + MemoryContextSwitchTo(old_context); +} +#endif diff --git a/pgxn/neon/neon_pgversioncompat.h b/pgxn/neon/neon_pgversioncompat.h index 59b97d64fed5..e4754ec7ea82 100644 --- a/pgxn/neon/neon_pgversioncompat.h +++ b/pgxn/neon/neon_pgversioncompat.h @@ -6,6 +6,8 @@ #ifndef NEON_PGVERSIONCOMPAT_H #define NEON_PGVERSIONCOMPAT_H +#include "fmgr.h" + #if PG_MAJORVERSION_NUM < 17 #define NRelFileInfoBackendIsTemp(rinfo) (rinfo.backend != InvalidBackendId) #else @@ -123,4 +125,8 @@ #define AmAutoVacuumWorkerProcess() (IsAutoVacuumWorkerProcess()) #endif +#if PG_MAJORVERSION_NUM < 15 +extern void InitMaterializedSRF(FunctionCallInfo fcinfo, bits32 flags); +#endif + #endif /* NEON_PGVERSIONCOMPAT_H */ diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 36538ea5e20f..1c87f4405cf2 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -66,6 +66,7 @@ #include "storage/md.h" #include "storage/smgr.h" +#include "neon_perf_counters.h" #include "pagestore_client.h" #include "bitmap.h" @@ -289,7 +290,6 @@ static PrefetchState *MyPState; static bool compact_prefetch_buffers(void); static void consume_prefetch_responses(void); -static uint64 prefetch_register_buffer(BufferTag tag, neon_request_lsns *force_request_lsns); static bool prefetch_read(PrefetchRequest *slot); static void prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns); static bool prefetch_wait_for(uint64 
ring_index);
@@ -780,21 +780,27 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns
 }
 
 /*
- * prefetch_register_buffer() - register and prefetch buffer
+ * prefetch_register_bufferv() - register and prefetch buffers
  *
  * Register that we may want the contents of BufferTag in the near future.
+ * This is used when issuing a speculative prefetch request, but also when
+ * performing a synchronous request where we need the buffer right now.
  *
  * If force_request_lsns is not NULL, those values are sent to the
  * pageserver. If NULL, we utilize the lastWrittenLsn -infrastructure
  * to calculate the LSNs to send.
  *
+ * When performing a prefetch rather than a synchronous request, pass
+ * is_prefetch==true. Currently, it only affects how the request is accounted
+ * for in the perf counters.
+ *
 * NOTE: this function may indirectly update MyPState->pfs_hash; which
 * invalidates any active pointers into the hash table.
 */
-
 static uint64
 prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns,
-						  BlockNumber nblocks, const bits8 *mask)
+						  BlockNumber nblocks, const bits8 *mask,
+						  bool is_prefetch)
 {
 	uint64		min_ring_index;
 	PrefetchRequest req;
@@ -815,6 +821,7 @@ prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns,
 		PrfHashEntry *entry = NULL;
 		uint64		ring_index;
 		neon_request_lsns *lsns;
+
 		if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i))
 			continue;
@@ -858,6 +865,7 @@ prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns,
 				prefetch_set_unused(ring_index);
 				entry = NULL;
 				slot = NULL;
+				MyNeonCounters->getpage_prefetch_discards_total++;
 			}
 		}
@@ -972,6 +980,11 @@ prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns,
 
 		min_ring_index = Min(min_ring_index, ring_index);
 
+		if (is_prefetch)
+			MyNeonCounters->getpage_prefetch_requests_total++;
+		else
+			MyNeonCounters->getpage_sync_requests_total++;
+
 		prefetch_do_request(slot, lsns);
 	}
@@ -1000,13 +1013,6 @@ prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns,
 }
 
-static uint64
-prefetch_register_buffer(BufferTag tag, neon_request_lsns *force_request_lsns)
-{
-	return prefetch_register_bufferv(tag, force_request_lsns, 1, NULL);
-}
-
-
 /*
  * Note: this function can get canceled and use a long jump to the next catch
  * context. Take care.
@@ -2612,7 +2618,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 				lfc_present[i] = ~(lfc_present[i]);
 
 			ring_index = prefetch_register_bufferv(tag, NULL, iterblocks,
-												   lfc_present);
+												   lfc_present, true);
 
 			nblocks -= iterblocks;
 			blocknum += iterblocks;
@@ -2656,7 +2662,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 
 	CopyNRelFileInfoToBufTag(tag, InfoFromSMgrRel(reln));
 
-	ring_index = prefetch_register_buffer(tag, NULL);
+	ring_index = prefetch_register_bufferv(tag, NULL, 1, NULL, true);
 
 	Assert(ring_index < MyPState->ring_unused &&
 		   MyPState->ring_last <= ring_index);
@@ -2747,17 +2753,20 @@ neon_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_block
 	 * weren't for the behaviour of the LwLsn cache that uses the highest
 	 * value of the LwLsn cache when the entry is not found.
*/ - prefetch_register_bufferv(buftag, request_lsns, nblocks, mask); + prefetch_register_bufferv(buftag, request_lsns, nblocks, mask, false); for (int i = 0; i < nblocks; i++) { void *buffer = buffers[i]; BlockNumber blockno = base_blockno + i; neon_request_lsns *reqlsns = &request_lsns[i]; + TimestampTz start_ts, end_ts; if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i)) continue; + start_ts = GetCurrentTimestamp(); + if (RecoveryInProgress() && MyBackendType != B_STARTUP) XLogWaitForReplayOf(reqlsns[0].request_lsn); @@ -2794,6 +2803,7 @@ neon_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_block /* drop caches */ prefetch_set_unused(slot->my_ring_index); pgBufferUsage.prefetch.expired += 1; + MyNeonCounters->getpage_prefetch_discards_total++; /* make it look like a prefetch cache miss */ entry = NULL; } @@ -2804,8 +2814,9 @@ neon_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_block if (entry == NULL) { pgBufferUsage.prefetch.misses += 1; + MyNeonCounters->getpage_prefetch_misses_total++; - ring_index = prefetch_register_bufferv(buftag, reqlsns, 1, NULL); + ring_index = prefetch_register_bufferv(buftag, reqlsns, 1, NULL, false); Assert(ring_index != UINT64_MAX); slot = GetPrfSlot(ring_index); } @@ -2860,6 +2871,9 @@ neon_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_block /* buffer was used, clean up for later reuse */ prefetch_set_unused(ring_index); prefetch_cleanup_trailing_unused(); + + end_ts = GetCurrentTimestamp(); + inc_getpage_wait(end_ts >= start_ts ? (end_ts - start_ts) : 0); } } @@ -2913,6 +2927,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer /* Try to read from local file cache */ if (lfc_read(InfoFromSMgrRel(reln), forkNum, blkno, buffer)) { + MyNeonCounters->file_cache_hits_total++; return; } @@ -3097,7 +3112,7 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, /* assume heap */ RmgrTable[RM_HEAP_ID].rm_mask(mdbuf_masked, blkno); RmgrTable[RM_HEAP_ID].rm_mask(pageserver_masked, blkno); - + if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0) { neon_log(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", diff --git a/test_runner/regress/test_compute_metrics.py b/test_runner/regress/test_compute_metrics.py new file mode 100644 index 000000000000..6138c322d7c6 --- /dev/null +++ b/test_runner/regress/test_compute_metrics.py @@ -0,0 +1,21 @@ +from fixtures.neon_fixtures import NeonEnv + + +def test_compute_metrics(neon_simple_env: NeonEnv): + """ + Test compute metrics, exposed in the neon_backend_perf_counters and + neon_perf_counters views + """ + env = neon_simple_env + endpoint = env.endpoints.create_start("main") + + conn = endpoint.connect() + cur = conn.cursor() + + # We don't check that the values make sense, this is just a very + # basic check that the server doesn't crash or something like that. + # + # 1.5 is the minimum version to contain these views. 
+ cur.execute("CREATE EXTENSION neon VERSION '1.5'") + cur.execute("SELECT * FROM neon_perf_counters") + cur.execute("SELECT * FROM neon_backend_perf_counters") diff --git a/test_runner/regress/test_neon_extension.py b/test_runner/regress/test_neon_extension.py index bb844244e329..22a6013225af 100644 --- a/test_runner/regress/test_neon_extension.py +++ b/test_runner/regress/test_neon_extension.py @@ -50,8 +50,8 @@ def test_neon_extension_compatibility(neon_env_builder: NeonEnvBuilder): # Ensure that the default version is also updated in the neon.control file assert cur.fetchone() == ("1.4",) cur.execute("SELECT * from neon.NEON_STAT_FILE_CACHE") - all_versions = ["1.4", "1.3", "1.2", "1.1", "1.0"] - current_version = "1.4" + all_versions = ["1.5", "1.4", "1.3", "1.2", "1.1", "1.0"] + current_version = "1.5" for idx, begin_version in enumerate(all_versions): for target_version in all_versions[idx + 1 :]: if current_version != begin_version: From 1c5d6e59a0c53349b58a7f1af1f9d021d34b147a Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 23 Sep 2024 22:05:32 +0300 Subject: [PATCH 40/77] Maintain number of used pages for LFC (#9088) ## Problem LFC cache entry is chunk (right now size of chunk is 1Mb). LFC statistics shows number of chunks, but not number of used pages. And autoscaling team wants to know how sparse LFC is: https://neondb.slack.com/archives/C04DGM6SMTM/p1726782793595969 It is possible to obtain it from the view `select count(*) from local_cache`. Nut it is expensive operation, enumerating all entries in LFC under lock. ## Summary of changes This PR added "file_cache_used_pages" to `neon_lfc_stats` view: ``` select * from neon_lfc_stats; lfc_key | lfc_value -----------------------+----------- file_cache_misses | 3139029 file_cache_hits | 4098394 file_cache_used | 1024 file_cache_writes | 3173728 file_cache_size | 1024 file_cache_used_pages | 25689 (6 rows) ``` Please notice that this PR doesn't change neon extension API, so no need to create new version of Neon extension. ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. 
## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist Co-authored-by: Konstantin Knizhnik --- pgxn/neon/file_cache.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index ab6739465b39..2b461c86419a 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -109,6 +109,7 @@ typedef struct FileCacheControl * reenabling */ uint32 size; /* size of cache file in chunks */ uint32 used; /* number of used chunks */ + uint32 used_pages; /* number of used pages */ uint32 limit; /* shared copy of lfc_size_limit */ uint64 hits; uint64 misses; @@ -905,6 +906,10 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, /* Cache overflow: evict least recently used chunk */ FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru)); + for (int i = 0; i < BLOCKS_PER_CHUNK; i++) + { + lfc_ctl->used_pages -= (victim->bitmap[i >> 5] >> (i & 31)) & 1; + } CriticalAssert(victim->access_count == 0); entry->offset = victim->offset; /* grab victim's chunk */ hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL); @@ -959,6 +964,7 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, for (int i = 0; i < blocks_in_chunk; i++) { + lfc_ctl->used_pages += 1 - ((entry->bitmap[(chunk_offs + i) >> 5] >> ((chunk_offs + i) & 31)) & 1); entry->bitmap[(chunk_offs + i) >> 5] |= (1 << ((chunk_offs + i) & 31)); } @@ -1051,6 +1057,11 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS) if (lfc_ctl) value = lfc_ctl->size; break; + case 5: + key = "file_cache_used_pages"; + if (lfc_ctl) + value = lfc_ctl->used_pages; + break; default: SRF_RETURN_DONE(funcctx); } From d865881d59621e2425dd9028f2768c1e847163bf Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Mon, 23 Sep 2024 23:16:42 +0200 Subject: [PATCH 41/77] NOAI (#9084) We can't FlushOneBuffer when we're in redo-only mode on PageServer, so make execution of that function conditional on us not running in pageserver walredo mode. 
---
 .github/workflows/build_and_test.yml |  1 -
 test_runner/regress/test_unlogged.py | 16 +++++++++++++++-
 vendor/postgres-v14                  |  2 +-
 vendor/postgres-v15                  |  2 +-
 vendor/postgres-v16                  |  2 +-
 vendor/postgres-v17                  |  2 +-
 vendor/revisions.json                |  8 ++++----
 7 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 6617ca42bb54..f36dbfb1f061 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -1207,7 +1207,6 @@ jobs:
     # Usually we do `needs: [...]`
     needs:
       - build-and-test-locally
-      - check-submodules
       - check-codestyle-python
       - check-codestyle-rust
       - promote-images
diff --git a/test_runner/regress/test_unlogged.py b/test_runner/regress/test_unlogged.py
index deba29536c4d..4431ccd9597d 100644
--- a/test_runner/regress/test_unlogged.py
+++ b/test_runner/regress/test_unlogged.py
@@ -15,8 +15,13 @@ def test_unlogged(neon_simple_env: NeonEnv):
     cur = conn.cursor()
 
     cur.execute("CREATE UNLOGGED TABLE iut (id int);")
-    # create index to test unlogged index relation as well
+    # create index to test unlogged index relations as well
     cur.execute("CREATE UNIQUE INDEX iut_idx ON iut (id);")
+    cur.execute("CREATE INDEX ON iut USING gist (int4range(id, id, '[]'));")
+    cur.execute("CREATE INDEX ON iut USING spgist (int4range(id, id, '[]'));")
+    cur.execute("CREATE INDEX ON iut USING gin ((id::text::jsonb));")
+    cur.execute("CREATE INDEX ON iut USING brin (id);")
+    cur.execute("CREATE INDEX ON iut USING hash (id);")
     cur.execute("ALTER TABLE iut ADD COLUMN seq int GENERATED ALWAYS AS IDENTITY;")
     cur.execute("INSERT INTO iut (id) values (42);")
@@ -39,3 +44,12 @@ def test_unlogged(neon_simple_env: NeonEnv):
         assert results == [(43, 2)]
     else:
         assert results == [(43, 1)]
+
+    # Flush all data and compact it, so we detect any errors related to
+    # unlogged index materialization.
+ ps_http = env.pageserver.http_client() + ps_http.timeline_compact( + tenant_id=env.initial_tenant, + timeline_id=env.initial_timeline, + force_image_layer_creation=True, + ) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index a38d15f3233a..2199b83fb726 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit a38d15f3233a4c07f2bf3335fcbd874dd1f4e386 +Subproject commit 2199b83fb72680001ce0f43bf6187a21dfb8f45d diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 16c3c6b64f14..22e580fe9ffc 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 16c3c6b64f1420a367a2a9b2510f20d94f791af8 +Subproject commit 22e580fe9ffcea7e02592110b1c9bf426d83cada diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 1d7081a3b076..e131a9c027b2 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 1d7081a3b076ddf5086e0b118d4329820e6a7427 +Subproject commit e131a9c027b202ce92bd7b9cf2569d48a6f9948e diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index 2cf120e7393c..7b3e52c75ca3 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit 2cf120e7393ca5f537c6a38b457585576dc035fc +Subproject commit 7b3e52c75ca384de9c69477c158b1f5dcdcbb4be diff --git a/vendor/revisions.json b/vendor/revisions.json index 9f6512d03e85..bc7070744a26 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ "17rc1", - "2cf120e7393ca5f537c6a38b457585576dc035fc" + "7b3e52c75ca384de9c69477c158b1f5dcdcbb4be" ], "v16": [ "16.4", - "1d7081a3b076ddf5086e0b118d4329820e6a7427" + "e131a9c027b202ce92bd7b9cf2569d48a6f9948e" ], "v15": [ "15.8", - "16c3c6b64f1420a367a2a9b2510f20d94f791af8" + "22e580fe9ffcea7e02592110b1c9bf426d83cada" ], "v14": [ "14.13", - "a38d15f3233a4c07f2bf3335fcbd874dd1f4e386" + "2199b83fb72680001ce0f43bf6187a21dfb8f45d" ] } From e7e6319e209cb0d90a7f0657e2fd7af5711cfab1 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 24 Sep 2024 00:31:32 +0300 Subject: [PATCH 42/77] Fix compiler warnings with nightly rustc about elided lifetimes having names (#9105) The warnings: warning: elided lifetime has a name --> pageserver/src/metrics.rs:1386:29 | 1382 | pub(crate) fn start_timer<'c: 'a, 'a>( | -- lifetime `'a` declared here ... 1386 | ) -> Option { | ^^ this elided lifetime gets resolved as `'a` | = note: `#[warn(elided_named_lifetimes)]` on by default warning: elided lifetime has a name --> pageserver/src/metrics.rs:1537:46 | 1534 | pub(crate) fn start_recording<'c: 'a, 'a>( | -- lifetime `'a` declared here ... 1537 | ) -> BasebackupQueryTimeOngoingRecording<'_, '_> { | ^^ this elided lifetime gets resolved as `'a` warning: elided lifetime has a name --> pageserver/src/metrics.rs:1537:50 | 1534 | pub(crate) fn start_recording<'c: 'a, 'a>( | -- lifetime `'a` declared here ... 1537 | ) -> BasebackupQueryTimeOngoingRecording<'_, '_> { | ^^ this elided lifetime gets resolved as `'a` warning: elided lifetime has a name --> pageserver/src/tenant.rs:3630:25 | 3622 | async fn prepare_new_timeline<'a>( | -- lifetime `'a` declared here ... 
3630 | ) -> anyhow::Result { | ^^^^^^^^^^^^^^^^^^^^^ this elided lifetime gets resolved as `'a` --- pageserver/src/metrics.rs | 4 ++-- pageserver/src/tenant.rs | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 078d12f9342a..162e8d1836ff 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1383,7 +1383,7 @@ impl SmgrQueryTimePerTimeline { &'a self, op: SmgrQueryType, ctx: &'c RequestContext, - ) -> Option { + ) -> Option { let start = Instant::now(); self.global_started[op as usize].inc(); @@ -1534,7 +1534,7 @@ impl BasebackupQueryTime { pub(crate) fn start_recording<'c: 'a, 'a>( &'a self, ctx: &'c RequestContext, - ) -> BasebackupQueryTimeOngoingRecording<'_, '_> { + ) -> BasebackupQueryTimeOngoingRecording<'a, 'a> { let start = Instant::now(); match ctx.micros_spent_throttled.open() { Ok(()) => (), diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index be69f3d67f5d..5ed63734f494 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3627,7 +3627,7 @@ impl Tenant { start_lsn: Lsn, ancestor: Option>, last_aux_file_policy: Option, - ) -> anyhow::Result { + ) -> anyhow::Result> { let tenant_shard_id = self.tenant_shard_id; let resources = self.build_timeline_resources(new_timeline_id); From 3a110e45ed01d553e3f9229136ef969e1efb5adc Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 23 Sep 2024 20:27:09 +0300 Subject: [PATCH 43/77] Move files related to building compute image into compute/ dir Seems nice to keep all these together. This also provides a nice place for a README file to describe the compute image build process. For now, it briefly describes the contents of the directory, but can be expanded. --- .dockerignore | 1 + .github/workflows/build_and_test.yml | 8 ++++---- .github/workflows/trigger-e2e-tests.yml | 2 +- .../Dockerfile.compute-node | 0 compute/README.md | 18 ++++++++++++++++++ {patches => compute/patches}/pg_anon.patch | 0 {patches => compute/patches}/pg_cron.patch | 0 .../patches}/pg_hint_plan.patch | 0 {patches => compute/patches}/pgvector.patch | 0 {patches => compute/patches}/rum.patch | 0 .../vm-image-spec.yaml | 0 11 files changed, 24 insertions(+), 5 deletions(-) rename Dockerfile.compute-node => compute/Dockerfile.compute-node (100%) create mode 100644 compute/README.md rename {patches => compute/patches}/pg_anon.patch (100%) rename {patches => compute/patches}/pg_cron.patch (100%) rename {patches => compute/patches}/pg_hint_plan.patch (100%) rename {patches => compute/patches}/pgvector.patch (100%) rename {patches => compute/patches}/rum.patch (100%) rename vm-image-spec.yaml => compute/vm-image-spec.yaml (100%) diff --git a/.dockerignore b/.dockerignore index c7a2f78e32b9..3c4a748cf7e6 100644 --- a/.dockerignore +++ b/.dockerignore @@ -13,6 +13,7 @@ # Directories !.cargo/ !.config/ +!compute/ !compute_tools/ !control_plane/ !libs/ diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index f36dbfb1f061..a634edb96b87 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -651,7 +651,7 @@ jobs: provenance: false push: true pull: true - file: Dockerfile.compute-node + file: compute/Dockerfile.compute-node cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }} cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-node-{0}:cache-{1},mode=max', matrix.version, 
matrix.arch) || '' }} tags: | @@ -670,7 +670,7 @@ jobs: provenance: false push: true pull: true - file: Dockerfile.compute-node + file: compute/Dockerfile.compute-node target: neon-pg-ext-test cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }} cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon-test-extensions-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }} @@ -691,7 +691,7 @@ jobs: provenance: false push: true pull: true - file: Dockerfile.compute-node + file: compute/Dockerfile.compute-node tags: | neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} @@ -779,7 +779,7 @@ jobs: - name: Build vm image run: | ./vm-builder \ - -spec=vm-image-spec.yaml \ + -spec=compute/vm-image-spec.yaml \ -src=neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \ -dst=neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml index b299cf9b9929..f25c1051cd98 100644 --- a/.github/workflows/trigger-e2e-tests.yml +++ b/.github/workflows/trigger-e2e-tests.yml @@ -107,7 +107,7 @@ jobs: if [ "$GITHUB_EVENT_NAME" == "pull_request" ]; then for f in $(gh api "/repos/${GITHUB_REPOSITORY}/pulls/${PR_NUMBER}/files" --paginate --jq '.[].filename'); do case "$f" in - vendor/*|pgxn/*|libs/vm_monitor/*|Dockerfile.compute-node) + vendor/*|pgxn/*|libs/vm_monitor/*|compute/Dockerfile.compute-node) platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique') ;; *) diff --git a/Dockerfile.compute-node b/compute/Dockerfile.compute-node similarity index 100% rename from Dockerfile.compute-node rename to compute/Dockerfile.compute-node diff --git a/compute/README.md b/compute/README.md new file mode 100644 index 000000000000..981d290fc0f0 --- /dev/null +++ b/compute/README.md @@ -0,0 +1,18 @@ +This directory contains files that are needed to build the compute +images, or included in the compute images. + +Dockerfile.compute-node + To build the compute image + +vm-image-spec.yaml + Instructions for vm-builder, to turn the compute-node image into + corresponding vm-compute-node image. + +patches/ + Some extensions need to be patched to work with Neon. This + directory contains such patches. They are applied to the extension + sources in Dockerfile.compute-node + +In addition to these, postgres itself, the neon postgres extension, +and compute_ctl are built and copied into the compute image by +Dockerfile.compute-node. 
diff --git a/patches/pg_anon.patch b/compute/patches/pg_anon.patch similarity index 100% rename from patches/pg_anon.patch rename to compute/patches/pg_anon.patch diff --git a/patches/pg_cron.patch b/compute/patches/pg_cron.patch similarity index 100% rename from patches/pg_cron.patch rename to compute/patches/pg_cron.patch diff --git a/patches/pg_hint_plan.patch b/compute/patches/pg_hint_plan.patch similarity index 100% rename from patches/pg_hint_plan.patch rename to compute/patches/pg_hint_plan.patch diff --git a/patches/pgvector.patch b/compute/patches/pgvector.patch similarity index 100% rename from patches/pgvector.patch rename to compute/patches/pgvector.patch diff --git a/patches/rum.patch b/compute/patches/rum.patch similarity index 100% rename from patches/rum.patch rename to compute/patches/rum.patch diff --git a/vm-image-spec.yaml b/compute/vm-image-spec.yaml similarity index 100% rename from vm-image-spec.yaml rename to compute/vm-image-spec.yaml From 3ad567290c99b48a3293ed3f609a701375541382 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 23 Sep 2024 20:27:38 +0300 Subject: [PATCH 44/77] Move metric exporter and pgbouncer config files Instead of adding them to the VM image late in the build process, when putting together the final VM image, include them in the earlier compute image already. That makes it more convenient to edit the files, and to test them. --- compute/Dockerfile.compute-node | 61 ++- compute/README.md | 3 + compute/etc/neon_collector.yml | 247 ++++++++++++ compute/etc/neon_collector_autoscaling.yml | 55 +++ compute/etc/pgbouncer.ini | 17 + compute/etc/sql_exporter.yml | 33 ++ compute/etc/sql_exporter_autoscaling.yml | 33 ++ compute/vm-image-spec.yaml | 440 +-------------------- 8 files changed, 444 insertions(+), 445 deletions(-) create mode 100644 compute/etc/neon_collector.yml create mode 100644 compute/etc/neon_collector_autoscaling.yml create mode 100644 compute/etc/pgbouncer.ini create mode 100644 compute/etc/sql_exporter.yml create mode 100644 compute/etc/sql_exporter_autoscaling.yml diff --git a/compute/Dockerfile.compute-node b/compute/Dockerfile.compute-node index 6bf6fb650f41..18c68c116a94 100644 --- a/compute/Dockerfile.compute-node +++ b/compute/Dockerfile.compute-node @@ -280,7 +280,7 @@ FROM build-deps AS vector-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY patches/pgvector.patch /pgvector.patch +COPY compute/patches/pgvector.patch /pgvector.patch # By default, pgvector Makefile uses `-march=native`. We don't want that, # because we build the images on different machines than where we run them. @@ -366,7 +366,7 @@ FROM build-deps AS rum-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY patches/rum.patch /rum.patch +COPY compute/patches/rum.patch /rum.patch RUN case "${PG_VERSION}" in "v17") \ echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ @@ -1031,6 +1031,41 @@ FROM debian:bullseye-slim AS compute-tools-image COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl +######################################################################################### +# +# Layer "pgbouncer" +# +######################################################################################### + +FROM debian:bullseye-slim AS pgbouncer +RUN set -e \ + && apt-get update \ + && apt-get install -y \ + build-essential \ + git \ + libevent-dev \ + libtool \ + pkg-config + +# Use `dist_man_MANS=` to skip manpage generation (which requires python3/pandoc) +ENV PGBOUNCER_TAG=pgbouncer_1_22_1 +RUN set -e \ + && git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/pgbouncer/pgbouncer.git pgbouncer \ + && cd pgbouncer \ + && ./autogen.sh \ + && LDFLAGS=-static ./configure --prefix=/usr/local/pgbouncer --without-openssl \ + && make -j $(nproc) dist_man_MANS= \ + && make install dist_man_MANS= + +######################################################################################### +# +# Layers "postgres-exporter" and "sql-exporter" +# +######################################################################################### + +FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.1 AS postgres-exporter +FROM burningalchemist/sql_exporter:0.13 AS sql-exporter + ######################################################################################### # # Clean up postgres folder before inclusion @@ -1078,7 +1113,7 @@ COPY --from=pgjwt-pg-build /pgjwt.tar.gz /ext-src COPY --from=hypopg-pg-build /hypopg.tar.gz /ext-src COPY --from=pg-hashids-pg-build /pg_hashids.tar.gz /ext-src COPY --from=rum-pg-build /rum.tar.gz /ext-src -COPY patches/rum.patch /ext-src +COPY compute/patches/rum.patch /ext-src #COPY --from=pgtap-pg-build /pgtap.tar.gz /ext-src COPY --from=ip4r-pg-build /ip4r.tar.gz /ext-src COPY --from=prefix-pg-build /prefix.tar.gz /ext-src @@ -1086,9 +1121,9 @@ COPY --from=hll-pg-build /hll.tar.gz /ext-src COPY --from=plpgsql-check-pg-build /plpgsql_check.tar.gz /ext-src #COPY --from=timescaledb-pg-build /timescaledb.tar.gz /ext-src COPY --from=pg-hint-plan-pg-build /pg_hint_plan.tar.gz /ext-src -COPY patches/pg_hint_plan.patch /ext-src +COPY compute/patches/pg_hint_plan.patch /ext-src COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src -COPY patches/pg_cron.patch /ext-src +COPY compute/patches/pg_cron.patch /ext-src #COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src #COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src COPY --from=pg-uuidv7-pg-build /pg_uuidv7.tar.gz /ext-src @@ -1097,7 +1132,7 @@ COPY --from=pg-semver-pg-build /pg_semver.tar.gz /ext-src #COPY --from=pg-embedding-pg-build /home/nonroot/pg_embedding-src/ /ext-src #COPY --from=wal2json-pg-build /wal2json_2_5.tar.gz /ext-src COPY --from=pg-anon-pg-build /pg_anon.tar.gz /ext-src -COPY patches/pg_anon.patch /ext-src +COPY compute/patches/pg_anon.patch /ext-src COPY --from=pg-ivm-build /pg_ivm.tar.gz /ext-src COPY --from=pg-partman-build /pg_partman.tar.gz /ext-src RUN case "${PG_VERSION}" in "v17") \ @@ -1160,9 +1195,23 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl +# pgbouncer and its config +COPY --from=pgbouncer 
/usr/local/pgbouncer/bin/pgbouncer /usr/local/bin/pgbouncer
+COPY --chmod=0666 --chown=postgres compute/etc/pgbouncer.ini /etc/pgbouncer.ini
+
+# Metrics exporter binaries and configuration files
+COPY --from=postgres-exporter /bin/postgres_exporter /bin/postgres_exporter
+COPY --from=sql-exporter /bin/sql_exporter /bin/sql_exporter
+
+COPY --chmod=0644 compute/etc/sql_exporter.yml /etc/sql_exporter.yml
+COPY --chmod=0644 compute/etc/neon_collector.yml /etc/neon_collector.yml
+COPY --chmod=0644 compute/etc/sql_exporter_autoscaling.yml /etc/sql_exporter_autoscaling.yml
+COPY --chmod=0644 compute/etc/neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml
+
 # Create remote extension download directory
 RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/local/download_extensions
+
 # Install:
 #   libreadline8 for psql
 #   libicu67, locales for collations (including ICU and plpgsql_check)
diff --git a/compute/README.md b/compute/README.md
index 981d290fc0f0..bb1e42ab53b7 100644
--- a/compute/README.md
+++ b/compute/README.md
@@ -8,6 +8,9 @@ vm-image-spec.yaml
     Instructions for vm-builder, to turn the compute-node image into
     corresponding vm-compute-node image.
 
+etc/
+    Configuration files included in /etc in the compute image
+
 patches/
     Some extensions need to be patched to work with Neon. This
     directory contains such patches. They are applied to the extension
diff --git a/compute/etc/neon_collector.yml b/compute/etc/neon_collector.yml
new file mode 100644
index 000000000000..29be0958dd20
--- /dev/null
+++ b/compute/etc/neon_collector.yml
@@ -0,0 +1,247 @@
+collector_name: neon_collector
+metrics:
+- metric_name: lfc_misses
+  type: gauge
+  help: 'lfc_misses'
+  key_labels:
+  values: [lfc_misses]
+  query: |
+    select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses';
+
+- metric_name: lfc_used
+  type: gauge
+  help: 'LFC chunks used (chunk = 1MB)'
+  key_labels:
+  values: [lfc_used]
+  query: |
+    select lfc_value as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used';
+
+- metric_name: lfc_hits
+  type: gauge
+  help: 'lfc_hits'
+  key_labels:
+  values: [lfc_hits]
+  query: |
+    select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits';
+
+- metric_name: lfc_writes
+  type: gauge
+  help: 'lfc_writes'
+  key_labels:
+  values: [lfc_writes]
+  query: |
+    select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes';
+
+- metric_name: lfc_cache_size_limit
+  type: gauge
+  help: 'LFC cache size limit in bytes'
+  key_labels:
+  values: [lfc_cache_size_limit]
+  query: |
+    select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit;
+
+- metric_name: connection_counts
+  type: gauge
+  help: 'Connection counts'
+  key_labels:
+  - datname
+  - state
+  values: [count]
+  query: |
+    select datname, state, count(*) as count from pg_stat_activity where state <> '' group by datname, state;
+
+- metric_name: pg_stats_userdb
+  type: gauge
+  help: 'Stats for several oldest non-system dbs'
+  key_labels:
+  - datname
+  value_label: kind
+  values:
+  - db_size
+  - deadlocks
+  # Rows
+  - inserted
+  - updated
+  - deleted
+  # We export stats for 10 non-system databases. Without this limit
+  # it is too easy to abuse the system by creating lots of databases.
+ query: | + select pg_database_size(datname) as db_size, deadlocks, + tup_inserted as inserted, tup_updated as updated, tup_deleted as deleted, + datname + from pg_stat_database + where datname IN ( + select datname + from pg_database + where datname <> 'postgres' and not datistemplate + order by oid + limit 10 + ); + +- metric_name: max_cluster_size + type: gauge + help: 'neon.max_cluster_size setting' + key_labels: + values: [max_cluster_size] + query: | + select setting::int as max_cluster_size from pg_settings where name = 'neon.max_cluster_size'; + +- metric_name: db_total_size + type: gauge + help: 'Size of all databases' + key_labels: + values: [total] + query: | + select sum(pg_database_size(datname)) as total from pg_database; + +# DEPRECATED +- metric_name: lfc_approximate_working_set_size + type: gauge + help: 'Approximate working set size in pages of 8192 bytes' + key_labels: + values: [approximate_working_set_size] + query: | + select neon.approximate_working_set_size(false) as approximate_working_set_size; + +- metric_name: lfc_approximate_working_set_size_windows + type: gauge + help: 'Approximate working set size in pages of 8192 bytes' + key_labels: [duration] + values: [size] + # NOTE: This is the "public" / "human-readable" version. Here, we supply a small selection + # of durations in a pretty-printed form. + query: | + select + x as duration, + neon.approximate_working_set_size_seconds(extract('epoch' from x::interval)::int) as size + from + (values ('5m'),('15m'),('1h')) as t (x); + +- metric_name: compute_current_lsn + type: gauge + help: 'Current LSN of the database' + key_labels: + values: [lsn] + query: | + select + case + when pg_catalog.pg_is_in_recovery() + then (pg_last_wal_replay_lsn() - '0/0')::FLOAT8 + else (pg_current_wal_lsn() - '0/0')::FLOAT8 + end as lsn; + +- metric_name: compute_receive_lsn + type: gauge + help: 'Returns the last write-ahead log location that has been received and synced to disk by streaming replication' + key_labels: + values: [lsn] + query: | + SELECT + CASE + WHEN pg_catalog.pg_is_in_recovery() + THEN (pg_last_wal_receive_lsn() - '0/0')::FLOAT8 + ELSE 0 + END AS lsn; + +- metric_name: replication_delay_bytes + type: gauge + help: 'Bytes between received and replayed LSN' + key_labels: + values: [replication_delay_bytes] + # We use a GREATEST call here because this calculation can be negative. + # The calculation is not atomic, meaning after we've gotten the receive + # LSN, the replay LSN may have advanced past the receive LSN we + # are using for the calculation. 
+ query: | + SELECT GREATEST(0, pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())) AS replication_delay_bytes; + +- metric_name: replication_delay_seconds + type: gauge + help: 'Time since last LSN was replayed' + key_labels: + values: [replication_delay_seconds] + query: | + SELECT + CASE + WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0 + ELSE GREATEST (0, EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())) + END AS replication_delay_seconds; + +- metric_name: checkpoints_req + type: gauge + help: 'Number of requested checkpoints' + key_labels: + values: [checkpoints_req] + query: | + SELECT checkpoints_req FROM pg_stat_bgwriter; + +- metric_name: checkpoints_timed + type: gauge + help: 'Number of scheduled checkpoints' + key_labels: + values: [checkpoints_timed] + query: | + SELECT checkpoints_timed FROM pg_stat_bgwriter; + +- metric_name: compute_logical_snapshot_files + type: gauge + help: 'Number of snapshot files in pg_logical/snapshot' + key_labels: + - timeline_id + values: [num_logical_snapshot_files] + query: | + SELECT + (SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id, + -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. These + -- temporary snapshot files are renamed to the actual snapshot files after they are + -- completely built. We only WAL-log the completely built snapshot files. + (SELECT COUNT(*) FROM pg_ls_logicalsnapdir() WHERE name LIKE '%.snap') AS num_logical_snapshot_files; + +# In all the below metrics, we cast LSNs to floats because Prometheus only supports floats. +# It's probably fine because float64 can store integers from -2^53 to +2^53 exactly. + +# Number of slots is limited by max_replication_slots, so collecting position for all of them shouldn't be bad. 
+- metric_name: logical_slot_restart_lsn + type: gauge + help: 'restart_lsn of logical slots' + key_labels: + - slot_name + values: [restart_lsn] + query: | + select slot_name, (restart_lsn - '0/0')::FLOAT8 as restart_lsn + from pg_replication_slots + where slot_type = 'logical'; + +- metric_name: compute_subscriptions_count + type: gauge + help: 'Number of logical replication subscriptions grouped by enabled/disabled' + key_labels: + - enabled + values: [subscriptions_count] + query: | + select subenabled::text as enabled, count(*) as subscriptions_count + from pg_subscription + group by subenabled; + +- metric_name: retained_wal + type: gauge + help: 'Retained WAL in inactive replication slots' + key_labels: + - slot_name + values: [retained_wal] + query: | + SELECT slot_name, pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::FLOAT8 AS retained_wal + FROM pg_replication_slots + WHERE active = false; + +- metric_name: wal_is_lost + type: gauge + help: 'Whether or not the replication slot wal_status is lost' + key_labels: + - slot_name + values: [wal_is_lost] + query: | + SELECT slot_name, + CASE WHEN wal_status = 'lost' THEN 1 ELSE 0 END AS wal_is_lost + FROM pg_replication_slots; + diff --git a/compute/etc/neon_collector_autoscaling.yml b/compute/etc/neon_collector_autoscaling.yml new file mode 100644 index 000000000000..5616264eba1b --- /dev/null +++ b/compute/etc/neon_collector_autoscaling.yml @@ -0,0 +1,55 @@ +collector_name: neon_collector_autoscaling +metrics: +- metric_name: lfc_misses + type: gauge + help: 'lfc_misses' + key_labels: + values: [lfc_misses] + query: | + select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses'; + +- metric_name: lfc_used + type: gauge + help: 'LFC chunks used (chunk = 1MB)' + key_labels: + values: [lfc_used] + query: | + select lfc_value as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used'; + +- metric_name: lfc_hits + type: gauge + help: 'lfc_hits' + key_labels: + values: [lfc_hits] + query: | + select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits'; + +- metric_name: lfc_writes + type: gauge + help: 'lfc_writes' + key_labels: + values: [lfc_writes] + query: | + select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes'; + +- metric_name: lfc_cache_size_limit + type: gauge + help: 'LFC cache size limit in bytes' + key_labels: + values: [lfc_cache_size_limit] + query: | + select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit; + +- metric_name: lfc_approximate_working_set_size_windows + type: gauge + help: 'Approximate working set size in pages of 8192 bytes' + key_labels: [duration_seconds] + values: [size] + # NOTE: This is the "internal" / "machine-readable" version. This outputs the working set + # size looking back 1..60 minutes, labeled with the number of minutes. 
+  query: |
+    select
+      x::text as duration_seconds,
+      neon.approximate_working_set_size_seconds(x) as size
+    from
+      (select generate_series * 60 as x from generate_series(1, 60)) as t (x);
diff --git a/compute/etc/pgbouncer.ini b/compute/etc/pgbouncer.ini
new file mode 100644
index 000000000000..cb994f961c87
--- /dev/null
+++ b/compute/etc/pgbouncer.ini
@@ -0,0 +1,17 @@
+[databases]
+*=host=localhost port=5432 auth_user=cloud_admin
+[pgbouncer]
+listen_port=6432
+listen_addr=0.0.0.0
+auth_type=scram-sha-256
+auth_user=cloud_admin
+auth_dbname=postgres
+client_tls_sslmode=disable
+server_tls_sslmode=disable
+pool_mode=transaction
+max_client_conn=10000
+default_pool_size=64
+max_prepared_statements=0
+admin_users=postgres
+unix_socket_dir=/tmp/
+unix_socket_mode=0777
diff --git a/compute/etc/sql_exporter.yml b/compute/etc/sql_exporter.yml
new file mode 100644
index 000000000000..139d04468ab6
--- /dev/null
+++ b/compute/etc/sql_exporter.yml
@@ -0,0 +1,33 @@
+# Configuration for sql_exporter
+# Global defaults.
+global:
+  # If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s.
+  scrape_timeout: 10s
+  # Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first.
+  scrape_timeout_offset: 500ms
+  # Minimum interval between collector runs: by default (0s) collectors are executed on every scrape.
+  min_interval: 0s
+  # Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections,
+  # as will concurrent scrapes.
+  max_connections: 1
+  # Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should
+  # always be the same as max_connections.
+  max_idle_connections: 1
+  # Maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse.
+  # If 0, connections are not closed due to a connection's age.
+  max_connection_lifetime: 5m
+
+# The target to monitor and the collectors to execute on it.
+target:
+  # Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL)
+  # the schema gets dropped or replaced to match the driver expected DSN format.
+  data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter'
+
+  # Collectors (referenced by name) to execute on the target.
+  # Glob patterns are supported (see for syntax).
+  collectors: [neon_collector]
+
+# Collector files specifies a list of globs. One collector definition is read from each matching file.
+# Glob patterns are supported (see for syntax).
+collector_files:
+  - "neon_collector.yml"
diff --git a/compute/etc/sql_exporter_autoscaling.yml b/compute/etc/sql_exporter_autoscaling.yml
new file mode 100644
index 000000000000..044557233ee3
--- /dev/null
+++ b/compute/etc/sql_exporter_autoscaling.yml
@@ -0,0 +1,33 @@
+# Configuration for sql_exporter for autoscaling-agent
+# Global defaults.
+global:
+  # If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s.
+  scrape_timeout: 10s
+  # Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first.
+  scrape_timeout_offset: 500ms
+  # Minimum interval between collector runs: by default (0s) collectors are executed on every scrape.
+  min_interval: 0s
+  # Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections,
+  # as will concurrent scrapes.
+  max_connections: 1
+  # Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should
+  # always be the same as max_connections.
+  max_idle_connections: 1
+  # Maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse.
+  # If 0, connections are not closed due to a connection's age.
+  max_connection_lifetime: 5m
+
+# The target to monitor and the collectors to execute on it.
+target:
+  # Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL)
+  # the schema gets dropped or replaced to match the driver expected DSN format.
+  data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter_autoscaling'
+
+  # Collectors (referenced by name) to execute on the target.
+  # Glob patterns are supported (see for syntax).
+  collectors: [neon_collector_autoscaling]
+
+# Collector files specifies a list of globs. One collector definition is read from each matching file.
+# Glob patterns are supported (see for syntax).
+collector_files:
+  - "neon_collector_autoscaling.yml"
diff --git a/compute/vm-image-spec.yaml b/compute/vm-image-spec.yaml
index c94f95f447de..0af44745e5ef 100644
--- a/compute/vm-image-spec.yaml
+++ b/compute/vm-image-spec.yaml
@@ -35,25 +35,6 @@ files:
       # Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap
      # as root without requiring entering a password (NOPASSWD), regardless of hostname (ALL)
       postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap
-  - filename: pgbouncer.ini
-    content: |
-      [databases]
-      *=host=localhost port=5432 auth_user=cloud_admin
-      [pgbouncer]
-      listen_port=6432
-      listen_addr=0.0.0.0
-      auth_type=scram-sha-256
-      auth_user=cloud_admin
-      auth_dbname=postgres
-      client_tls_sslmode=disable
-      server_tls_sslmode=disable
-      pool_mode=transaction
-      max_client_conn=10000
-      default_pool_size=64
-      max_prepared_statements=0
-      admin_users=postgres
-      unix_socket_dir=/tmp/
-      unix_socket_mode=0777
   - filename: cgconfig.conf
     content: |
       # Configuration for cgroups in VM compute nodes
@@ -68,385 +49,6 @@ files:
       }
       memory {}
       }
-  - filename: sql_exporter.yml
-    content: |
-      # Configuration for sql_exporter
-      # Global defaults.
-      global:
-        # If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s.
-        scrape_timeout: 10s
-        # Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first.
-        scrape_timeout_offset: 500ms
-        # Minimum interval between collector runs: by default (0s) collectors are executed on every scrape.
-        min_interval: 0s
-        # Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections,
-        # as will concurrent scrapes.
-        max_connections: 1
-        # Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should
-        # always be the same as max_connections.
-        max_idle_connections: 1
-        # Maximum number of maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse.
-        # If 0, connections are not closed due to a connection's age.
-        max_connection_lifetime: 5m
-
-      # The target to monitor and the collectors to execute on it.
-      target:
-        # Data source name always has a URI schema that matches the driver name. In some cases (e.g.
MySQL) - # the schema gets dropped or replaced to match the driver expected DSN format. - data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter' - - # Collectors (referenced by name) to execute on the target. - # Glob patterns are supported (see for syntax). - collectors: [neon_collector] - - # Collector files specifies a list of globs. One collector definition is read from each matching file. - # Glob patterns are supported (see for syntax). - collector_files: - - "neon_collector.yml" - - filename: sql_exporter_autoscaling.yml - content: | - # Configuration for sql_exporter for autoscaling-agent - # Global defaults. - global: - # If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s. - scrape_timeout: 10s - # Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first. - scrape_timeout_offset: 500ms - # Minimum interval between collector runs: by default (0s) collectors are executed on every scrape. - min_interval: 0s - # Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections, - # as will concurrent scrapes. - max_connections: 1 - # Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should - # always be the same as max_connections. - max_idle_connections: 1 - # Maximum number of maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse. - # If 0, connections are not closed due to a connection's age. - max_connection_lifetime: 5m - - # The target to monitor and the collectors to execute on it. - target: - # Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL) - # the schema gets dropped or replaced to match the driver expected DSN format. - data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter_autoscaling' - - # Collectors (referenced by name) to execute on the target. - # Glob patterns are supported (see for syntax). - collectors: [neon_collector_autoscaling] - - # Collector files specifies a list of globs. One collector definition is read from each matching file. - # Glob patterns are supported (see for syntax). 
- collector_files: - - "neon_collector_autoscaling.yml" - - filename: neon_collector.yml - content: | - collector_name: neon_collector - metrics: - - metric_name: lfc_misses - type: gauge - help: 'lfc_misses' - key_labels: - values: [lfc_misses] - query: | - select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses'; - - - metric_name: lfc_used - type: gauge - help: 'LFC chunks used (chunk = 1MB)' - key_labels: - values: [lfc_used] - query: | - select lfc_value as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used'; - - - metric_name: lfc_hits - type: gauge - help: 'lfc_hits' - key_labels: - values: [lfc_hits] - query: | - select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits'; - - - metric_name: lfc_writes - type: gauge - help: 'lfc_writes' - key_labels: - values: [lfc_writes] - query: | - select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes'; - - - metric_name: lfc_cache_size_limit - type: gauge - help: 'LFC cache size limit in bytes' - key_labels: - values: [lfc_cache_size_limit] - query: | - select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit; - - - metric_name: connection_counts - type: gauge - help: 'Connection counts' - key_labels: - - datname - - state - values: [count] - query: | - select datname, state, count(*) as count from pg_stat_activity where state <> '' group by datname, state; - - - metric_name: pg_stats_userdb - type: gauge - help: 'Stats for several oldest non-system dbs' - key_labels: - - datname - value_label: kind - values: - - db_size - - deadlocks - # Rows - - inserted - - updated - - deleted - # We export stats for 10 non-system database. Without this limit - # it is too easy to abuse the system by creating lots of databases. - query: | - select pg_database_size(datname) as db_size, deadlocks, - tup_inserted as inserted, tup_updated as updated, tup_deleted as deleted, - datname - from pg_stat_database - where datname IN ( - select datname - from pg_database - where datname <> 'postgres' and not datistemplate - order by oid - limit 10 - ); - - - metric_name: max_cluster_size - type: gauge - help: 'neon.max_cluster_size setting' - key_labels: - values: [max_cluster_size] - query: | - select setting::int as max_cluster_size from pg_settings where name = 'neon.max_cluster_size'; - - - metric_name: db_total_size - type: gauge - help: 'Size of all databases' - key_labels: - values: [total] - query: | - select sum(pg_database_size(datname)) as total from pg_database; - - # DEPRECATED - - metric_name: lfc_approximate_working_set_size - type: gauge - help: 'Approximate working set size in pages of 8192 bytes' - key_labels: - values: [approximate_working_set_size] - query: | - select neon.approximate_working_set_size(false) as approximate_working_set_size; - - - metric_name: lfc_approximate_working_set_size_windows - type: gauge - help: 'Approximate working set size in pages of 8192 bytes' - key_labels: [duration] - values: [size] - # NOTE: This is the "public" / "human-readable" version. Here, we supply a small selection - # of durations in a pretty-printed form. 
- query: | - select - x as duration, - neon.approximate_working_set_size_seconds(extract('epoch' from x::interval)::int) as size - from - (values ('5m'),('15m'),('1h')) as t (x); - - - metric_name: compute_current_lsn - type: gauge - help: 'Current LSN of the database' - key_labels: - values: [lsn] - query: | - select - case - when pg_catalog.pg_is_in_recovery() - then (pg_last_wal_replay_lsn() - '0/0')::FLOAT8 - else (pg_current_wal_lsn() - '0/0')::FLOAT8 - end as lsn; - - - metric_name: compute_receive_lsn - type: gauge - help: 'Returns the last write-ahead log location that has been received and synced to disk by streaming replication' - key_labels: - values: [lsn] - query: | - SELECT - CASE - WHEN pg_catalog.pg_is_in_recovery() - THEN (pg_last_wal_receive_lsn() - '0/0')::FLOAT8 - ELSE 0 - END AS lsn; - - - metric_name: replication_delay_bytes - type: gauge - help: 'Bytes between received and replayed LSN' - key_labels: - values: [replication_delay_bytes] - # We use a GREATEST call here because this calculation can be negative. - # The calculation is not atomic, meaning after we've gotten the receive - # LSN, the replay LSN may have advanced past the receive LSN we - # are using for the calculation. - query: | - SELECT GREATEST(0, pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())) AS replication_delay_bytes; - - - metric_name: replication_delay_seconds - type: gauge - help: 'Time since last LSN was replayed' - key_labels: - values: [replication_delay_seconds] - query: | - SELECT - CASE - WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0 - ELSE GREATEST (0, EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())) - END AS replication_delay_seconds; - - - metric_name: checkpoints_req - type: gauge - help: 'Number of requested checkpoints' - key_labels: - values: [checkpoints_req] - query: | - SELECT checkpoints_req FROM pg_stat_bgwriter; - - - metric_name: checkpoints_timed - type: gauge - help: 'Number of scheduled checkpoints' - key_labels: - values: [checkpoints_timed] - query: | - SELECT checkpoints_timed FROM pg_stat_bgwriter; - - - metric_name: compute_logical_snapshot_files - type: gauge - help: 'Number of snapshot files in pg_logical/snapshot' - key_labels: - - timeline_id - values: [num_logical_snapshot_files] - query: | - SELECT - (SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id, - -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. These - -- temporary snapshot files are renamed to the actual snapshot files after they are - -- completely built. We only WAL-log the completely built snapshot files. - (SELECT COUNT(*) FROM pg_ls_logicalsnapdir() WHERE name LIKE '%.snap') AS num_logical_snapshot_files; - - # In all the below metrics, we cast LSNs to floats because Prometheus only supports floats. - # It's probably fine because float64 can store integers from -2^53 to +2^53 exactly. - - # Number of slots is limited by max_replication_slots, so collecting position for all of them shouldn't be bad. 
- - metric_name: logical_slot_restart_lsn - type: gauge - help: 'restart_lsn of logical slots' - key_labels: - - slot_name - values: [restart_lsn] - query: | - select slot_name, (restart_lsn - '0/0')::FLOAT8 as restart_lsn - from pg_replication_slots - where slot_type = 'logical'; - - - metric_name: compute_subscriptions_count - type: gauge - help: 'Number of logical replication subscriptions grouped by enabled/disabled' - key_labels: - - enabled - values: [subscriptions_count] - query: | - select subenabled::text as enabled, count(*) as subscriptions_count - from pg_subscription - group by subenabled; - - - metric_name: retained_wal - type: gauge - help: 'Retained WAL in inactive replication slots' - key_labels: - - slot_name - values: [retained_wal] - query: | - SELECT slot_name, pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::FLOAT8 AS retained_wal - FROM pg_replication_slots - WHERE active = false; - - - metric_name: wal_is_lost - type: gauge - help: 'Whether or not the replication slot wal_status is lost' - key_labels: - - slot_name - values: [wal_is_lost] - query: | - SELECT slot_name, - CASE - WHEN wal_status = 'lost' THEN 1 - ELSE 0 - END AS wal_is_lost - FROM pg_replication_slots; - - - filename: neon_collector_autoscaling.yml - content: | - collector_name: neon_collector_autoscaling - metrics: - - metric_name: lfc_misses - type: gauge - help: 'lfc_misses' - key_labels: - values: [lfc_misses] - query: | - select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses'; - - - metric_name: lfc_used - type: gauge - help: 'LFC chunks used (chunk = 1MB)' - key_labels: - values: [lfc_used] - query: | - select lfc_value as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used'; - - - metric_name: lfc_hits - type: gauge - help: 'lfc_hits' - key_labels: - values: [lfc_hits] - query: | - select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits'; - - - metric_name: lfc_writes - type: gauge - help: 'lfc_writes' - key_labels: - values: [lfc_writes] - query: | - select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes'; - - - metric_name: lfc_cache_size_limit - type: gauge - help: 'LFC cache size limit in bytes' - key_labels: - values: [lfc_cache_size_limit] - query: | - select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit; - - - metric_name: lfc_approximate_working_set_size_windows - type: gauge - help: 'Approximate working set size in pages of 8192 bytes' - key_labels: [duration_seconds] - values: [size] - # NOTE: This is the "internal" / "machine-readable" version. This outputs the working set - # size looking back 1..60 minutes, labeled with the number of minutes. - query: | - select - x::text as duration_seconds, - neon.approximate_working_set_size_seconds(x) as size - from - (select generate_series * 60 as x from generate_series(1, 60)) as t (x); build: | # Build cgroup-tools # @@ -480,32 +82,6 @@ build: | && CFLAGS="-O3" ./configure --prefix="$INSTALL_DIR" --sysconfdir=/etc --localstatedir=/var --enable-opaque-hierarchy="name=systemd" \ # actually build the thing... 
      && make install
-
-  FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.1 AS postgres-exporter
-
-  FROM burningalchemist/sql_exporter:0.13 AS sql-exporter
-
-  # Build pgbouncer
-  #
-  FROM debian:bullseye-slim AS pgbouncer
-  RUN set -e \
-      && apt-get update \
-      && apt-get install -y \
-          build-essential \
-          git \
-          libevent-dev \
-          libtool \
-          pkg-config
-
-  # Use `dist_man_MANS=` to skip manpage generation (which requires python3/pandoc)
-  ENV PGBOUNCER_TAG=pgbouncer_1_22_1
-  RUN set -e \
-      && git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/pgbouncer/pgbouncer.git pgbouncer \
-      && cd pgbouncer \
-      && ./autogen.sh \
-      && LDFLAGS=-static ./configure --prefix=/usr/local/pgbouncer --without-openssl \
-      && make -j $(nproc) dist_man_MANS= \
-      && make install dist_man_MANS=

 merge: |
   # tweak nofile limits
   RUN set -e \
@@ -527,24 +103,10 @@ merge: |
 
   COPY compute_ctl-resize-swap /etc/sudoers.d/compute_ctl-resize-swap
   COPY cgconfig.conf /etc/cgconfig.conf
-  COPY pgbouncer.ini /etc/pgbouncer.ini
-  COPY sql_exporter.yml /etc/sql_exporter.yml
-  COPY neon_collector.yml /etc/neon_collector.yml
-  COPY sql_exporter_autoscaling.yml /etc/sql_exporter_autoscaling.yml
-  COPY neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml
 
   RUN set -e \
-      && chown postgres:postgres /etc/pgbouncer.ini \
-      && chmod 0666 /etc/pgbouncer.ini \
-      && chmod 0644 /etc/cgconfig.conf \
-      && chmod 0644 /etc/sql_exporter.yml \
-      && chmod 0644 /etc/neon_collector.yml \
-      && chmod 0644 /etc/sql_exporter_autoscaling.yml \
-      && chmod 0644 /etc/neon_collector_autoscaling.yml
+      && chmod 0644 /etc/cgconfig.conf
 
   COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/
   COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/
   COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/
-  COPY --from=postgres-exporter /bin/postgres_exporter /bin/postgres_exporter
-  COPY --from=sql-exporter /bin/sql_exporter /bin/sql_exporter
-  COPY --from=pgbouncer /usr/local/pgbouncer/bin/pgbouncer /usr/local/bin/pgbouncer

From 37aa6fd953285da7480cf23ab1ddfd2f6958b55e Mon Sep 17 00:00:00 2001
From: Yuchen Liang <70461588+yliang412@users.noreply.github.com>
Date: Mon, 23 Sep 2024 17:58:12 -0400
Subject: [PATCH 45/77] scrubber: retry when missing index key in the listing
 (#8873)

Part of #8128, fixes #8872.

## Problem

See #8872.

## Summary of changes

- Retry `list_timeline_blobs` one more time if
  - there are layer file keys listed but no index, or
  - the index failed to download.
  A minimal sketch of this retry shape follows below.
- Instrument the code with `analyze-tenant` and `analyze-timeline` spans.
- Remove the `initdb_archive` check, since the archive could have been deleted.
- Return exit code 1 on fatal errors if the `--exit-code` parameter is set.
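For illustration, the retry amounts to a thin wrapper around the listing
routine. This is a simplified sketch only, with error reporting elided;
the authoritative version, including the `ListTimelineBlobsResult` enum it
matches on, is in the diff that follows:

    pub(crate) async fn list_timeline_blobs(
        remote_client: &GenericRemoteStorage,
        id: TenantShardTimelineId,
        root_target: &RootTarget,
    ) -> anyhow::Result<RemoteTimelineBlobData> {
        match list_timeline_blobs_impl(remote_client, id, root_target).await? {
            // The index was found (or the timeline is a relic): use the result.
            ListTimelineBlobsResult::Ready(data) => Ok(data),
            // Layer files but no index: this can be an unordered listing
            // racing with a deletion, so list once more and report whatever
            // the second pass returns.
            ListTimelineBlobsResult::MissingIndexPart(_) => Ok(
                list_timeline_blobs_impl(remote_client, id, root_target)
                    .await?
                    .into_data(),
            ),
        }
    }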
Signed-off-by: Yuchen Liang
---
 storage_scrubber/src/checks.rs                | 133 +++++++++++++-----
 storage_scrubber/src/main.rs                  |  15 ++
 .../src/scan_pageserver_metadata.rs           |  88 +++++++-----
 3 files changed, 164 insertions(+), 72 deletions(-)

diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs
index de6918b3da26..525f412b5660 100644
--- a/storage_scrubber/src/checks.rs
+++ b/storage_scrubber/src/checks.rs
@@ -1,13 +1,12 @@
 use std::collections::{HashMap, HashSet};
 
-use anyhow::Context;
 use itertools::Itertools;
 use pageserver::tenant::checks::check_valid_layermap;
 use pageserver::tenant::layer_map::LayerMap;
 use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
 use pageserver_api::shard::ShardIndex;
 use tokio_util::sync::CancellationToken;
-use tracing::{error, info, warn};
+use tracing::{info, warn};
 use utils::generation::Generation;
 use utils::id::TimelineId;
 
@@ -29,9 +28,8 @@ pub(crate) struct TimelineAnalysis {
     /// yet.
     pub(crate) warnings: Vec<String>,
 
-    /// Keys not referenced in metadata: candidates for removal, but NOT NECESSARILY: beware
-    /// of races between reading the metadata and reading the objects.
-    pub(crate) garbage_keys: Vec<String>,
+    /// Objects whose keys were not recognized at all, i.e. not layer files, not indices, and not initdb archive.
+    pub(crate) unknown_keys: Vec<String>,
 }
 
 impl TimelineAnalysis {
@@ -39,7 +37,7 @@ impl TimelineAnalysis {
         Self {
             errors: Vec::new(),
             warnings: Vec::new(),
-            garbage_keys: Vec::new(),
+            unknown_keys: Vec::new(),
         }
     }
 
@@ -59,7 +57,7 @@ pub(crate) async fn branch_cleanup_and_check_errors(
 ) -> TimelineAnalysis {
     let mut result = TimelineAnalysis::new();
 
-    info!("Checking timeline {id}");
+    info!("Checking timeline");
 
     if let Some(s3_active_branch) = s3_active_branch {
         info!(
@@ -80,7 +78,7 @@ pub(crate) async fn branch_cleanup_and_check_errors(
     match s3_data {
         Some(s3_data) => {
             result
-                .garbage_keys
+                .unknown_keys
                 .extend(s3_data.unknown_keys.into_iter().map(|k| k.key.to_string()));
 
             match s3_data.blob_data {
@@ -204,10 +202,10 @@ pub(crate) async fn branch_cleanup_and_check_errors(
         warn!("Timeline metadata warnings: {0:?}", result.warnings);
     }
 
-    if !result.garbage_keys.is_empty() {
-        error!(
-            "The following keys should be removed from S3: {0:?}",
-            result.garbage_keys
+    if !result.unknown_keys.is_empty() {
+        warn!(
+            "The following keys are not recognized: {0:?}",
+            result.unknown_keys
        )
     }
 
@@ -294,10 +292,10 @@ impl TenantObjectListing {
 pub(crate) struct RemoteTimelineBlobData {
     pub(crate) blob_data: BlobDataParseResult,
 
-    // Index objects that were not used when loading `blob_data`, e.g. those from old generations
+    /// Index objects that were not used when loading `blob_data`, e.g. those from old generations
     pub(crate) unused_index_keys: Vec<ListingObject>,
 
-    // Objects whose keys were not recognized at all, i.e. not layer files, not indices
+    /// Objects whose keys were not recognized at all, i.e. not layer files, not indices
     pub(crate) unknown_keys: Vec<ListingObject>,
 }
 
@@ -329,11 +327,54 @@ pub(crate) fn parse_layer_object_name(name: &str) -> Result<(LayerName, Generati
     }
 }
 
+/// Note ():
+/// Since we do not guarantee the order of the listing, we could list layer keys right before
+/// pageserver `RemoteTimelineClient` deletes the layer files and then the index.
+/// In rare cases, this would give back a transient error where the index key is missing.
+///
+/// To avoid generating false positives, we try streaming the listing for a second time.
 pub(crate) async fn list_timeline_blobs(
     remote_client: &GenericRemoteStorage,
     id: TenantShardTimelineId,
     root_target: &RootTarget,
 ) -> anyhow::Result<RemoteTimelineBlobData> {
+    let res = list_timeline_blobs_impl(remote_client, id, root_target).await?;
+    match res {
+        ListTimelineBlobsResult::Ready(data) => Ok(data),
+        ListTimelineBlobsResult::MissingIndexPart(_) => {
+            // Retry if index is missing.
+            let data = list_timeline_blobs_impl(remote_client, id, root_target)
+                .await?
+                .into_data();
+            Ok(data)
+        }
+    }
+}
+
+enum ListTimelineBlobsResult {
+    /// Blob data is ready to be interpreted.
+    Ready(RemoteTimelineBlobData),
+    /// The listing has layer files but is missing [`IndexPart`].
+    MissingIndexPart(RemoteTimelineBlobData),
+}
+
+impl ListTimelineBlobsResult {
+    /// Get the inner blob data regardless of the status.
+    pub fn into_data(self) -> RemoteTimelineBlobData {
+        match self {
+            ListTimelineBlobsResult::Ready(data) => data,
+            ListTimelineBlobsResult::MissingIndexPart(data) => data,
+        }
+    }
+}
+
+/// Returns [`ListTimelineBlobsResult::MissingIndexPart`] if blob data has layer files
+/// but is missing [`IndexPart`], otherwise returns [`ListTimelineBlobsResult::Ready`].
+async fn list_timeline_blobs_impl(
+    remote_client: &GenericRemoteStorage,
+    id: TenantShardTimelineId,
+    root_target: &RootTarget,
+) -> anyhow::Result<ListTimelineBlobsResult> {
     let mut s3_layers = HashSet::new();
 
     let mut errors = Vec::new();
@@ -375,30 +416,28 @@ pub(crate) async fn list_timeline_blobs(
                     s3_layers.insert((new_layer, gen));
                 }
                 Err(e) => {
-                    tracing::info!("Error parsing key {maybe_layer_name}");
-                    errors.push(
-                        format!("S3 list response got an object with key {key} that is not a layer name: {e}"),
-                    );
+                    tracing::info!("Error parsing {maybe_layer_name} as layer name: {e}");
                     unknown_keys.push(obj);
                 }
             },
             None => {
-                tracing::warn!("Unknown key {key}");
-                errors.push(format!("S3 list response got an object with odd key {key}"));
+                tracing::info!("S3 listed an unknown key: {key}");
                 unknown_keys.push(obj);
             }
         }
     }
 
-    if index_part_keys.is_empty() && s3_layers.is_empty() && initdb_archive {
-        tracing::debug!(
-            "Timeline is empty apart from initdb archive: expected post-deletion state."
-        );
-        return Ok(RemoteTimelineBlobData {
+    if index_part_keys.is_empty() && s3_layers.is_empty() {
+        tracing::debug!("Timeline is empty: expected post-deletion state.");
+        if initdb_archive {
+            tracing::info!("Timeline is post deletion but initdb archive is still present.");
+        }
+
+        return Ok(ListTimelineBlobsResult::Ready(RemoteTimelineBlobData {
             blob_data: BlobDataParseResult::Relic,
             unused_index_keys: index_part_keys,
-            unknown_keys: Vec::new(),
-        });
+            unknown_keys,
+        }));
     }
 
     // Choose the index_part with the highest generation
@@ -424,19 +463,43 @@ pub(crate) async fn list_timeline_blobs(
     match index_part_object.as_ref() {
         Some(selected) => index_part_keys.retain(|k| k != selected),
         None => {
-            errors.push("S3 list response got no index_part.json file".to_string());
+            // It is possible that the branch was deleted after we listed some layer files,
+            // in which case the index file no longer appears in the listing.
+            errors.push(
+                "S3 list response got no index_part.json file but still has layer files"
+                    .to_string(),
+            );
+            return Ok(ListTimelineBlobsResult::MissingIndexPart(
+                RemoteTimelineBlobData {
+                    blob_data: BlobDataParseResult::Incorrect { errors, s3_layers },
+                    unused_index_keys: index_part_keys,
+                    unknown_keys,
+                },
+            ));
         }
     }
 
     if let Some(index_part_object_key) = index_part_object.as_ref() {
         let index_part_bytes =
-            download_object_with_retries(remote_client, &index_part_object_key.key)
-                .await
-                .context("index_part.json download")?;
+            match download_object_with_retries(remote_client, &index_part_object_key.key).await {
+                Ok(index_part_bytes) => index_part_bytes,
+                Err(e) => {
+                    // It is possible that the branch gets deleted between the time we list the
+                    // objects and the time we download the index part file.
+                    errors.push(format!("failed to download index_part.json: {e}"));
+                    return Ok(ListTimelineBlobsResult::MissingIndexPart(
+                        RemoteTimelineBlobData {
+                            blob_data: BlobDataParseResult::Incorrect { errors, s3_layers },
+                            unused_index_keys: index_part_keys,
+                            unknown_keys,
+                        },
+                    ));
+                }
+            };
 
         match serde_json::from_slice(&index_part_bytes) {
             Ok(index_part) => {
-                return Ok(RemoteTimelineBlobData {
+                return Ok(ListTimelineBlobsResult::Ready(RemoteTimelineBlobData {
                     blob_data: BlobDataParseResult::Parsed {
                         index_part: Box::new(index_part),
                         index_part_generation,
@@ -444,7 +507,7 @@ pub(crate) async fn list_timeline_blobs(
                     },
                     unused_index_keys: index_part_keys,
                     unknown_keys,
-                })
+                }))
             }
             Err(index_parse_error) => errors.push(format!(
                 "index_part.json body parsing error: {index_parse_error}"
@@ -458,9 +521,9 @@ pub(crate) async fn list_timeline_blobs(
         );
     }
 
-    Ok(RemoteTimelineBlobData {
+    Ok(ListTimelineBlobsResult::Ready(RemoteTimelineBlobData {
         blob_data: BlobDataParseResult::Incorrect { errors, s3_layers },
         unused_index_keys: index_part_keys,
         unknown_keys,
-    })
+    }))
 }
diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs
index ee133e2e5846..ee816534c63b 100644
--- a/storage_scrubber/src/main.rs
+++ b/storage_scrubber/src/main.rs
@@ -41,6 +41,10 @@ struct Cli {
     #[arg(long)]
     /// JWT token for authenticating with storage controller. Requires scope 'scrubber' or 'admin'.
     controller_jwt: Option<String>,
+
+    /// If set to true, the scrubber will exit with an error code on fatal errors.
+ #[arg(long, default_value_t = false)] + exit_code: bool, } #[derive(Subcommand, Debug)] @@ -203,6 +207,7 @@ async fn main() -> anyhow::Result<()> { tenant_ids, json, post_to_storcon, + cli.exit_code, ) .await } @@ -269,6 +274,7 @@ async fn main() -> anyhow::Result<()> { gc_min_age, gc_mode, post_to_storcon, + cli.exit_code, ) .await } @@ -284,6 +290,7 @@ pub async fn run_cron_job( gc_min_age: humantime::Duration, gc_mode: GcMode, post_to_storcon: bool, + exit_code: bool, ) -> anyhow::Result<()> { tracing::info!(%gc_min_age, %gc_mode, "Running pageserver-physical-gc"); pageserver_physical_gc_cmd( @@ -301,6 +308,7 @@ pub async fn run_cron_job( Vec::new(), true, post_to_storcon, + exit_code, ) .await?; @@ -349,6 +357,7 @@ pub async fn scan_pageserver_metadata_cmd( tenant_shard_ids: Vec, json: bool, post_to_storcon: bool, + exit_code: bool, ) -> anyhow::Result<()> { if controller_client.is_none() && post_to_storcon { return Err(anyhow!("Posting pageserver scan health status to storage controller requires `--controller-api` and `--controller-jwt` to run")); @@ -380,6 +389,9 @@ pub async fn scan_pageserver_metadata_cmd( if summary.is_fatal() { tracing::error!("Fatal scrub errors detected"); + if exit_code { + std::process::exit(1); + } } else if summary.is_empty() { // Strictly speaking an empty bucket is a valid bucket, but if someone ran the // scrubber they were likely expecting to scan something, and if we see no timelines @@ -391,6 +403,9 @@ pub async fn scan_pageserver_metadata_cmd( .prefix_in_bucket .unwrap_or("".to_string()) ); + if exit_code { + std::process::exit(1); + } } Ok(()) diff --git a/storage_scrubber/src/scan_pageserver_metadata.rs b/storage_scrubber/src/scan_pageserver_metadata.rs index 151ef2767296..c1ea589f7f47 100644 --- a/storage_scrubber/src/scan_pageserver_metadata.rs +++ b/storage_scrubber/src/scan_pageserver_metadata.rs @@ -12,6 +12,7 @@ use pageserver_api::controller_api::MetadataHealthUpdateRequest; use pageserver_api::shard::TenantShardId; use remote_storage::GenericRemoteStorage; use serde::Serialize; +use tracing::{info_span, Instrument}; use utils::id::TenantId; use utils::shard::ShardCount; @@ -169,45 +170,54 @@ pub async fn scan_pageserver_metadata( let mut timeline_ids = HashSet::new(); let mut timeline_generations = HashMap::new(); for (ttid, data) in timelines { - if ttid.tenant_shard_id.shard_count == highest_shard_count { - // Only analyze `TenantShardId`s with highest shard count. - - // Stash the generation of each timeline, for later use identifying orphan layers - if let BlobDataParseResult::Parsed { - index_part, - index_part_generation, - s3_layers: _s3_layers, - } = &data.blob_data - { - if index_part.deleted_at.is_some() { - // skip deleted timeline. - tracing::info!("Skip analysis of {} b/c timeline is already deleted", ttid); - continue; + async { + if ttid.tenant_shard_id.shard_count == highest_shard_count { + // Only analyze `TenantShardId`s with highest shard count. + + // Stash the generation of each timeline, for later use identifying orphan layers + if let BlobDataParseResult::Parsed { + index_part, + index_part_generation, + s3_layers: _s3_layers, + } = &data.blob_data + { + if index_part.deleted_at.is_some() { + // skip deleted timeline. 
+ tracing::info!( + "Skip analysis of {} b/c timeline is already deleted", + ttid + ); + return; + } + timeline_generations.insert(ttid, *index_part_generation); } - timeline_generations.insert(ttid, *index_part_generation); - } - // Apply checks to this timeline shard's metadata, and in the process update `tenant_objects` - // reference counts for layers across the tenant. - let analysis = branch_cleanup_and_check_errors( - remote_client, - &ttid, - &mut tenant_objects, - None, - None, - Some(data), - ) - .await; - summary.update_analysis(&ttid, &analysis); - - timeline_ids.insert(ttid.timeline_id); - } else { - tracing::info!( - "Skip analysis of {} b/c a lower shard count than {}", - ttid, - highest_shard_count.0, - ); + // Apply checks to this timeline shard's metadata, and in the process update `tenant_objects` + // reference counts for layers across the tenant. + let analysis = branch_cleanup_and_check_errors( + remote_client, + &ttid, + &mut tenant_objects, + None, + None, + Some(data), + ) + .await; + summary.update_analysis(&ttid, &analysis); + + timeline_ids.insert(ttid.timeline_id); + } else { + tracing::info!( + "Skip analysis of {} b/c a lower shard count than {}", + ttid, + highest_shard_count.0, + ); + } } + .instrument( + info_span!("analyze-timeline", shard = %ttid.tenant_shard_id.shard_slug(), timeline = %ttid.timeline_id), + ) + .await } summary.timeline_count += timeline_ids.len(); @@ -278,6 +288,7 @@ pub async fn scan_pageserver_metadata( timelines, highest_shard_count, ) + .instrument(info_span!("analyze-tenant", tenant = %prev_tenant_id)) .await; tenant_id = Some(ttid.tenant_shard_id.tenant_id); highest_shard_count = ttid.tenant_shard_id.shard_count; @@ -306,15 +317,18 @@ pub async fn scan_pageserver_metadata( tenant_timeline_results.push((ttid, data)); } + let tenant_id = tenant_id.expect("Must be set if results are present"); + if !tenant_timeline_results.is_empty() { analyze_tenant( &remote_client, - tenant_id.expect("Must be set if results are present"), + tenant_id, &mut summary, tenant_objects, tenant_timeline_results, highest_shard_count, ) + .instrument(info_span!("analyze-tenant", tenant = %tenant_id)) .await; } From 91d947654ec755820b0c7f74ea111d4865b17224 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Tue, 24 Sep 2024 09:44:45 +0200 Subject: [PATCH 46/77] Add regression tests for a cloud-based Neon instance (#8681) ## Problem We need to be able to run the regression tests against a cloud-based Neon staging instance to prepare the migration to the arm architecture. ## Summary of changes Some tests were modified to work on the cloud instance (i.e. 
added passwords, server-side copy changed to client-side, etc) --------- Co-authored-by: Alexander Bayandin --- .github/workflows/cloud-regress.yml | 102 + patches/cloud_regress_pg16.patch | 3949 +++++++++++++++++ .../cloud_regress/test_cloud_regress.py | 100 + test_runner/fixtures/utils.py | 2 +- 4 files changed, 4152 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/cloud-regress.yml create mode 100644 patches/cloud_regress_pg16.patch create mode 100644 test_runner/cloud_regress/test_cloud_regress.py diff --git a/.github/workflows/cloud-regress.yml b/.github/workflows/cloud-regress.yml new file mode 100644 index 000000000000..de6babdde39a --- /dev/null +++ b/.github/workflows/cloud-regress.yml @@ -0,0 +1,102 @@ +name: Cloud Regression Test +on: + schedule: + # * is a special character in YAML so you have to quote this string + # ┌───────────── minute (0 - 59) + # │ ┌───────────── hour (0 - 23) + # │ │ ┌───────────── day of the month (1 - 31) + # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) + # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) + - cron: '45 1 * * *' # run once a day, timezone is utc + workflow_dispatch: # adds ability to run this manually + +defaults: + run: + shell: bash -euxo pipefail {0} + +concurrency: + # Allow only one workflow + group: ${{ github.workflow }} + cancel-in-progress: true + +jobs: + regress: + env: + POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install + DEFAULT_PG_VERSION: 16 + TEST_OUTPUT: /tmp/test_output + BUILD_TYPE: remote + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} + + runs-on: us-east-2 + container: + image: neondatabase/build-tools:pinned + options: --init + + steps: + - uses: actions/checkout@v4 + with: + submodules: true + + - name: Patch the test + run: | + cd "vendor/postgres-v${DEFAULT_PG_VERSION}" + patch -p1 < "../../patches/cloud_regress_pg${DEFAULT_PG_VERSION}.patch" + + - name: Generate a random password + id: pwgen + run: | + set +x + DBPASS=$(dd if=/dev/random bs=48 count=1 2>/dev/null | base64) + echo "::add-mask::${DBPASS//\//}" + echo DBPASS="${DBPASS//\//}" >> "${GITHUB_OUTPUT}" + + - name: Change tests according to the generated password + env: + DBPASS: ${{ steps.pwgen.outputs.DBPASS }} + run: | + cd vendor/postgres-v"${DEFAULT_PG_VERSION}"/src/test/regress + for fname in sql/*.sql expected/*.out; do + sed -i.bak s/NEON_PASSWORD_PLACEHOLDER/"'${DBPASS}'"/ "${fname}" + done + for ph in $(grep NEON_MD5_PLACEHOLDER expected/password.out | awk '{print $3;}' | sort | uniq); do + USER=$(echo "${ph}" | cut -c 22-) + MD5=md5$(echo -n "${DBPASS}${USER}" | md5sum | awk '{print $1;}') + sed -i.bak "s/${ph}/${MD5}/" expected/password.out + done + + - name: Download Neon artifact + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact + path: /tmp/neon/ + prefix: latest + + - name: Run the regression tests + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ env.BUILD_TYPE }} + test_selection: cloud_regress + pg_version: ${{ env.DEFAULT_PG_VERSION }} + extra_params: -m remote_cluster + env: + BENCHMARK_CONNSTR: ${{ secrets.PG_REGRESS_CONNSTR }} + + - name: Create Allure report + id: create-allure-report + if: ${{ !cancelled() }} + uses: ./.github/actions/allure-report-generate + + - name: Post to a Slack channel + if: ${{ github.event.schedule && failure() }} + uses: slackapi/slack-github-action@v1 + with: + channel-id: "C033QLM5P7D" # on-call-staging-stream + slack-message: | + 
Periodic pg_regress on staging: ${{ job.status }} + <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run> + <${{ steps.create-allure-report.outputs.report-url }}|Allure report> + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + diff --git a/patches/cloud_regress_pg16.patch b/patches/cloud_regress_pg16.patch new file mode 100644 index 000000000000..d15d0cffebc3 --- /dev/null +++ b/patches/cloud_regress_pg16.patch @@ -0,0 +1,3949 @@ +diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out +index 0c24f6afe4..dd808ac2b4 100644 +--- a/src/test/regress/expected/aggregates.out ++++ b/src/test/regress/expected/aggregates.out +@@ -11,7 +11,8 @@ CREATE TABLE aggtest ( + b float4 + ); + \set filename :abs_srcdir '/data/agg.data' +-COPY aggtest FROM :'filename'; ++\set command '\\copy aggtest FROM ' :'filename'; ++:command + ANALYZE aggtest; + SELECT avg(four) AS avg_1 FROM onek; + avg_1 +diff --git a/src/test/regress/expected/alter_generic.out b/src/test/regress/expected/alter_generic.out +index ae54cb254f..888e2ee8bc 100644 +--- a/src/test/regress/expected/alter_generic.out ++++ b/src/test/regress/expected/alter_generic.out +@@ -15,9 +15,9 @@ DROP ROLE IF EXISTS regress_alter_generic_user1; + DROP ROLE IF EXISTS regress_alter_generic_user2; + DROP ROLE IF EXISTS regress_alter_generic_user3; + RESET client_min_messages; +-CREATE USER regress_alter_generic_user3; +-CREATE USER regress_alter_generic_user2; +-CREATE USER regress_alter_generic_user1 IN ROLE regress_alter_generic_user3; ++CREATE USER regress_alter_generic_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE USER regress_alter_generic_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE USER regress_alter_generic_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE regress_alter_generic_user3; + CREATE SCHEMA alt_nsp1; + CREATE SCHEMA alt_nsp2; + GRANT ALL ON SCHEMA alt_nsp1, alt_nsp2 TO public; +@@ -370,7 +370,7 @@ ERROR: STORAGE cannot be specified in ALTER OPERATOR FAMILY + DROP OPERATOR FAMILY alt_opf4 USING btree; + -- Should fail. Need to be SUPERUSER to do ALTER OPERATOR FAMILY .. ADD / DROP + BEGIN TRANSACTION; +-CREATE ROLE regress_alter_generic_user5 NOSUPERUSER; ++CREATE ROLE regress_alter_generic_user5 PASSWORD NEON_PASSWORD_PLACEHOLDER NOSUPERUSER; + CREATE OPERATOR FAMILY alt_opf5 USING btree; + SET ROLE regress_alter_generic_user5; + ALTER OPERATOR FAMILY alt_opf5 USING btree ADD OPERATOR 1 < (int4, int2), FUNCTION 1 btint42cmp(int4, int2); +@@ -382,7 +382,7 @@ ERROR: current transaction is aborted, commands ignored until end of transactio + ROLLBACK; + -- Should fail. Need rights to namespace for ALTER OPERATOR FAMILY .. ADD / DROP + BEGIN TRANSACTION; +-CREATE ROLE regress_alter_generic_user6; ++CREATE ROLE regress_alter_generic_user6 PASSWORD NEON_PASSWORD_PLACEHOLDER; + CREATE SCHEMA alt_nsp6; + REVOKE ALL ON SCHEMA alt_nsp6 FROM regress_alter_generic_user6; + CREATE OPERATOR FAMILY alt_nsp6.alt_opf6 USING btree; +diff --git a/src/test/regress/expected/alter_operator.out b/src/test/regress/expected/alter_operator.out +index 71bd484282..066ea4ec0d 100644 +--- a/src/test/regress/expected/alter_operator.out ++++ b/src/test/regress/expected/alter_operator.out +@@ -127,7 +127,7 @@ ERROR: operator attribute "Restrict" not recognized + -- + -- Test permission check. Must be owner to ALTER OPERATOR. 
+ -- +-CREATE USER regress_alter_op_user; ++CREATE USER regress_alter_op_user PASSWORD NEON_PASSWORD_PLACEHOLDER; + SET SESSION AUTHORIZATION regress_alter_op_user; + ALTER OPERATOR === (boolean, boolean) SET (RESTRICT = NONE); + ERROR: must be owner of operator === +diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out +index 0e439a6488..393f316c3e 100644 +--- a/src/test/regress/expected/alter_table.out ++++ b/src/test/regress/expected/alter_table.out +@@ -5,7 +5,7 @@ + SET client_min_messages TO 'warning'; + DROP ROLE IF EXISTS regress_alter_table_user1; + RESET client_min_messages; +-CREATE USER regress_alter_table_user1; ++CREATE USER regress_alter_table_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; + -- + -- add attribute + -- +@@ -3896,8 +3896,8 @@ DROP TABLE fail_part; + ALTER TABLE list_parted ATTACH PARTITION nonexistent FOR VALUES IN (1); + ERROR: relation "nonexistent" does not exist + -- check ownership of the source table +-CREATE ROLE regress_test_me; +-CREATE ROLE regress_test_not_me; ++CREATE ROLE regress_test_me PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_test_not_me PASSWORD NEON_PASSWORD_PLACEHOLDER; + CREATE TABLE not_owned_by_me (LIKE list_parted); + ALTER TABLE not_owned_by_me OWNER TO regress_test_not_me; + SET SESSION AUTHORIZATION regress_test_me; +diff --git a/src/test/regress/expected/arrays.out b/src/test/regress/expected/arrays.out +index 57a283dc59..9672d526b4 100644 +--- a/src/test/regress/expected/arrays.out ++++ b/src/test/regress/expected/arrays.out +@@ -18,7 +18,8 @@ CREATE TABLE array_op_test ( + t text[] + ); + \set filename :abs_srcdir '/data/array.data' +-COPY array_op_test FROM :'filename'; ++\set command '\\copy array_op_test FROM ' :'filename'; ++:command + ANALYZE array_op_test; + -- + -- only the 'e' array is 0-based, the others are 1-based. 
+diff --git a/src/test/regress/expected/btree_index.out b/src/test/regress/expected/btree_index.out +index 93ed5e8cc0..54bd7d535c 100644 +--- a/src/test/regress/expected/btree_index.out ++++ b/src/test/regress/expected/btree_index.out +@@ -20,13 +20,17 @@ CREATE TABLE bt_f8_heap ( + random int4 + ); + \set filename :abs_srcdir '/data/desc.data' +-COPY bt_i4_heap FROM :'filename'; ++\set command '\\copy bt_i4_heap FROM ' :'filename'; ++:command + \set filename :abs_srcdir '/data/hash.data' +-COPY bt_name_heap FROM :'filename'; ++\set command '\\copy bt_name_heap FROM ' :'filename'; ++:command + \set filename :abs_srcdir '/data/desc.data' +-COPY bt_txt_heap FROM :'filename'; ++\set command '\\copy bt_txt_heap FROM ' :'filename'; ++:command + \set filename :abs_srcdir '/data/hash.data' +-COPY bt_f8_heap FROM :'filename'; ++\set command '\\copy bt_f8_heap FROM ' :'filename'; ++:command + ANALYZE bt_i4_heap; + ANALYZE bt_name_heap; + ANALYZE bt_txt_heap; +diff --git a/src/test/regress/expected/cluster.out b/src/test/regress/expected/cluster.out +index 542c2e098c..0062d3024f 100644 +--- a/src/test/regress/expected/cluster.out ++++ b/src/test/regress/expected/cluster.out +@@ -308,7 +308,7 @@ WHERE pg_class.oid=indexrelid + -- Verify that toast tables are clusterable + CLUSTER pg_toast.pg_toast_826 USING pg_toast_826_index; + -- Verify that clustering all tables does in fact cluster the right ones +-CREATE USER regress_clstr_user; ++CREATE USER regress_clstr_user PASSWORD NEON_PASSWORD_PLACEHOLDER; + CREATE TABLE clstr_1 (a INT PRIMARY KEY); + CREATE TABLE clstr_2 (a INT PRIMARY KEY); + CREATE TABLE clstr_3 (a INT PRIMARY KEY); +@@ -497,7 +497,7 @@ DROP TABLE clstrpart; + CREATE TABLE ptnowner(i int unique) PARTITION BY LIST (i); + CREATE INDEX ptnowner_i_idx ON ptnowner(i); + CREATE TABLE ptnowner1 PARTITION OF ptnowner FOR VALUES IN (1); +-CREATE ROLE regress_ptnowner; ++CREATE ROLE regress_ptnowner PASSWORD NEON_PASSWORD_PLACEHOLDER; + CREATE TABLE ptnowner2 PARTITION OF ptnowner FOR VALUES IN (2); + ALTER TABLE ptnowner1 OWNER TO regress_ptnowner; + ALTER TABLE ptnowner OWNER TO regress_ptnowner; +diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out +index 97bbe53b64..eac3d42a79 100644 +--- a/src/test/regress/expected/collate.icu.utf8.out ++++ b/src/test/regress/expected/collate.icu.utf8.out +@@ -1016,7 +1016,7 @@ select * from collate_test1 where b ilike 'ABC'; + + reset enable_seqscan; + -- schema manipulation commands +-CREATE ROLE regress_test_role; ++CREATE ROLE regress_test_role PASSWORD NEON_PASSWORD_PLACEHOLDER; + CREATE SCHEMA test_schema; + -- We need to do this this way to cope with varying names for encodings: + SET client_min_messages TO WARNING; +diff --git a/src/test/regress/expected/constraints.out b/src/test/regress/expected/constraints.out +index cf0b80d616..e8e2a14a4a 100644 +--- a/src/test/regress/expected/constraints.out ++++ b/src/test/regress/expected/constraints.out +@@ -349,7 +349,8 @@ CREATE TABLE COPY_TBL (x INT, y TEXT, z INT, + CONSTRAINT COPY_CON + CHECK (x > 3 AND y <> 'check failed' AND x < 7 )); + \set filename :abs_srcdir '/data/constro.data' +-COPY COPY_TBL FROM :'filename'; ++\set command '\\copy COPY_TBL FROM ' :'filename'; ++:command + SELECT * FROM COPY_TBL; + x | y | z + ---+---------------+--- +@@ -358,7 +359,8 @@ SELECT * FROM COPY_TBL; + (2 rows) + + \set filename :abs_srcdir '/data/constrf.data' +-COPY COPY_TBL FROM :'filename'; ++\set command '\\copy COPY_TBL FROM ' :'filename'; ++:command + 
ERROR: new row for relation "copy_tbl" violates check constraint "copy_con" + DETAIL: Failing row contains (7, check failed, 6). + CONTEXT: COPY copy_tbl, line 2: "7 check failed 6" +@@ -799,7 +801,7 @@ DETAIL: Key (f1)=(3) conflicts with key (f1)=(3). + DROP TABLE deferred_excl; + -- Comments + -- Setup a low-level role to enforce non-superuser checks. +-CREATE ROLE regress_constraint_comments; ++CREATE ROLE regress_constraint_comments PASSWORD NEON_PASSWORD_PLACEHOLDER; + SET SESSION AUTHORIZATION regress_constraint_comments; + CREATE TABLE constraint_comments_tbl (a int CONSTRAINT the_constraint CHECK (a > 0)); + CREATE DOMAIN constraint_comments_dom AS int CONSTRAINT the_constraint CHECK (value > 0); +@@ -819,7 +821,7 @@ COMMENT ON CONSTRAINT the_constraint ON constraint_comments_tbl IS NULL; + COMMENT ON CONSTRAINT the_constraint ON DOMAIN constraint_comments_dom IS NULL; + -- unauthorized user + RESET SESSION AUTHORIZATION; +-CREATE ROLE regress_constraint_comments_noaccess; ++CREATE ROLE regress_constraint_comments_noaccess PASSWORD NEON_PASSWORD_PLACEHOLDER; + SET SESSION AUTHORIZATION regress_constraint_comments_noaccess; + COMMENT ON CONSTRAINT the_constraint ON constraint_comments_tbl IS 'no, the comment'; + ERROR: must be owner of relation constraint_comments_tbl +diff --git a/src/test/regress/expected/conversion.out b/src/test/regress/expected/conversion.out +index 442e7aff2b..525f732b03 100644 +--- a/src/test/regress/expected/conversion.out ++++ b/src/test/regress/expected/conversion.out +@@ -8,7 +8,7 @@ + CREATE FUNCTION test_enc_conversion(bytea, name, name, bool, validlen OUT int, result OUT bytea) + AS :'regresslib', 'test_enc_conversion' + LANGUAGE C STRICT; +-CREATE USER regress_conversion_user WITH NOCREATEDB NOCREATEROLE; ++CREATE USER regress_conversion_user WITH NOCREATEDB NOCREATEROLE PASSWORD NEON_PASSWORD_PLACEHOLDER; + SET SESSION AUTHORIZATION regress_conversion_user; + CREATE CONVERSION myconv FOR 'LATIN1' TO 'UTF8' FROM iso8859_1_to_utf8; + -- +diff --git a/src/test/regress/expected/copy.out b/src/test/regress/expected/copy.out +index b48365ec98..a6ef910055 100644 +--- a/src/test/regress/expected/copy.out ++++ b/src/test/regress/expected/copy.out +@@ -15,9 +15,11 @@ insert into copytest values('Unix',E'abc\ndef',2); + insert into copytest values('Mac',E'abc\rdef',3); + insert into copytest values(E'esc\\ape',E'a\\r\\\r\\\n\\nb',4); + \set filename :abs_builddir '/results/copytest.csv' +-copy copytest to :'filename' csv; ++\set command '\\copy copytest to ' :'filename' csv; ++:command + create temp table copytest2 (like copytest); +-copy copytest2 from :'filename' csv; ++\set command '\\copy copytest2 from ' :'filename' csv; ++:command + select * from copytest except select * from copytest2; + style | test | filler + -------+------+-------- +@@ -25,8 +27,10 @@ select * from copytest except select * from copytest2; + + truncate copytest2; + --- same test but with an escape char different from quote char +-copy copytest to :'filename' csv quote '''' escape E'\\'; +-copy copytest2 from :'filename' csv quote '''' escape E'\\'; ++\set command '\\copy copytest to ' :'filename' ' csv quote ' '\'\'\'\'' ' escape ' 'E\'' '\\\\\''; ++:command ++\set command '\\copy copytest2 from ' :'filename' ' csv quote ' '\'\'\'\'' ' escape ' 'E\'' '\\\\\''; ++:command + select * from copytest except select * from copytest2; + style | test | filler + -------+------+-------- +@@ -66,13 +70,16 @@ insert into parted_copytest select x,1,'One' from generate_series(1,1000) x; + insert into 
parted_copytest select x,2,'Two' from generate_series(1001,1010) x; + insert into parted_copytest select x,1,'One' from generate_series(1011,1020) x; + \set filename :abs_builddir '/results/parted_copytest.csv' +-copy (select * from parted_copytest order by a) to :'filename'; ++\set command '\\copy (select * from parted_copytest order by a) to ' :'filename'; ++:command + truncate parted_copytest; +-copy parted_copytest from :'filename'; ++\set command '\\copy parted_copytest from ' :'filename'; ++:command + -- Ensure COPY FREEZE errors for partitioned tables. + begin; + truncate parted_copytest; +-copy parted_copytest from :'filename' (freeze); ++\set command '\\copy parted_copytest from ' :'filename' (freeze); ++:command + ERROR: cannot perform COPY FREEZE on a partitioned table + rollback; + select tableoid::regclass,count(*),sum(a) from parted_copytest +@@ -94,7 +101,8 @@ create trigger part_ins_trig + before insert on parted_copytest_a2 + for each row + execute procedure part_ins_func(); +-copy parted_copytest from :'filename'; ++\set command '\\copy parted_copytest from ' :'filename'; ++:command + select tableoid::regclass,count(*),sum(a) from parted_copytest + group by tableoid order by tableoid::regclass::name; + tableoid | count | sum +@@ -106,7 +114,8 @@ group by tableoid order by tableoid::regclass::name; + truncate table parted_copytest; + create index on parted_copytest (b); + drop trigger part_ins_trig on parted_copytest_a2; +-copy parted_copytest from stdin; ++\set command '\\copy parted_copytest from ' stdin; ++:command + -- Ensure index entries were properly added during the copy. + select * from parted_copytest where b = 1; + a | b | c +@@ -170,9 +179,9 @@ INFO: progress: {"type": "PIPE", "command": "COPY FROM", "relname": "tab_progre + -- Generate COPY FROM report with FILE, with some excluded tuples. + truncate tab_progress_reporting; + \set filename :abs_srcdir '/data/emp.data' +-copy tab_progress_reporting from :'filename' +- where (salary < 2000); +-INFO: progress: {"type": "FILE", "command": "COPY FROM", "relname": "tab_progress_reporting", "has_bytes_total": true, "tuples_excluded": 1, "tuples_processed": 2, "has_bytes_processed": true} ++\set command '\\copy tab_progress_reporting from ' :'filename' 'where (salary < 2000)'; ++:command ++INFO: progress: {"type": "PIPE", "command": "COPY FROM", "relname": "tab_progress_reporting", "has_bytes_total": false, "tuples_excluded": 1, "tuples_processed": 2, "has_bytes_processed": true} + drop trigger check_after_tab_progress_reporting on tab_progress_reporting; + drop function notice_after_tab_progress_reporting(); + drop table tab_progress_reporting; +@@ -281,7 +290,8 @@ CREATE TABLE parted_si_p_odd PARTITION OF parted_si FOR VALUES IN (1); + -- https://postgr.es/m/18130-7a86a7356a75209d%40postgresql.org + -- https://postgr.es/m/257696.1695670946%40sss.pgh.pa.us + \set filename :abs_srcdir '/data/desc.data' +-COPY parted_si(id, data) FROM :'filename'; ++\set command '\\COPY parted_si(id, data) FROM ' :'filename'; ++:command + -- An earlier bug (see commit b1ecb9b3fcf) could end up using a buffer from + -- the wrong partition. This test is *not* guaranteed to trigger that bug, but + -- does so when shared_buffers is small enough. 
To test if we encountered the +diff --git a/src/test/regress/expected/copy2.out b/src/test/regress/expected/copy2.out +index faf1a4d1b0..a44c97db52 100644 +--- a/src/test/regress/expected/copy2.out ++++ b/src/test/regress/expected/copy2.out +@@ -553,8 +553,8 @@ select * from check_con_tbl; + (2 rows) + + -- test with RLS enabled. +-CREATE ROLE regress_rls_copy_user; +-CREATE ROLE regress_rls_copy_user_colperms; ++CREATE ROLE regress_rls_copy_user PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_rls_copy_user_colperms PASSWORD NEON_PASSWORD_PLACEHOLDER; + CREATE TABLE rls_t1 (a int, b int, c int); + COPY rls_t1 (a, b, c) from stdin; + CREATE POLICY p1 ON rls_t1 FOR SELECT USING (a % 2 = 0); +diff --git a/src/test/regress/expected/create_function_sql.out b/src/test/regress/expected/create_function_sql.out +index 50aca5940f..42527142f6 100644 +--- a/src/test/regress/expected/create_function_sql.out ++++ b/src/test/regress/expected/create_function_sql.out +@@ -4,7 +4,7 @@ + -- Assorted tests using SQL-language functions + -- + -- All objects made in this test are in temp_func_test schema +-CREATE USER regress_unpriv_user; ++CREATE USER regress_unpriv_user PASSWORD NEON_PASSWORD_PLACEHOLDER; + CREATE SCHEMA temp_func_test; + GRANT ALL ON SCHEMA temp_func_test TO public; + SET search_path TO temp_func_test, public; +diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out +index acfd9d1f4f..0eeb64e47a 100644 +--- a/src/test/regress/expected/create_index.out ++++ b/src/test/regress/expected/create_index.out +@@ -51,7 +51,8 @@ CREATE TABLE fast_emp4000 ( + home_base box + ); + \set filename :abs_srcdir '/data/rect.data' +-COPY slow_emp4000 FROM :'filename'; ++\set command '\\copy slow_emp4000 FROM ' :'filename'; ++:command + INSERT INTO fast_emp4000 SELECT * FROM slow_emp4000; + ANALYZE slow_emp4000; + ANALYZE fast_emp4000; +@@ -655,7 +656,8 @@ CREATE TABLE array_index_op_test ( + t text[] + ); + \set filename :abs_srcdir '/data/array.data' +-COPY array_index_op_test FROM :'filename'; ++\set command '\\copy array_index_op_test FROM ' :'filename'; ++:command + ANALYZE array_index_op_test; + SELECT * FROM array_index_op_test WHERE i = '{NULL}' ORDER BY seqno; + seqno | i | t +@@ -2822,7 +2824,7 @@ END; + -- concurrently + REINDEX SCHEMA CONCURRENTLY schema_to_reindex; + -- Failure for unauthorized user +-CREATE ROLE regress_reindexuser NOLOGIN; ++CREATE ROLE regress_reindexuser NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; + SET SESSION ROLE regress_reindexuser; + REINDEX SCHEMA schema_to_reindex; + ERROR: must be owner of schema schema_to_reindex +diff --git a/src/test/regress/expected/create_procedure.out b/src/test/regress/expected/create_procedure.out +index 2177ba3509..ae3ca94d00 100644 +--- a/src/test/regress/expected/create_procedure.out ++++ b/src/test/regress/expected/create_procedure.out +@@ -421,7 +421,7 @@ ERROR: cp_testfunc1(integer) is not a procedure + DROP PROCEDURE nonexistent(); + ERROR: procedure nonexistent() does not exist + -- privileges +-CREATE USER regress_cp_user1; ++CREATE USER regress_cp_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; + GRANT INSERT ON cp_test TO regress_cp_user1; + REVOKE EXECUTE ON PROCEDURE ptest1(text) FROM PUBLIC; + SET ROLE regress_cp_user1; +diff --git a/src/test/regress/expected/create_role.out b/src/test/regress/expected/create_role.out +index 46d4f9efe9..fc2a28a2f6 100644 +--- a/src/test/regress/expected/create_role.out ++++ b/src/test/regress/expected/create_role.out +@@ -1,28 +1,28 @@ + -- ok, 
superuser can create users with any set of privileges +-CREATE ROLE regress_role_super SUPERUSER; +-CREATE ROLE regress_role_admin CREATEDB CREATEROLE REPLICATION BYPASSRLS; ++CREATE ROLE regress_role_super SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_role_admin CREATEDB CREATEROLE REPLICATION BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER; + GRANT CREATE ON DATABASE regression TO regress_role_admin WITH GRANT OPTION; +-CREATE ROLE regress_role_limited_admin CREATEROLE; +-CREATE ROLE regress_role_normal; ++CREATE ROLE regress_role_limited_admin CREATEROLE PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_role_normal PASSWORD NEON_PASSWORD_PLACEHOLDER; + -- fail, CREATEROLE user can't give away role attributes without having them + SET SESSION AUTHORIZATION regress_role_limited_admin; +-CREATE ROLE regress_nosuch_superuser SUPERUSER; ++CREATE ROLE regress_nosuch_superuser SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; + ERROR: permission denied to create role + DETAIL: Only roles with the SUPERUSER attribute may create roles with the SUPERUSER attribute. +-CREATE ROLE regress_nosuch_replication_bypassrls REPLICATION BYPASSRLS; ++CREATE ROLE regress_nosuch_replication_bypassrls REPLICATION BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER; + ERROR: permission denied to create role + DETAIL: Only roles with the REPLICATION attribute may create roles with the REPLICATION attribute. +-CREATE ROLE regress_nosuch_replication REPLICATION; ++CREATE ROLE regress_nosuch_replication REPLICATION PASSWORD NEON_PASSWORD_PLACEHOLDER; + ERROR: permission denied to create role + DETAIL: Only roles with the REPLICATION attribute may create roles with the REPLICATION attribute. +-CREATE ROLE regress_nosuch_bypassrls BYPASSRLS; ++CREATE ROLE regress_nosuch_bypassrls BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER; + ERROR: permission denied to create role + DETAIL: Only roles with the BYPASSRLS attribute may create roles with the BYPASSRLS attribute. +-CREATE ROLE regress_nosuch_createdb CREATEDB; ++CREATE ROLE regress_nosuch_createdb CREATEDB PASSWORD NEON_PASSWORD_PLACEHOLDER; + ERROR: permission denied to create role + DETAIL: Only roles with the CREATEDB attribute may create roles with the CREATEDB attribute. 
+ -- ok, can create a role without any special attributes +-CREATE ROLE regress_role_limited; ++CREATE ROLE regress_role_limited PASSWORD NEON_PASSWORD_PLACEHOLDER; + -- fail, can't give it in any of the restricted attributes + ALTER ROLE regress_role_limited SUPERUSER; + ERROR: permission denied to alter role +@@ -39,10 +39,10 @@ DETAIL: Only roles with the BYPASSRLS attribute may change the BYPASSRLS attrib + DROP ROLE regress_role_limited; + -- ok, can give away these role attributes if you have them + SET SESSION AUTHORIZATION regress_role_admin; +-CREATE ROLE regress_replication_bypassrls REPLICATION BYPASSRLS; +-CREATE ROLE regress_replication REPLICATION; +-CREATE ROLE regress_bypassrls BYPASSRLS; +-CREATE ROLE regress_createdb CREATEDB; ++CREATE ROLE regress_replication_bypassrls REPLICATION BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_replication REPLICATION PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_bypassrls BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_createdb CREATEDB PASSWORD NEON_PASSWORD_PLACEHOLDER; + -- ok, can toggle these role attributes off and on if you have them + ALTER ROLE regress_replication NOREPLICATION; + ALTER ROLE regress_replication REPLICATION; +@@ -58,48 +58,48 @@ ALTER ROLE regress_createdb NOSUPERUSER; + ERROR: permission denied to alter role + DETAIL: Only roles with the SUPERUSER attribute may change the SUPERUSER attribute. + -- ok, having CREATEROLE is enough to create users with these privileges +-CREATE ROLE regress_createrole CREATEROLE NOINHERIT; ++CREATE ROLE regress_createrole CREATEROLE NOINHERIT PASSWORD NEON_PASSWORD_PLACEHOLDER; + GRANT CREATE ON DATABASE regression TO regress_createrole WITH GRANT OPTION; +-CREATE ROLE regress_login LOGIN; +-CREATE ROLE regress_inherit INHERIT; +-CREATE ROLE regress_connection_limit CONNECTION LIMIT 5; +-CREATE ROLE regress_encrypted_password ENCRYPTED PASSWORD 'foo'; +-CREATE ROLE regress_password_null PASSWORD NULL; ++CREATE ROLE regress_login LOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_inherit INHERIT PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_connection_limit CONNECTION LIMIT 5 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_encrypted_password ENCRYPTED PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_password_null PASSWORD NEON_PASSWORD_PLACEHOLDER; + -- ok, backwards compatible noise words should be ignored +-CREATE ROLE regress_noiseword SYSID 12345; ++CREATE ROLE regress_noiseword SYSID 12345 PASSWORD NEON_PASSWORD_PLACEHOLDER; + NOTICE: SYSID can no longer be specified + -- fail, cannot grant membership in superuser role +-CREATE ROLE regress_nosuch_super IN ROLE regress_role_super; ++CREATE ROLE regress_nosuch_super IN ROLE regress_role_super PASSWORD NEON_PASSWORD_PLACEHOLDER; + ERROR: permission denied to grant role "regress_role_super" + DETAIL: Only roles with the SUPERUSER attribute may grant roles with the SUPERUSER attribute. 
+ -- fail, database owner cannot have members +-CREATE ROLE regress_nosuch_dbowner IN ROLE pg_database_owner; ++CREATE ROLE regress_nosuch_dbowner IN ROLE pg_database_owner PASSWORD NEON_PASSWORD_PLACEHOLDER; + ERROR: role "pg_database_owner" cannot have explicit members + -- ok, can grant other users into a role + CREATE ROLE regress_inroles ROLE + regress_role_super, regress_createdb, regress_createrole, regress_login, +- regress_inherit, regress_connection_limit, regress_encrypted_password, regress_password_null; ++ regress_inherit, regress_connection_limit, regress_encrypted_password, regress_password_null PASSWORD NEON_PASSWORD_PLACEHOLDER; + -- fail, cannot grant a role into itself +-CREATE ROLE regress_nosuch_recursive ROLE regress_nosuch_recursive; ++CREATE ROLE regress_nosuch_recursive ROLE regress_nosuch_recursive PASSWORD NEON_PASSWORD_PLACEHOLDER; + ERROR: role "regress_nosuch_recursive" is a member of role "regress_nosuch_recursive" + -- ok, can grant other users into a role with admin option + CREATE ROLE regress_adminroles ADMIN + regress_role_super, regress_createdb, regress_createrole, regress_login, +- regress_inherit, regress_connection_limit, regress_encrypted_password, regress_password_null; ++ regress_inherit, regress_connection_limit, regress_encrypted_password, regress_password_null PASSWORD NEON_PASSWORD_PLACEHOLDER; + -- fail, cannot grant a role into itself with admin option +-CREATE ROLE regress_nosuch_admin_recursive ADMIN regress_nosuch_admin_recursive; ++CREATE ROLE regress_nosuch_admin_recursive ADMIN regress_nosuch_admin_recursive PASSWORD NEON_PASSWORD_PLACEHOLDER; + ERROR: role "regress_nosuch_admin_recursive" is a member of role "regress_nosuch_admin_recursive" + -- fail, regress_createrole does not have CREATEDB privilege + SET SESSION AUTHORIZATION regress_createrole; + CREATE DATABASE regress_nosuch_db; + ERROR: permission denied to create database + -- ok, regress_createrole can create new roles +-CREATE ROLE regress_plainrole; ++CREATE ROLE regress_plainrole PASSWORD NEON_PASSWORD_PLACEHOLDER; + -- ok, roles with CREATEROLE can create new roles with it +-CREATE ROLE regress_rolecreator CREATEROLE; ++CREATE ROLE regress_rolecreator CREATEROLE PASSWORD NEON_PASSWORD_PLACEHOLDER; + -- ok, roles with CREATEROLE can create new roles with different role + -- attributes, including CREATEROLE +-CREATE ROLE regress_hasprivs CREATEROLE LOGIN INHERIT CONNECTION LIMIT 5; ++CREATE ROLE regress_hasprivs CREATEROLE LOGIN INHERIT CONNECTION LIMIT 5 PASSWORD NEON_PASSWORD_PLACEHOLDER; + -- ok, we should be able to modify a role we created + COMMENT ON ROLE regress_hasprivs IS 'some comment'; + ALTER ROLE regress_hasprivs RENAME TO regress_tenant; +@@ -141,7 +141,7 @@ ERROR: permission denied to reassign objects + DETAIL: Only roles with privileges of role "regress_tenant" may reassign objects owned by it. 
+ -- ok, create a role with a value for createrole_self_grant + SET createrole_self_grant = 'set, inherit'; +-CREATE ROLE regress_tenant2; ++CREATE ROLE regress_tenant2 PASSWORD NEON_PASSWORD_PLACEHOLDER; + GRANT CREATE ON DATABASE regression TO regress_tenant2; + -- ok, regress_tenant2 can create objects within the database + SET SESSION AUTHORIZATION regress_tenant2; +@@ -165,34 +165,34 @@ ALTER TABLE tenant2_table OWNER TO regress_tenant2; + ERROR: must be able to SET ROLE "regress_tenant2" + DROP TABLE tenant2_table; + -- fail, CREATEROLE is not enough to create roles in privileged roles +-CREATE ROLE regress_read_all_data IN ROLE pg_read_all_data; ++CREATE ROLE regress_read_all_data PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_read_all_data; + ERROR: permission denied to grant role "pg_read_all_data" + DETAIL: Only roles with the ADMIN option on role "pg_read_all_data" may grant this role. +-CREATE ROLE regress_write_all_data IN ROLE pg_write_all_data; ++CREATE ROLE regress_write_all_data PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_write_all_data; + ERROR: permission denied to grant role "pg_write_all_data" + DETAIL: Only roles with the ADMIN option on role "pg_write_all_data" may grant this role. +-CREATE ROLE regress_monitor IN ROLE pg_monitor; ++CREATE ROLE regress_monitor PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_monitor; + ERROR: permission denied to grant role "pg_monitor" + DETAIL: Only roles with the ADMIN option on role "pg_monitor" may grant this role. +-CREATE ROLE regress_read_all_settings IN ROLE pg_read_all_settings; ++CREATE ROLE regress_read_all_settings PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_read_all_settings; + ERROR: permission denied to grant role "pg_read_all_settings" + DETAIL: Only roles with the ADMIN option on role "pg_read_all_settings" may grant this role. +-CREATE ROLE regress_read_all_stats IN ROLE pg_read_all_stats; ++CREATE ROLE regress_read_all_stats PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_read_all_stats; + ERROR: permission denied to grant role "pg_read_all_stats" + DETAIL: Only roles with the ADMIN option on role "pg_read_all_stats" may grant this role. +-CREATE ROLE regress_stat_scan_tables IN ROLE pg_stat_scan_tables; ++CREATE ROLE regress_stat_scan_tables PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_stat_scan_tables; + ERROR: permission denied to grant role "pg_stat_scan_tables" + DETAIL: Only roles with the ADMIN option on role "pg_stat_scan_tables" may grant this role. +-CREATE ROLE regress_read_server_files IN ROLE pg_read_server_files; ++CREATE ROLE regress_read_server_files PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_read_server_files; + ERROR: permission denied to grant role "pg_read_server_files" + DETAIL: Only roles with the ADMIN option on role "pg_read_server_files" may grant this role. +-CREATE ROLE regress_write_server_files IN ROLE pg_write_server_files; ++CREATE ROLE regress_write_server_files PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_write_server_files; + ERROR: permission denied to grant role "pg_write_server_files" + DETAIL: Only roles with the ADMIN option on role "pg_write_server_files" may grant this role. +-CREATE ROLE regress_execute_server_program IN ROLE pg_execute_server_program; ++CREATE ROLE regress_execute_server_program PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_execute_server_program; + ERROR: permission denied to grant role "pg_execute_server_program" + DETAIL: Only roles with the ADMIN option on role "pg_execute_server_program" may grant this role. 
+-CREATE ROLE regress_signal_backend IN ROLE pg_signal_backend; ++CREATE ROLE regress_signal_backend PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_signal_backend; + ERROR: permission denied to grant role "pg_signal_backend" + DETAIL: Only roles with the ADMIN option on role "pg_signal_backend" may grant this role. + -- fail, role still owns database objects +diff --git a/src/test/regress/expected/create_schema.out b/src/test/regress/expected/create_schema.out +index 93302a07ef..1a73f083ac 100644 +--- a/src/test/regress/expected/create_schema.out ++++ b/src/test/regress/expected/create_schema.out +@@ -2,7 +2,7 @@ + -- CREATE_SCHEMA + -- + -- Schema creation with elements. +-CREATE ROLE regress_create_schema_role SUPERUSER; ++CREATE ROLE regress_create_schema_role SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; + -- Cases where schema creation fails as objects are qualified with a schema + -- that does not match with what's expected. + -- This checks all the object types that include schema qualifications. +diff --git a/src/test/regress/expected/create_view.out b/src/test/regress/expected/create_view.out +index f3f8c7b5a2..3e3e54ff4c 100644 +--- a/src/test/regress/expected/create_view.out ++++ b/src/test/regress/expected/create_view.out +@@ -18,7 +18,8 @@ CREATE TABLE real_city ( + outline path + ); + \set filename :abs_srcdir '/data/real_city.data' +-COPY real_city FROM :'filename'; ++\set command '\\copy real_city FROM ' :'filename'; ++:command + ANALYZE real_city; + SELECT * + INTO TABLE ramp +diff --git a/src/test/regress/expected/database.out b/src/test/regress/expected/database.out +index 454db91ec0..01378d7081 100644 +--- a/src/test/regress/expected/database.out ++++ b/src/test/regress/expected/database.out +@@ -1,8 +1,7 @@ + CREATE DATABASE regression_tbd + ENCODING utf8 LC_COLLATE "C" LC_CTYPE "C" TEMPLATE template0; + ALTER DATABASE regression_tbd RENAME TO regression_utf8; +-ALTER DATABASE regression_utf8 SET TABLESPACE regress_tblspace; +-ALTER DATABASE regression_utf8 RESET TABLESPACE; ++WARNING: you need to manually restart any running background workers after this command + ALTER DATABASE regression_utf8 CONNECTION_LIMIT 123; + -- Test PgDatabaseToastTable. Doing this with GRANT would be slow. 
+ BEGIN; +diff --git a/src/test/regress/expected/dependency.out b/src/test/regress/expected/dependency.out +index 6d9498cdd1..692cf979d0 100644 +--- a/src/test/regress/expected/dependency.out ++++ b/src/test/regress/expected/dependency.out +@@ -1,10 +1,10 @@ + -- + -- DEPENDENCIES + -- +-CREATE USER regress_dep_user; +-CREATE USER regress_dep_user2; +-CREATE USER regress_dep_user3; +-CREATE GROUP regress_dep_group; ++CREATE USER regress_dep_user PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE USER regress_dep_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE USER regress_dep_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE GROUP regress_dep_group PASSWORD NEON_PASSWORD_PLACEHOLDER; + CREATE TABLE deptest (f1 serial primary key, f2 text); + GRANT SELECT ON TABLE deptest TO GROUP regress_dep_group; + GRANT ALL ON TABLE deptest TO regress_dep_user, regress_dep_user2; +@@ -41,9 +41,9 @@ ERROR: role "regress_dep_user3" cannot be dropped because some objects depend o + DROP TABLE deptest; + DROP USER regress_dep_user3; + -- Test DROP OWNED +-CREATE USER regress_dep_user0; +-CREATE USER regress_dep_user1; +-CREATE USER regress_dep_user2; ++CREATE USER regress_dep_user0 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE USER regress_dep_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE USER regress_dep_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; + SET SESSION AUTHORIZATION regress_dep_user0; + -- permission denied + DROP OWNED BY regress_dep_user1; +diff --git a/src/test/regress/expected/drop_if_exists.out b/src/test/regress/expected/drop_if_exists.out +index 5e44c2c3ce..eb3bb329fb 100644 +--- a/src/test/regress/expected/drop_if_exists.out ++++ b/src/test/regress/expected/drop_if_exists.out +@@ -64,9 +64,9 @@ ERROR: type "test_domain_exists" does not exist + --- + --- role/user/group + --- +-CREATE USER regress_test_u1; +-CREATE ROLE regress_test_r1; +-CREATE GROUP regress_test_g1; ++CREATE USER regress_test_u1 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_test_r1 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE GROUP regress_test_g1 PASSWORD NEON_PASSWORD_PLACEHOLDER; + DROP USER regress_test_u2; + ERROR: role "regress_test_u2" does not exist + DROP USER IF EXISTS regress_test_u1, regress_test_u2; +diff --git a/src/test/regress/expected/equivclass.out b/src/test/regress/expected/equivclass.out +index 126f7047fe..0e2cc73426 100644 +--- a/src/test/regress/expected/equivclass.out ++++ b/src/test/regress/expected/equivclass.out +@@ -384,7 +384,7 @@ set enable_nestloop = on; + set enable_mergejoin = off; + alter table ec1 enable row level security; + create policy p1 on ec1 using (f1 < '5'::int8alias1); +-create user regress_user_ectest; ++create user regress_user_ectest PASSWORD NEON_PASSWORD_PLACEHOLDER; + grant select on ec0 to regress_user_ectest; + grant select on ec1 to regress_user_ectest; + -- without any RLS, we'll treat {a.ff, b.ff, 43} as an EquivalenceClass +diff --git a/src/test/regress/expected/event_trigger.out b/src/test/regress/expected/event_trigger.out +index 5a10958df5..a578c06ebd 100644 +--- a/src/test/regress/expected/event_trigger.out ++++ b/src/test/regress/expected/event_trigger.out +@@ -85,7 +85,7 @@ create event trigger regress_event_trigger2 on ddl_command_start + -- OK + comment on event trigger regress_event_trigger is 'test comment'; + -- drop as non-superuser should fail +-create role regress_evt_user; ++create role regress_evt_user PASSWORD NEON_PASSWORD_PLACEHOLDER; + set role regress_evt_user; + create event trigger regress_event_trigger_noperms on ddl_command_start 
+ execute procedure test_event_trigger(); +diff --git a/src/test/regress/expected/foreign_data.out b/src/test/regress/expected/foreign_data.out +index 6ed50fdcfa..caa00a345d 100644 +--- a/src/test/regress/expected/foreign_data.out ++++ b/src/test/regress/expected/foreign_data.out +@@ -14,13 +14,13 @@ CREATE FUNCTION test_fdw_handler() + SET client_min_messages TO 'warning'; + DROP ROLE IF EXISTS regress_foreign_data_user, regress_test_role, regress_test_role2, regress_test_role_super, regress_test_indirect, regress_unprivileged_role; + RESET client_min_messages; +-CREATE ROLE regress_foreign_data_user LOGIN SUPERUSER; ++CREATE ROLE regress_foreign_data_user LOGIN SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; + SET SESSION AUTHORIZATION 'regress_foreign_data_user'; +-CREATE ROLE regress_test_role; +-CREATE ROLE regress_test_role2; +-CREATE ROLE regress_test_role_super SUPERUSER; +-CREATE ROLE regress_test_indirect; +-CREATE ROLE regress_unprivileged_role; ++CREATE ROLE regress_test_role PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_test_role2 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_test_role_super SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_test_indirect PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_unprivileged_role PASSWORD NEON_PASSWORD_PLACEHOLDER; + CREATE FOREIGN DATA WRAPPER dummy; + COMMENT ON FOREIGN DATA WRAPPER dummy IS 'useless'; + CREATE FOREIGN DATA WRAPPER postgresql VALIDATOR postgresql_fdw_validator; +diff --git a/src/test/regress/expected/foreign_key.out b/src/test/regress/expected/foreign_key.out +index 12e523c737..8872e23935 100644 +--- a/src/test/regress/expected/foreign_key.out ++++ b/src/test/regress/expected/foreign_key.out +@@ -1968,7 +1968,7 @@ ALTER TABLE fk_partitioned_fk ATTACH PARTITION fk_partitioned_fk_2 + FOR VALUES IN (1600); + -- leave these tables around intentionally + -- test the case when the referenced table is owned by a different user +-create role regress_other_partitioned_fk_owner; ++create role regress_other_partitioned_fk_owner PASSWORD NEON_PASSWORD_PLACEHOLDER; + grant references on fk_notpartitioned_pk to regress_other_partitioned_fk_owner; + set role regress_other_partitioned_fk_owner; + create table other_partitioned_fk(a int, b int) partition by list (a); +diff --git a/src/test/regress/expected/generated.out b/src/test/regress/expected/generated.out +index 0f623f7119..b48588a54e 100644 +--- a/src/test/regress/expected/generated.out ++++ b/src/test/regress/expected/generated.out +@@ -534,7 +534,7 @@ CREATE TABLE gtest10a (a int PRIMARY KEY, b int GENERATED ALWAYS AS (a * 2) STOR + ALTER TABLE gtest10a DROP COLUMN b; + INSERT INTO gtest10a (a) VALUES (1); + -- privileges +-CREATE USER regress_user11; ++CREATE USER regress_user11 PASSWORD NEON_PASSWORD_PLACEHOLDER; + CREATE TABLE gtest11s (a int PRIMARY KEY, b int, c int GENERATED ALWAYS AS (b * 2) STORED); + INSERT INTO gtest11s VALUES (1, 10), (2, 20); + GRANT SELECT (a, c) ON gtest11s TO regress_user11; +diff --git a/src/test/regress/expected/guc.out b/src/test/regress/expected/guc.out +index 127c953297..e6f8272f99 100644 +--- a/src/test/regress/expected/guc.out ++++ b/src/test/regress/expected/guc.out +@@ -584,7 +584,7 @@ PREPARE foo AS SELECT 1; + LISTEN foo_event; + SET vacuum_cost_delay = 13; + CREATE TEMP TABLE tmp_foo (data text) ON COMMIT DELETE ROWS; +-CREATE ROLE regress_guc_user; ++CREATE ROLE regress_guc_user PASSWORD NEON_PASSWORD_PLACEHOLDER; + SET SESSION AUTHORIZATION regress_guc_user; + -- look changes + 
SELECT pg_listening_channels(); +diff --git a/src/test/regress/expected/hash_index.out b/src/test/regress/expected/hash_index.out +index a2036a1597..805d73b9d2 100644 +--- a/src/test/regress/expected/hash_index.out ++++ b/src/test/regress/expected/hash_index.out +@@ -20,10 +20,14 @@ CREATE TABLE hash_f8_heap ( + random float8 + ); + \set filename :abs_srcdir '/data/hash.data' +-COPY hash_i4_heap FROM :'filename'; +-COPY hash_name_heap FROM :'filename'; +-COPY hash_txt_heap FROM :'filename'; +-COPY hash_f8_heap FROM :'filename'; ++\set command '\\copy hash_i4_heap FROM ' :'filename'; ++:command ++\set command '\\copy hash_name_heap FROM ' :'filename'; ++:command ++\set command '\\copy hash_txt_heap FROM ' :'filename'; ++:command ++\set command '\\copy hash_f8_heap FROM ' :'filename'; ++:command + -- the data in this file has a lot of duplicates in the index key + -- fields, leading to long bucket chains and lots of table expansion. + -- this is therefore a stress test of the bucket overflow code (unlike +diff --git a/src/test/regress/expected/identity.out b/src/test/regress/expected/identity.out +index cc7772349f..98a08eb48d 100644 +--- a/src/test/regress/expected/identity.out ++++ b/src/test/regress/expected/identity.out +@@ -520,7 +520,7 @@ ALTER TABLE itest7 ALTER COLUMN a SET GENERATED BY DEFAULT; + ALTER TABLE itest7 ALTER COLUMN a RESTART; + ALTER TABLE itest7 ALTER COLUMN a DROP IDENTITY; + -- privileges +-CREATE USER regress_identity_user1; ++CREATE USER regress_identity_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; + CREATE TABLE itest8 (a int GENERATED ALWAYS AS IDENTITY, b text); + GRANT SELECT, INSERT ON itest8 TO regress_identity_user1; + SET ROLE regress_identity_user1; +diff --git a/src/test/regress/expected/inherit.out b/src/test/regress/expected/inherit.out +index 4943429e9b..0257f22b15 100644 +--- a/src/test/regress/expected/inherit.out ++++ b/src/test/regress/expected/inherit.out +@@ -2606,7 +2606,7 @@ create index on permtest_parent (left(c, 3)); + insert into permtest_parent + select 1, 'a', left(fipshash(i::text), 5) from generate_series(0, 100) i; + analyze permtest_parent; +-create role regress_no_child_access; ++create role regress_no_child_access PASSWORD NEON_PASSWORD_PLACEHOLDER; + revoke all on permtest_grandchild from regress_no_child_access; + grant select on permtest_parent to regress_no_child_access; + set session authorization regress_no_child_access; +diff --git a/src/test/regress/expected/insert.out b/src/test/regress/expected/insert.out +index cf4b5221a8..fa6ccb639c 100644 +--- a/src/test/regress/expected/insert.out ++++ b/src/test/regress/expected/insert.out +@@ -802,7 +802,7 @@ drop table mlparted5; + -- appropriate key description (or none) in various situations + create table key_desc (a int, b int) partition by list ((a+0)); + create table key_desc_1 partition of key_desc for values in (1) partition by range (b); +-create user regress_insert_other_user; ++create user regress_insert_other_user PASSWORD NEON_PASSWORD_PLACEHOLDER; + grant select (a) on key_desc_1 to regress_insert_other_user; + grant insert on key_desc to regress_insert_other_user; + set role regress_insert_other_user; +@@ -914,7 +914,7 @@ DETAIL: Failing row contains (2, hi there). 
+ -- check that the message shows the appropriate column description in a + -- situation where the partitioned table is not the primary ModifyTable node + create table inserttest3 (f1 text default 'foo', f2 text default 'bar', f3 int); +-create role regress_coldesc_role; ++create role regress_coldesc_role PASSWORD NEON_PASSWORD_PLACEHOLDER; + grant insert on inserttest3 to regress_coldesc_role; + grant insert on brtrigpartcon to regress_coldesc_role; + revoke select on brtrigpartcon from regress_coldesc_role; +diff --git a/src/test/regress/expected/jsonb.out b/src/test/regress/expected/jsonb.out +index f8a7dac960..64dcaf171c 100644 +--- a/src/test/regress/expected/jsonb.out ++++ b/src/test/regress/expected/jsonb.out +@@ -4,7 +4,8 @@ CREATE TABLE testjsonb ( + j jsonb + ); + \set filename :abs_srcdir '/data/jsonb.data' +-COPY testjsonb FROM :'filename'; ++\set command '\\copy testjsonb FROM ' :'filename'; ++:command + -- Strings. + SELECT '""'::jsonb; -- OK. + jsonb +diff --git a/src/test/regress/expected/largeobject.out b/src/test/regress/expected/largeobject.out +index 4921dd79ae..d18a3cdd66 100644 +--- a/src/test/regress/expected/largeobject.out ++++ b/src/test/regress/expected/largeobject.out +@@ -7,7 +7,7 @@ + -- ensure consistent test output regardless of the default bytea format + SET bytea_output TO escape; + -- Test ALTER LARGE OBJECT OWNER +-CREATE ROLE regress_lo_user; ++CREATE ROLE regress_lo_user PASSWORD NEON_PASSWORD_PLACEHOLDER; + SELECT lo_create(42); + lo_create + ----------- +@@ -346,7 +346,8 @@ SELECT lo_unlink(loid) from lotest_stash_values; + + TRUNCATE lotest_stash_values; + \set filename :abs_srcdir '/data/tenk.data' +-INSERT INTO lotest_stash_values (loid) SELECT lo_import(:'filename'); ++\lo_import :filename ++INSERT INTO lotest_stash_values (loid) VALUES (:LASTOID); + BEGIN; + UPDATE lotest_stash_values SET fd=lo_open(loid, CAST(x'20000' | x'40000' AS integer)); + -- verify length of large object +@@ -410,12 +411,8 @@ SELECT lo_close(fd) FROM lotest_stash_values; + + END; + \set filename :abs_builddir '/results/lotest.txt' +-SELECT lo_export(loid, :'filename') FROM lotest_stash_values; +- lo_export +------------ +- 1 +-(1 row) +- ++SELECT loid FROM lotest_stash_values \gset ++\lo_export :loid, :filename + \lo_import :filename + \set newloid :LASTOID + -- just make sure \lo_export does not barf +diff --git a/src/test/regress/expected/lock.out b/src/test/regress/expected/lock.out +index ad137d3645..8dac447436 100644 +--- a/src/test/regress/expected/lock.out ++++ b/src/test/regress/expected/lock.out +@@ -16,7 +16,7 @@ CREATE VIEW lock_view3 AS SELECT * from lock_view2; + CREATE VIEW lock_view4 AS SELECT (select a from lock_tbl1a limit 1) from lock_tbl1; + CREATE VIEW lock_view5 AS SELECT * from lock_tbl1 where a in (select * from lock_tbl1a); + CREATE VIEW lock_view6 AS SELECT * from (select * from lock_tbl1) sub; +-CREATE ROLE regress_rol_lock1; ++CREATE ROLE regress_rol_lock1 PASSWORD NEON_PASSWORD_PLACEHOLDER; + ALTER ROLE regress_rol_lock1 SET search_path = lock_schema1; + GRANT USAGE ON SCHEMA lock_schema1 TO regress_rol_lock1; + -- Try all valid lock options; also try omitting the optional TABLE keyword. 
+diff --git a/src/test/regress/expected/matview.out b/src/test/regress/expected/matview.out +index 67a50bde3d..7eeafd2603 100644 +--- a/src/test/regress/expected/matview.out ++++ b/src/test/regress/expected/matview.out +@@ -549,7 +549,7 @@ SELECT * FROM mvtest_mv_v; + DROP TABLE mvtest_v CASCADE; + NOTICE: drop cascades to materialized view mvtest_mv_v + -- make sure running as superuser works when MV owned by another role (bug #11208) +-CREATE ROLE regress_user_mvtest; ++CREATE ROLE regress_user_mvtest PASSWORD NEON_PASSWORD_PLACEHOLDER; + SET ROLE regress_user_mvtest; + -- this test case also checks for ambiguity in the queries issued by + -- refresh_by_match_merge(), by choosing column names that intentionally +@@ -615,7 +615,7 @@ HINT: Use the REFRESH MATERIALIZED VIEW command. + ROLLBACK; + -- INSERT privileges if relation owner is not allowed to insert. + CREATE SCHEMA matview_schema; +-CREATE USER regress_matview_user; ++CREATE USER regress_matview_user PASSWORD NEON_PASSWORD_PLACEHOLDER; + ALTER DEFAULT PRIVILEGES FOR ROLE regress_matview_user + REVOKE INSERT ON TABLES FROM regress_matview_user; + GRANT ALL ON SCHEMA matview_schema TO public; +diff --git a/src/test/regress/expected/merge.out b/src/test/regress/expected/merge.out +index bc9a59803f..5b9ddf0626 100644 +--- a/src/test/regress/expected/merge.out ++++ b/src/test/regress/expected/merge.out +@@ -1,9 +1,9 @@ + -- + -- MERGE + -- +-CREATE USER regress_merge_privs; +-CREATE USER regress_merge_no_privs; +-CREATE USER regress_merge_none; ++CREATE USER regress_merge_privs PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE USER regress_merge_no_privs PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE USER regress_merge_none PASSWORD NEON_PASSWORD_PLACEHOLDER; + DROP TABLE IF EXISTS target; + NOTICE: table "target" does not exist, skipping + DROP TABLE IF EXISTS source; +diff --git a/src/test/regress/expected/misc.out b/src/test/regress/expected/misc.out +index 6e816c57f1..6ef45b468e 100644 +--- a/src/test/regress/expected/misc.out ++++ b/src/test/regress/expected/misc.out +@@ -59,9 +59,11 @@ DROP TABLE tmp; + -- copy + -- + \set filename :abs_builddir '/results/onek.data' +-COPY onek TO :'filename'; ++\set command '\\copy onek TO ' :'filename'; ++:command + CREATE TEMP TABLE onek_copy (LIKE onek); +-COPY onek_copy FROM :'filename'; ++\set command '\\copy onek_copy FROM ' :'filename'; ++:command + SELECT * FROM onek EXCEPT ALL SELECT * FROM onek_copy; + unique1 | unique2 | two | four | ten | twenty | hundred | thousand | twothousand | fivethous | tenthous | odd | even | stringu1 | stringu2 | string4 + ---------+---------+-----+------+-----+--------+---------+----------+-------------+-----------+----------+-----+------+----------+----------+--------- +@@ -73,9 +75,11 @@ SELECT * FROM onek_copy EXCEPT ALL SELECT * FROM onek; + (0 rows) + + \set filename :abs_builddir '/results/stud_emp.data' +-COPY BINARY stud_emp TO :'filename'; ++\set command '\\COPY BINARY stud_emp TO ' :'filename'; ++:command + CREATE TEMP TABLE stud_emp_copy (LIKE stud_emp); +-COPY BINARY stud_emp_copy FROM :'filename'; ++\set command '\\COPY BINARY stud_emp_copy FROM ' :'filename'; ++:command + SELECT * FROM stud_emp_copy; + name | age | location | salary | manager | gpa | percent + -------+-----+------------+--------+---------+-----+--------- +diff --git a/src/test/regress/expected/misc_functions.out b/src/test/regress/expected/misc_functions.out +index c669948370..47111b1d24 100644 +--- a/src/test/regress/expected/misc_functions.out ++++ 
b/src/test/regress/expected/misc_functions.out +@@ -297,7 +297,7 @@ SELECT pg_log_backend_memory_contexts(pid) FROM pg_stat_activity + t + (1 row) + +-CREATE ROLE regress_log_memory; ++CREATE ROLE regress_log_memory PASSWORD NEON_PASSWORD_PLACEHOLDER; + SELECT has_function_privilege('regress_log_memory', + 'pg_log_backend_memory_contexts(integer)', 'EXECUTE'); -- no + has_function_privilege +@@ -483,7 +483,7 @@ select count(*) > 0 from + -- + -- Test replication slot directory functions + -- +-CREATE ROLE regress_slot_dir_funcs; ++CREATE ROLE regress_slot_dir_funcs PASSWORD NEON_PASSWORD_PLACEHOLDER; + -- Not available by default. + SELECT has_function_privilege('regress_slot_dir_funcs', + 'pg_ls_logicalsnapdir()', 'EXECUTE'); +diff --git a/src/test/regress/expected/object_address.out b/src/test/regress/expected/object_address.out +index fc42d418bf..e38f517574 100644 +--- a/src/test/regress/expected/object_address.out ++++ b/src/test/regress/expected/object_address.out +@@ -5,7 +5,7 @@ + SET client_min_messages TO 'warning'; + DROP ROLE IF EXISTS regress_addr_user; + RESET client_min_messages; +-CREATE USER regress_addr_user; ++CREATE USER regress_addr_user PASSWORD NEON_PASSWORD_PLACEHOLDER; + -- Test generic object addressing/identification functions + CREATE SCHEMA addr_nsp; + SET search_path TO 'addr_nsp'; +diff --git a/src/test/regress/expected/password.out b/src/test/regress/expected/password.out +index 8475231735..1afae5395f 100644 +--- a/src/test/regress/expected/password.out ++++ b/src/test/regress/expected/password.out +@@ -12,11 +12,11 @@ SET password_encryption = 'md5'; -- ok + SET password_encryption = 'scram-sha-256'; -- ok + -- consistency of password entries + SET password_encryption = 'md5'; +-CREATE ROLE regress_passwd1 PASSWORD 'role_pwd1'; +-CREATE ROLE regress_passwd2 PASSWORD 'role_pwd2'; ++CREATE ROLE regress_passwd1 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER; + SET password_encryption = 'scram-sha-256'; +-CREATE ROLE regress_passwd3 PASSWORD 'role_pwd3'; +-CREATE ROLE regress_passwd4 PASSWORD NULL; ++CREATE ROLE regress_passwd3 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd4 PASSWORD NEON_PASSWORD_PLACEHOLDER; + -- check list of created entries + -- + -- The scram secret will look something like: +@@ -30,10 +30,10 @@ SELECT rolname, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+ + ORDER BY rolname, rolpassword; + rolname | rolpassword_masked + -----------------+--------------------------------------------------- +- regress_passwd1 | md5783277baca28003b33453252be4dbb34 +- regress_passwd2 | md54044304ba511dd062133eb5b4b84a2a3 ++ regress_passwd1 | NEON_MD5_PLACEHOLDER_regress_passwd1 ++ regress_passwd2 | NEON_MD5_PLACEHOLDER_regress_passwd2 + regress_passwd3 | SCRAM-SHA-256$4096:$: +- regress_passwd4 | ++ regress_passwd4 | SCRAM-SHA-256$4096:$: + (4 rows) + + -- Rename a role +@@ -54,24 +54,30 @@ ALTER ROLE regress_passwd2_new RENAME TO regress_passwd2; + -- passwords. 
+ SET password_encryption = 'md5'; + -- encrypt with MD5 +-ALTER ROLE regress_passwd2 PASSWORD 'foo'; ++ALTER ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER; + -- already encrypted, use as they are + ALTER ROLE regress_passwd1 PASSWORD 'md5cd3578025fe2c3d7ed1b9a9b26238b70'; ++ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} + ALTER ROLE regress_passwd3 PASSWORD 'SCRAM-SHA-256$4096:VLK4RMaQLCvNtQ==$6YtlR4t69SguDiwFvbVgVZtuz6gpJQQqUMZ7IQJK5yI=:ps75jrHeYU4lXCcXI4O8oIdJ3eO8o2jirjruw9phBTo='; ++ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} + SET password_encryption = 'scram-sha-256'; + -- create SCRAM secret +-ALTER ROLE regress_passwd4 PASSWORD 'foo'; ++ALTER ROLE regress_passwd4 PASSWORD NEON_PASSWORD_PLACEHOLDER; + -- already encrypted with MD5, use as it is + CREATE ROLE regress_passwd5 PASSWORD 'md5e73a4b11df52a6068f8b39f90be36023'; ++ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} + -- This looks like a valid SCRAM-SHA-256 secret, but it is not + -- so it should be hashed with SCRAM-SHA-256. + CREATE ROLE regress_passwd6 PASSWORD 'SCRAM-SHA-256$1234'; ++ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} + -- These may look like valid MD5 secrets, but they are not, so they + -- should be hashed with SCRAM-SHA-256. + -- trailing garbage at the end + CREATE ROLE regress_passwd7 PASSWORD 'md5012345678901234567890123456789zz'; ++ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} + -- invalid length + CREATE ROLE regress_passwd8 PASSWORD 'md501234567890123456789012345678901zz'; ++ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} + -- Changing the SCRAM iteration count + SET scram_iterations = 1024; + CREATE ROLE regress_passwd9 PASSWORD 'alterediterationcount'; +@@ -81,63 +87,67 @@ SELECT rolname, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+ + ORDER BY rolname, rolpassword; + rolname | rolpassword_masked + -----------------+--------------------------------------------------- +- regress_passwd1 | md5cd3578025fe2c3d7ed1b9a9b26238b70 +- regress_passwd2 | md5dfa155cadd5f4ad57860162f3fab9cdb ++ regress_passwd1 | NEON_MD5_PLACEHOLDER_regress_passwd1 ++ regress_passwd2 | NEON_MD5_PLACEHOLDER_regress_passwd2 + regress_passwd3 | SCRAM-SHA-256$4096:$: + regress_passwd4 | SCRAM-SHA-256$4096:$: +- regress_passwd5 | md5e73a4b11df52a6068f8b39f90be36023 +- regress_passwd6 | SCRAM-SHA-256$4096:$: +- regress_passwd7 | SCRAM-SHA-256$4096:$: +- regress_passwd8 | SCRAM-SHA-256$4096:$: + regress_passwd9 | SCRAM-SHA-256$1024:$: +-(9 rows) ++(5 rows) + + -- An empty password is not allowed, in any form + CREATE ROLE regress_passwd_empty PASSWORD ''; + NOTICE: empty string is not a valid password, clearing password ++ERROR: Failed to get encrypted password: User "regress_passwd_empty" has no password assigned. 
+ ALTER ROLE regress_passwd_empty PASSWORD 'md585939a5ce845f1a1b620742e3c659e0a'; +-NOTICE: empty string is not a valid password, clearing password ++ERROR: role "regress_passwd_empty" does not exist + ALTER ROLE regress_passwd_empty PASSWORD 'SCRAM-SHA-256$4096:hpFyHTUsSWcR7O9P$LgZFIt6Oqdo27ZFKbZ2nV+vtnYM995pDh9ca6WSi120=:qVV5NeluNfUPkwm7Vqat25RjSPLkGeoZBQs6wVv+um4='; +-NOTICE: empty string is not a valid password, clearing password ++ERROR: role "regress_passwd_empty" does not exist + SELECT rolpassword FROM pg_authid WHERE rolname='regress_passwd_empty'; + rolpassword + ------------- +- +-(1 row) ++(0 rows) + + -- Test with invalid stored and server keys. + -- + -- The first is valid, to act as a control. The others have too long + -- stored/server keys. They will be re-hashed. + CREATE ROLE regress_passwd_sha_len0 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; ++ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} + CREATE ROLE regress_passwd_sha_len1 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96RqwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; ++ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} + CREATE ROLE regress_passwd_sha_len2 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA='; ++ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} + -- Check that the invalid secrets were re-hashed. A re-hashed secret + -- should not contain the original salt. 
+ SELECT rolname, rolpassword not like '%A6xHKoH/494E941doaPOYg==%' as is_rolpassword_rehashed + FROM pg_authid + WHERE rolname LIKE 'regress_passwd_sha_len%' + ORDER BY rolname; +- rolname | is_rolpassword_rehashed +--------------------------+------------------------- +- regress_passwd_sha_len0 | f +- regress_passwd_sha_len1 | t +- regress_passwd_sha_len2 | t +-(3 rows) ++ rolname | is_rolpassword_rehashed ++---------+------------------------- ++(0 rows) + + DROP ROLE regress_passwd1; + DROP ROLE regress_passwd2; + DROP ROLE regress_passwd3; + DROP ROLE regress_passwd4; + DROP ROLE regress_passwd5; ++ERROR: role "regress_passwd5" does not exist + DROP ROLE regress_passwd6; ++ERROR: role "regress_passwd6" does not exist + DROP ROLE regress_passwd7; ++ERROR: role "regress_passwd7" does not exist + DROP ROLE regress_passwd8; ++ERROR: role "regress_passwd8" does not exist + DROP ROLE regress_passwd9; + DROP ROLE regress_passwd_empty; ++ERROR: role "regress_passwd_empty" does not exist + DROP ROLE regress_passwd_sha_len0; ++ERROR: role "regress_passwd_sha_len0" does not exist + DROP ROLE regress_passwd_sha_len1; ++ERROR: role "regress_passwd_sha_len1" does not exist + DROP ROLE regress_passwd_sha_len2; ++ERROR: role "regress_passwd_sha_len2" does not exist + -- all entries should have been removed + SELECT rolname, rolpassword + FROM pg_authid +diff --git a/src/test/regress/expected/privileges.out b/src/test/regress/expected/privileges.out +index fbb0489a4f..2905194e2c 100644 +--- a/src/test/regress/expected/privileges.out ++++ b/src/test/regress/expected/privileges.out +@@ -20,19 +20,19 @@ SELECT lo_unlink(oid) FROM pg_largeobject_metadata WHERE oid >= 1000 AND oid < 3 + + RESET client_min_messages; + -- test proper begins here +-CREATE USER regress_priv_user1; +-CREATE USER regress_priv_user2; +-CREATE USER regress_priv_user3; +-CREATE USER regress_priv_user4; +-CREATE USER regress_priv_user5; +-CREATE USER regress_priv_user5; -- duplicate ++CREATE USER regress_priv_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE USER regress_priv_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE USER regress_priv_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE USER regress_priv_user4 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE USER regress_priv_user5 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE USER regress_priv_user5 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- duplicate + ERROR: role "regress_priv_user5" already exists +-CREATE USER regress_priv_user6; +-CREATE USER regress_priv_user7; +-CREATE USER regress_priv_user8; +-CREATE USER regress_priv_user9; +-CREATE USER regress_priv_user10; +-CREATE ROLE regress_priv_role; ++CREATE USER regress_priv_user6 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE USER regress_priv_user7 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE USER regress_priv_user8 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE USER regress_priv_user9 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE USER regress_priv_user10 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_priv_role PASSWORD NEON_PASSWORD_PLACEHOLDER; + -- circular ADMIN OPTION grants should be disallowed + GRANT regress_priv_user1 TO regress_priv_user2 WITH ADMIN OPTION; + GRANT regress_priv_user1 TO regress_priv_user3 WITH ADMIN OPTION GRANTED BY regress_priv_user2; +@@ -108,11 +108,11 @@ ERROR: role "regress_priv_user5" cannot be dropped because some objects depend + DETAIL: privileges for membership of role regress_priv_user6 in role regress_priv_user1 + DROP ROLE regress_priv_user1, regress_priv_user5; -- ok, despite order + 
-- recreate the roles we just dropped +-CREATE USER regress_priv_user1; +-CREATE USER regress_priv_user2; +-CREATE USER regress_priv_user3; +-CREATE USER regress_priv_user4; +-CREATE USER regress_priv_user5; ++CREATE USER regress_priv_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE USER regress_priv_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE USER regress_priv_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE USER regress_priv_user4 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE USER regress_priv_user5 PASSWORD NEON_PASSWORD_PLACEHOLDER; + GRANT pg_read_all_data TO regress_priv_user6; + GRANT pg_write_all_data TO regress_priv_user7; + GRANT pg_read_all_settings TO regress_priv_user8 WITH ADMIN OPTION; +@@ -145,8 +145,8 @@ REVOKE pg_read_all_settings FROM regress_priv_user8; + DROP USER regress_priv_user10; + DROP USER regress_priv_user9; + DROP USER regress_priv_user8; +-CREATE GROUP regress_priv_group1; +-CREATE GROUP regress_priv_group2 WITH ADMIN regress_priv_user1 USER regress_priv_user2; ++CREATE GROUP regress_priv_group1 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE GROUP regress_priv_group2 WITH ADMIN regress_priv_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER USER regress_priv_user2; + ALTER GROUP regress_priv_group1 ADD USER regress_priv_user4; + GRANT regress_priv_group2 TO regress_priv_user2 GRANTED BY regress_priv_user1; + SET SESSION AUTHORIZATION regress_priv_user1; +@@ -172,12 +172,16 @@ GRANT regress_priv_role TO regress_priv_user1 WITH ADMIN OPTION GRANTED BY regre + ERROR: permission denied to grant privileges as role "regress_priv_role" + DETAIL: The grantor must have the ADMIN option on role "regress_priv_role". + GRANT regress_priv_role TO regress_priv_user1 WITH ADMIN OPTION GRANTED BY CURRENT_ROLE; ++ERROR: permission denied to grant privileges as role "neondb_owner" ++DETAIL: The grantor must have the ADMIN option on role "regress_priv_role". 
+ REVOKE ADMIN OPTION FOR regress_priv_role FROM regress_priv_user1 GRANTED BY foo; -- error + ERROR: role "foo" does not exist + REVOKE ADMIN OPTION FOR regress_priv_role FROM regress_priv_user1 GRANTED BY regress_priv_user2; -- warning, noop + WARNING: role "regress_priv_user1" has not been granted membership in role "regress_priv_role" by role "regress_priv_user2" + REVOKE ADMIN OPTION FOR regress_priv_role FROM regress_priv_user1 GRANTED BY CURRENT_USER; ++WARNING: role "regress_priv_user1" has not been granted membership in role "regress_priv_role" by role "neondb_owner" + REVOKE regress_priv_role FROM regress_priv_user1 GRANTED BY CURRENT_ROLE; ++WARNING: role "regress_priv_user1" has not been granted membership in role "regress_priv_role" by role "neondb_owner" + DROP ROLE regress_priv_role; + SET SESSION AUTHORIZATION regress_priv_user1; + SELECT session_user, current_user; +@@ -1709,7 +1713,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT WITH GRANT OP + + -- security-restricted operations + \c - +-CREATE ROLE regress_sro_user; ++CREATE ROLE regress_sro_user PASSWORD NEON_PASSWORD_PLACEHOLDER; + -- Check that index expressions and predicates are run as the table's owner + -- A dummy index function checking current_user + CREATE FUNCTION sro_ifun(int) RETURNS int AS $$ +@@ -2601,8 +2605,8 @@ drop cascades to function testns.priv_testagg(integer) + drop cascades to function testns.priv_testproc(integer) + -- Change owner of the schema & and rename of new schema owner + \c - +-CREATE ROLE regress_schemauser1 superuser login; +-CREATE ROLE regress_schemauser2 superuser login; ++CREATE ROLE regress_schemauser1 superuser login PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_schemauser2 superuser login PASSWORD NEON_PASSWORD_PLACEHOLDER; + SET SESSION ROLE regress_schemauser1; + CREATE SCHEMA testns; + SELECT nspname, rolname FROM pg_namespace, pg_roles WHERE pg_namespace.nspname = 'testns' AND pg_namespace.nspowner = pg_roles.oid; +@@ -2725,7 +2729,7 @@ DROP USER regress_priv_user7; + DROP USER regress_priv_user8; -- does not exist + ERROR: role "regress_priv_user8" does not exist + -- permissions with LOCK TABLE +-CREATE USER regress_locktable_user; ++CREATE USER regress_locktable_user PASSWORD NEON_PASSWORD_PLACEHOLDER; + CREATE TABLE lock_table (a int); + -- LOCK TABLE and SELECT permission + GRANT SELECT ON lock_table TO regress_locktable_user; +@@ -2807,7 +2811,7 @@ DROP USER regress_locktable_user; + -- pg_backend_memory_contexts. 
+ -- switch to superuser + \c - +-CREATE ROLE regress_readallstats; ++CREATE ROLE regress_readallstats PASSWORD NEON_PASSWORD_PLACEHOLDER; + SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- no + has_table_privilege + --------------------- +@@ -2851,10 +2855,10 @@ RESET ROLE; + -- clean up + DROP ROLE regress_readallstats; + -- test role grantor machinery +-CREATE ROLE regress_group; +-CREATE ROLE regress_group_direct_manager; +-CREATE ROLE regress_group_indirect_manager; +-CREATE ROLE regress_group_member; ++CREATE ROLE regress_group PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_group_direct_manager PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_group_indirect_manager PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_group_member PASSWORD NEON_PASSWORD_PLACEHOLDER; + GRANT regress_group TO regress_group_direct_manager WITH INHERIT FALSE, ADMIN TRUE; + GRANT regress_group_direct_manager TO regress_group_indirect_manager; + SET SESSION AUTHORIZATION regress_group_direct_manager; +@@ -2883,9 +2887,9 @@ DROP ROLE regress_group_direct_manager; + DROP ROLE regress_group_indirect_manager; + DROP ROLE regress_group_member; + -- test SET and INHERIT options with object ownership changes +-CREATE ROLE regress_roleoption_protagonist; +-CREATE ROLE regress_roleoption_donor; +-CREATE ROLE regress_roleoption_recipient; ++CREATE ROLE regress_roleoption_protagonist PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_roleoption_donor PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_roleoption_recipient PASSWORD NEON_PASSWORD_PLACEHOLDER; + CREATE SCHEMA regress_roleoption; + GRANT CREATE, USAGE ON SCHEMA regress_roleoption TO PUBLIC; + GRANT regress_roleoption_donor TO regress_roleoption_protagonist WITH INHERIT TRUE, SET FALSE; +diff --git a/src/test/regress/expected/psql.out b/src/test/regress/expected/psql.out +index 7cd0c27cca..d7a124ed68 100644 +--- a/src/test/regress/expected/psql.out ++++ b/src/test/regress/expected/psql.out +@@ -2857,7 +2857,7 @@ Type | func + -- check conditional am display + \pset expanded off + CREATE SCHEMA tableam_display; +-CREATE ROLE regress_display_role; ++CREATE ROLE regress_display_role PASSWORD NEON_PASSWORD_PLACEHOLDER; + ALTER SCHEMA tableam_display OWNER TO regress_display_role; + SET search_path TO tableam_display; + CREATE ACCESS METHOD heap_psql TYPE TABLE HANDLER heap_tableam_handler; +@@ -4808,7 +4808,7 @@ last error message: division by zero + last error code: 22012 + \unset FETCH_COUNT + create schema testpart; +-create role regress_partitioning_role; ++create role regress_partitioning_role PASSWORD NEON_PASSWORD_PLACEHOLDER; + alter schema testpart owner to regress_partitioning_role; + set role to regress_partitioning_role; + -- run test inside own schema and hide other partitions +@@ -5260,7 +5260,7 @@ reset work_mem; + + -- check \df+ + -- we have to use functions with a predictable owner name, so make a role +-create role regress_psql_user superuser; ++create role regress_psql_user superuser PASSWORD NEON_PASSWORD_PLACEHOLDER; + begin; + set session authorization regress_psql_user; + create function psql_df_internal (float8) +@@ -5544,11 +5544,14 @@ CREATE TEMPORARY TABLE reload_output( + line text + ); + SELECT 1 AS a \g :g_out_file +-COPY reload_output(line) FROM :'g_out_file'; ++\set command '\\COPY reload_output(line) FROM ' :'g_out_file'; ++:command + SELECT 2 AS b\; SELECT 3 AS c\; SELECT 4 AS d \g :g_out_file +-COPY reload_output(line) FROM :'g_out_file'; 
++\set command '\\COPY reload_output(line) FROM ' :'g_out_file'; ++:command + COPY (SELECT 'foo') TO STDOUT \; COPY (SELECT 'bar') TO STDOUT \g :g_out_file +-COPY reload_output(line) FROM :'g_out_file'; ++\set command '\\COPY reload_output(line) FROM ' :'g_out_file'; ++:command + SELECT line FROM reload_output ORDER BY lineno; + line + --------- +@@ -5587,13 +5590,15 @@ SELECT 1 AS a\; SELECT 2 AS b\; SELECT 3 AS c; + -- COPY TO file + -- The data goes to :g_out_file and the status to :o_out_file + \set QUIET false +-COPY (SELECT unique1 FROM onek ORDER BY unique1 LIMIT 10) TO :'g_out_file'; ++\set command '\\COPY (SELECT unique1 FROM onek ORDER BY unique1 LIMIT 10) TO ' :'g_out_file'; ++:command + -- DML command status + UPDATE onek SET unique1 = unique1 WHERE false; + \set QUIET true + \o + -- Check the contents of the files generated. +-COPY reload_output(line) FROM :'g_out_file'; ++\set command '\\COPY reload_output(line) FROM ' :'g_out_file'; ++:command + SELECT line FROM reload_output ORDER BY lineno; + line + ------ +@@ -5610,7 +5615,8 @@ SELECT line FROM reload_output ORDER BY lineno; + (10 rows) + + TRUNCATE TABLE reload_output; +-COPY reload_output(line) FROM :'o_out_file'; ++\set command '\\COPY reload_output(line) FROM ' :'o_out_file'; ++:command + SELECT line FROM reload_output ORDER BY lineno; + line + ---------- +@@ -5647,7 +5653,8 @@ COPY (SELECT 'foo1') TO STDOUT \; COPY (SELECT 'bar1') TO STDOUT; + COPY (SELECT 'foo2') TO STDOUT \; COPY (SELECT 'bar2') TO STDOUT \g :g_out_file + \o + -- Check the contents of the files generated. +-COPY reload_output(line) FROM :'g_out_file'; ++\set command '\\COPY reload_output(line) FROM ' :'g_out_file'; ++:command + SELECT line FROM reload_output ORDER BY lineno; + line + ------ +@@ -5656,7 +5663,8 @@ SELECT line FROM reload_output ORDER BY lineno; + (2 rows) + + TRUNCATE TABLE reload_output; +-COPY reload_output(line) FROM :'o_out_file'; ++\set command '\\COPY reload_output(line) FROM ' :'o_out_file'; ++:command + SELECT line FROM reload_output ORDER BY lineno; + line + ------ +@@ -6619,10 +6627,10 @@ cross-database references are not implemented: "no.such.database"."no.such.schem + \dX "no.such.database"."no.such.schema"."no.such.extended.statistics" + cross-database references are not implemented: "no.such.database"."no.such.schema"."no.such.extended.statistics" + -- check \drg and \du +-CREATE ROLE regress_du_role0; +-CREATE ROLE regress_du_role1; +-CREATE ROLE regress_du_role2; +-CREATE ROLE regress_du_admin; ++CREATE ROLE regress_du_role0 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_du_role1 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_du_role2 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_du_admin PASSWORD NEON_PASSWORD_PLACEHOLDER; + GRANT regress_du_role0 TO regress_du_admin WITH ADMIN TRUE; + GRANT regress_du_role1 TO regress_du_admin WITH ADMIN TRUE; + GRANT regress_du_role2 TO regress_du_admin WITH ADMIN TRUE; +diff --git a/src/test/regress/expected/publication.out b/src/test/regress/expected/publication.out +index 69dc6cfd85..68390cc18a 100644 +--- a/src/test/regress/expected/publication.out ++++ b/src/test/regress/expected/publication.out +@@ -1,9 +1,9 @@ + -- + -- PUBLICATION + -- +-CREATE ROLE regress_publication_user LOGIN SUPERUSER; +-CREATE ROLE regress_publication_user2; +-CREATE ROLE regress_publication_user_dummy LOGIN NOSUPERUSER; ++CREATE ROLE regress_publication_user LOGIN SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_publication_user2 PASSWORD 
NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_publication_user_dummy LOGIN NOSUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; + SET SESSION AUTHORIZATION 'regress_publication_user'; + -- suppress warning that depends on wal_level + SET client_min_messages = 'ERROR'; +@@ -1211,7 +1211,7 @@ ALTER PUBLICATION testpub2 ADD TABLE testpub_tbl1; -- ok + DROP PUBLICATION testpub2; + DROP PUBLICATION testpub3; + SET ROLE regress_publication_user; +-CREATE ROLE regress_publication_user3; ++CREATE ROLE regress_publication_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER; + GRANT regress_publication_user2 TO regress_publication_user3; + SET client_min_messages = 'ERROR'; + CREATE PUBLICATION testpub4 FOR TABLES IN SCHEMA pub_test; +diff --git a/src/test/regress/expected/regproc.out b/src/test/regress/expected/regproc.out +index a9420850b8..bd3b5f312d 100644 +--- a/src/test/regress/expected/regproc.out ++++ b/src/test/regress/expected/regproc.out +@@ -2,7 +2,7 @@ + -- regproc + -- + /* If objects exist, return oids */ +-CREATE ROLE regress_regrole_test; ++CREATE ROLE regress_regrole_test PASSWORD NEON_PASSWORD_PLACEHOLDER; + -- without schemaname + SELECT regoper('||/'); + regoper +diff --git a/src/test/regress/expected/roleattributes.out b/src/test/regress/expected/roleattributes.out +index 5e6969b173..2c4d52237f 100644 +--- a/src/test/regress/expected/roleattributes.out ++++ b/src/test/regress/expected/roleattributes.out +@@ -1,233 +1,233 @@ + -- default for superuser is false +-CREATE ROLE regress_test_def_superuser; +-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_superuser'; +- rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil +-----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- +- regress_test_def_superuser | f | t | f | f | f | f | f | -1 | | ++CREATE ROLE regress_test_def_superuser PASSWORD NEON_PASSWORD_PLACEHOLDER; ++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_superuser'; ++ rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil ++----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- ++ regress_test_def_superuser | f | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | + (1 row) + +-CREATE ROLE regress_test_superuser WITH SUPERUSER; +-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser'; +- rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+-------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- +- regress_test_superuser | t | t | f | f | f | f | f | -1 | | ++CREATE ROLE regress_test_superuser WITH SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; ++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser'; ++ rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil ++------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- ++ regress_test_superuser | t | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | + (1 row) + + ALTER ROLE regress_test_superuser WITH NOSUPERUSER; +-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser'; +- rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil +-------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- +- regress_test_superuser | f | t | f | f | f | f | f | -1 | | ++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser'; ++ rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil ++------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- ++ regress_test_superuser | f | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | + (1 row) + + ALTER ROLE regress_test_superuser WITH SUPERUSER; +-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser'; +- rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil +-------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- +- regress_test_superuser | t | t | f | f | f | f | f | -1 | | ++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser'; ++ rolname | rolsuper | rolinherit | 
rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil ++------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- ++ regress_test_superuser | t | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | + (1 row) + + -- default for inherit is true +-CREATE ROLE regress_test_def_inherit; +-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_inherit'; +- rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil +---------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- +- regress_test_def_inherit | f | t | f | f | f | f | f | -1 | | ++CREATE ROLE regress_test_def_inherit PASSWORD NEON_PASSWORD_PLACEHOLDER; ++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_inherit'; ++ rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil ++--------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- ++ regress_test_def_inherit | f | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | + (1 row) + +-CREATE ROLE regress_test_inherit WITH NOINHERIT; +-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit'; +- rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil +-----------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- +- regress_test_inherit | f | f | f | f | f | f | f | -1 | | ++CREATE ROLE regress_test_inherit WITH NOINHERIT PASSWORD NEON_PASSWORD_PLACEHOLDER; ++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit'; ++ rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil ++----------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- ++ regress_test_inherit | f | f | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | + (1 row) + + ALTER ROLE regress_test_inherit WITH INHERIT; +-SELECT rolname, 
rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit'; +- rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil +-----------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- +- regress_test_inherit | f | t | f | f | f | f | f | -1 | | ++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit'; ++ rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil ++----------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- ++ regress_test_inherit | f | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | + (1 row) + + ALTER ROLE regress_test_inherit WITH NOINHERIT; +-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit'; +- rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil +-----------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- +- regress_test_inherit | f | f | f | f | f | f | f | -1 | | ++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit'; ++ rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil ++----------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- ++ regress_test_inherit | f | f | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | + (1 row) + + -- default for create role is false +-CREATE ROLE regress_test_def_createrole; +-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_createrole'; +- rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil +------------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- +- regress_test_def_createrole | f | t | f | f | f | f | f | -1 | | ++CREATE ROLE regress_test_def_createrole PASSWORD NEON_PASSWORD_PLACEHOLDER; 
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_createrole'; ++ rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil ++-----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- ++ regress_test_def_createrole | f | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | + (1 row) + +-CREATE ROLE regress_test_createrole WITH CREATEROLE; +-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole'; +- rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil +--------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- +- regress_test_createrole | f | t | t | f | f | f | f | -1 | | ++CREATE ROLE regress_test_createrole WITH CREATEROLE PASSWORD NEON_PASSWORD_PLACEHOLDER; ++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole'; ++ rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil ++-------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+--------------- ++ regress_test_createrole | f | t | t | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: | + (1 row) + + ALTER ROLE regress_test_createrole WITH NOCREATEROLE; +-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole'; +- rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil +--------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+--------------- +- regress_test_createrole | f | t | f | f | f | f | f | -1 | | ++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole'; ++ rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil 
++-------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_createrole | f | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: |
+ (1 row)
+
+ ALTER ROLE regress_test_createrole WITH CREATEROLE;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole';
+- rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil
+--------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_createrole | f | t | t | f | f | f | f | -1 | |
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole';
++ rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil
++-------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_createrole | f | t | t | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: |
+ (1 row)
+
+ -- default for create database is false
+-CREATE ROLE regress_test_def_createdb;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_createdb';
+- rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil
+----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_def_createdb | f | t | f | f | f | f | f | -1 | |
++CREATE ROLE regress_test_def_createdb PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_createdb';
++ rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil
++---------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_def_createdb | f | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: |
+ (1 row)
+
+-CREATE ROLE regress_test_createdb WITH CREATEDB;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb';
+- rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil
+------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_createdb | f | t | f | t | f | f | f | -1 | |
++CREATE ROLE regress_test_createdb WITH CREATEDB PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb';
++ rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil
++-----------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_createdb | f | t | f | t | f | f | f | -1 | SCRAM-SHA-256$4096:$: |
+ (1 row)
+
+ ALTER ROLE regress_test_createdb WITH NOCREATEDB;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb';
+- rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil
+------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_createdb | f | t | f | f | f | f | f | -1 | |
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb';
++ rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil
++-----------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_createdb | f | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: |
+ (1 row)
+
+ ALTER ROLE regress_test_createdb WITH CREATEDB;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb';
+- rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil
+------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_createdb | f | t | f | t | f | f | f | -1 | |
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb';
++ rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil
++-----------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_createdb | f | t | f | t | f | f | f | -1 | SCRAM-SHA-256$4096:$: |
+ (1 row)
+
+ -- default for can login is false for role
+-CREATE ROLE regress_test_def_role_canlogin;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_role_canlogin';
+- rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil
+---------------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_def_role_canlogin | f | t | f | f | f | f | f | -1 | |
++CREATE ROLE regress_test_def_role_canlogin PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_role_canlogin';
++ rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil
++--------------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_def_role_canlogin | f | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: |
+ (1 row)
+
+-CREATE ROLE regress_test_role_canlogin WITH LOGIN;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin';
+- rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil
+-----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_role_canlogin | f | t | f | f | t | f | f | -1 | |
++CREATE ROLE regress_test_role_canlogin WITH LOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin';
++ rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil
++----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_role_canlogin | f | t | f | f | t | f | f | -1 | SCRAM-SHA-256$4096:$: |
+ (1 row)
+
+ ALTER ROLE regress_test_role_canlogin WITH NOLOGIN;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin';
+- rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil
+-----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_role_canlogin | f | t | f | f | f | f | f | -1 | |
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin';
++ rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil
++----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_role_canlogin | f | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: |
+ (1 row)
+
+ ALTER ROLE regress_test_role_canlogin WITH LOGIN;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin';
+- rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil
+-----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_role_canlogin | f | t | f | f | t | f | f | -1 | |
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin';
++ rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil
++----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_role_canlogin | f | t | f | f | t | f | f | -1 | SCRAM-SHA-256$4096:$: |
+ (1 row)
+
+ -- default for can login is true for user
+-CREATE USER regress_test_def_user_canlogin;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_user_canlogin';
+- rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil
+---------------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_def_user_canlogin | f | t | f | f | t | f | f | -1 | |
++CREATE USER regress_test_def_user_canlogin PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_user_canlogin';
++ rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil
++--------------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_def_user_canlogin | f | t | f | f | t | f | f | -1 | SCRAM-SHA-256$4096:$: |
+ (1 row)
+
+-CREATE USER regress_test_user_canlogin WITH NOLOGIN;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin';
+- rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil
+-----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_user_canlogin | f | t | f | f | f | f | f | -1 | |
++CREATE USER regress_test_user_canlogin WITH NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin';
++ rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil
++----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_user_canlogin | f | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: |
+ (1 row)
+
+ ALTER USER regress_test_user_canlogin WITH LOGIN;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin';
+- rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil
+-----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_user_canlogin | f | t | f | f | t | f | f | -1 | |
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin';
++ rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil
++----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_user_canlogin | f | t | f | f | t | f | f | -1 | SCRAM-SHA-256$4096:$: |
+ (1 row)
+
+ ALTER USER regress_test_user_canlogin WITH NOLOGIN;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin';
+- rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil
+-----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_user_canlogin | f | t | f | f | f | f | f | -1 | |
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin';
++ rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil
++----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_user_canlogin | f | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: |
+ (1 row)
+
+ -- default for replication is false
+-CREATE ROLE regress_test_def_replication;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_replication';
+- rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil
+-------------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_def_replication | f | t | f | f | f | f | f | -1 | |
++CREATE ROLE regress_test_def_replication PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_replication';
++ rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil
++------------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_def_replication | f | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: |
+ (1 row)
+
+-CREATE ROLE regress_test_replication WITH REPLICATION;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication';
+- rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil
+---------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_replication | f | t | f | f | f | t | f | -1 | |
++CREATE ROLE regress_test_replication WITH REPLICATION PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication';
++ rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil
++--------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_replication | f | t | f | f | f | t | f | -1 | SCRAM-SHA-256$4096:$: |
+ (1 row)
+
+ ALTER ROLE regress_test_replication WITH NOREPLICATION;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication';
+- rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil
+---------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_replication | f | t | f | f | f | f | f | -1 | |
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication';
++ rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil
++--------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_replication | f | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: |
+ (1 row)
+
+ ALTER ROLE regress_test_replication WITH REPLICATION;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication';
+- rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil
+---------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_replication | f | t | f | f | f | t | f | -1 | |
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication';
++ rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil
++--------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_replication | f | t | f | f | f | t | f | -1 | SCRAM-SHA-256$4096:$: |
+ (1 row)
+
+ -- default for bypassrls is false
+-CREATE ROLE regress_test_def_bypassrls;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_bypassrls';
+- rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil
+-----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_def_bypassrls | f | t | f | f | f | f | f | -1 | |
++CREATE ROLE regress_test_def_bypassrls PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_bypassrls';
++ rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil
++----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_def_bypassrls | f | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: |
+ (1 row)
+
+-CREATE ROLE regress_test_bypassrls WITH BYPASSRLS;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls';
+- rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil
+-------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_bypassrls | f | t | f | f | f | f | t | -1 | |
++CREATE ROLE regress_test_bypassrls WITH BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls';
++ rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil
++------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_bypassrls | f | t | f | f | f | f | t | -1 | SCRAM-SHA-256$4096:$: |
+ (1 row)
+
+ ALTER ROLE regress_test_bypassrls WITH NOBYPASSRLS;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls';
+- rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil
+-------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_bypassrls | f | t | f | f | f | f | f | -1 | |
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls';
++ rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil
++------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_bypassrls | f | t | f | f | f | f | f | -1 | SCRAM-SHA-256$4096:$: |
+ (1 row)
+
+ ALTER ROLE regress_test_bypassrls WITH BYPASSRLS;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls';
+- rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil
+-------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_bypassrls | f | t | f | f | f | f | t | -1 | |
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls';
++ rolname | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | regexp_replace | rolvaliduntil
++------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_bypassrls | f | t | f | f | f | f | t | -1 | SCRAM-SHA-256$4096:$: |
+ (1 row)
+
+ -- clean up roles
+diff --git a/src/test/regress/expected/rowsecurity.out b/src/test/regress/expected/rowsecurity.out
+index 97ca9bf72c..b2a7a6f710 100644
+--- a/src/test/regress/expected/rowsecurity.out
++++ b/src/test/regress/expected/rowsecurity.out
+@@ -14,13 +14,13 @@ DROP ROLE IF EXISTS regress_rls_group2;
+ DROP SCHEMA IF EXISTS regress_rls_schema CASCADE;
+ RESET client_min_messages;
+ -- initial setup
+-CREATE USER regress_rls_alice NOLOGIN;
+-CREATE USER regress_rls_bob NOLOGIN;
+-CREATE USER regress_rls_carol NOLOGIN;
+-CREATE USER regress_rls_dave NOLOGIN;
+-CREATE USER regress_rls_exempt_user BYPASSRLS NOLOGIN;
+-CREATE ROLE regress_rls_group1 NOLOGIN;
+-CREATE ROLE regress_rls_group2 NOLOGIN;
++CREATE USER regress_rls_alice NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_rls_bob NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_rls_carol NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_rls_dave NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_rls_exempt_user BYPASSRLS NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_rls_group1 NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_rls_group2 NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ GRANT regress_rls_group1 TO regress_rls_bob;
+ GRANT regress_rls_group2 TO regress_rls_carol;
+ CREATE SCHEMA regress_rls_schema;
+@@ -4352,8 +4352,8 @@ SELECT count(*) = 0 FROM pg_depend
+
+ -- DROP OWNED BY testing
+ RESET SESSION AUTHORIZATION;
+-CREATE ROLE regress_rls_dob_role1;
+-CREATE ROLE regress_rls_dob_role2;
++CREATE ROLE regress_rls_dob_role1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_rls_dob_role2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE TABLE dob_t1 (c1 int);
+ CREATE TABLE dob_t2 (c1 int) PARTITION BY RANGE (c1);
+ CREATE POLICY p1 ON dob_t1 TO regress_rls_dob_role1 USING (true);
+diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
+index 09a255649b..15895f0c53 100644
+--- a/src/test/regress/expected/rules.out
++++ b/src/test/regress/expected/rules.out
+@@ -3708,7 +3708,7 @@ DROP TABLE ruletest2;
+ -- Test non-SELECT rule on security invoker view.
+ -- Should use view owner's permissions.
+ --
+-CREATE USER regress_rule_user1;
++CREATE USER regress_rule_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE TABLE ruletest_t1 (x int);
+ CREATE TABLE ruletest_t2 (x int);
+ CREATE VIEW ruletest_v1 WITH (security_invoker=true) AS
+diff --git a/src/test/regress/expected/security_label.out b/src/test/regress/expected/security_label.out
+index a8e01a6220..5a9cef4ede 100644
+--- a/src/test/regress/expected/security_label.out
++++ b/src/test/regress/expected/security_label.out
+@@ -6,8 +6,8 @@ SET client_min_messages TO 'warning';
+ DROP ROLE IF EXISTS regress_seclabel_user1;
+ DROP ROLE IF EXISTS regress_seclabel_user2;
+ RESET client_min_messages;
+-CREATE USER regress_seclabel_user1 WITH CREATEROLE;
+-CREATE USER regress_seclabel_user2;
++CREATE USER regress_seclabel_user1 WITH CREATEROLE PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_seclabel_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE TABLE seclabel_tbl1 (a int, b text);
+ CREATE TABLE seclabel_tbl2 (x int, y text);
+ CREATE VIEW seclabel_view1 AS SELECT * FROM seclabel_tbl2;
+@@ -19,21 +19,21 @@ ALTER TABLE seclabel_tbl2 OWNER TO regress_seclabel_user2;
+ -- Test of SECURITY LABEL statement without a plugin
+ --
+ SECURITY LABEL ON TABLE seclabel_tbl1 IS 'classified'; -- fail
+-ERROR: no security label providers have been loaded
++ERROR: must specify provider when multiple security label providers have been loaded
+ SECURITY LABEL FOR 'dummy' ON TABLE seclabel_tbl1 IS 'classified'; -- fail
+ ERROR: security label provider "dummy" is not loaded
+ SECURITY LABEL ON TABLE seclabel_tbl1 IS '...invalid label...'; -- fail
+-ERROR: no security label providers have been loaded
++ERROR: must specify provider when multiple security label providers have been loaded
+ SECURITY LABEL ON TABLE seclabel_tbl3 IS 'unclassified'; -- fail
+-ERROR: no security label providers have been loaded
++ERROR: must specify provider when multiple security label providers have been loaded
+ SECURITY LABEL ON ROLE regress_seclabel_user1 IS 'classified'; -- fail
+-ERROR: no security label providers have been loaded
++ERROR: must specify provider when multiple security label providers have been loaded
+ SECURITY LABEL FOR 'dummy' ON ROLE regress_seclabel_user1 IS 'classified'; -- fail
+ ERROR: security label provider "dummy" is not loaded
+ SECURITY LABEL ON ROLE regress_seclabel_user1 IS '...invalid label...'; -- fail
+-ERROR: no security label providers have been loaded
++ERROR: must specify provider when multiple security label providers have been loaded
+ SECURITY LABEL ON ROLE regress_seclabel_user3 IS 'unclassified'; -- fail
+-ERROR: no security label providers have been loaded
++ERROR: must specify provider when multiple security label providers have been loaded
+ -- clean up objects
+ DROP FUNCTION seclabel_four();
+ DROP DOMAIN seclabel_domain;
+diff --git a/src/test/regress/expected/select_into.out b/src/test/regress/expected/select_into.out
+index b79fe9a1c0..e29fab88ab 100644
+--- a/src/test/regress/expected/select_into.out
++++ b/src/test/regress/expected/select_into.out
+@@ -15,7 +15,7 @@ DROP TABLE sitmp1;
+ -- SELECT INTO and INSERT permission, if owner is not allowed to insert.
+ --
+ CREATE SCHEMA selinto_schema;
+-CREATE USER regress_selinto_user;
++CREATE USER regress_selinto_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ ALTER DEFAULT PRIVILEGES FOR ROLE regress_selinto_user
+ REVOKE INSERT ON TABLES FROM regress_selinto_user;
+ GRANT ALL ON SCHEMA selinto_schema TO public;
+diff --git a/src/test/regress/expected/select_views.out b/src/test/regress/expected/select_views.out
+index 1aeed8452b..7d9427d070 100644
+--- a/src/test/regress/expected/select_views.out
++++ b/src/test/regress/expected/select_views.out
+@@ -1250,7 +1250,7 @@ SELECT * FROM toyemp WHERE name = 'sharon';
+ --
+ -- Test for Leaky view scenario
+ --
+-CREATE ROLE regress_alice;
++CREATE ROLE regress_alice PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE FUNCTION f_leak (text)
+ RETURNS bool LANGUAGE 'plpgsql' COST 0.0000001
+ AS 'BEGIN RAISE NOTICE ''f_leak => %'', $1; RETURN true; END';
+diff --git a/src/test/regress/expected/sequence.out b/src/test/regress/expected/sequence.out
+index f02f020542..c9e0fda350 100644
+--- a/src/test/regress/expected/sequence.out
++++ b/src/test/regress/expected/sequence.out
+@@ -22,7 +22,7 @@ CREATE SEQUENCE sequence_testx OWNED BY pg_class_oid_index.oid; -- not a table
+ ERROR: sequence cannot be owned by relation "pg_class_oid_index"
+ DETAIL: This operation is not supported for indexes.
+ CREATE SEQUENCE sequence_testx OWNED BY pg_class.relname; -- not same schema
+-ERROR: sequence must be in same schema as table it is linked to
++ERROR: sequence must have same owner as table it is linked to
+ CREATE TABLE sequence_test_table (a int);
+ CREATE SEQUENCE sequence_testx OWNED BY sequence_test_table.b; -- wrong column
+ ERROR: column "b" of relation "sequence_test_table" does not exist
+@@ -639,7 +639,7 @@ SELECT setval('sequence_test2', 1); -- error
+ ERROR: cannot execute setval() in a read-only transaction
+ ROLLBACK;
+ -- privileges tests
+-CREATE USER regress_seq_user;
++CREATE USER regress_seq_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ -- nextval
+ BEGIN;
+ SET LOCAL SESSION AUTHORIZATION regress_seq_user;
+diff --git a/src/test/regress/expected/stats.out b/src/test/regress/expected/stats.out
+index 94187e59cf..72346e2c71 100644
+--- a/src/test/regress/expected/stats.out
++++ b/src/test/regress/expected/stats.out
+@@ -1283,37 +1283,6 @@ SELECT current_setting('fsync') = 'off'
+ t
+ (1 row)
+
+--- Change the tablespace so that the table is rewritten directly, then SELECT
+--- from it to cause it to be read back into shared buffers.
+-SELECT sum(reads) AS io_sum_shared_before_reads
+- FROM pg_stat_io WHERE context = 'normal' AND object = 'relation' \gset
+--- Do this in a transaction to prevent spurious failures due to concurrent accesses to our newly
+--- rewritten table, e.g. by autovacuum.
+-BEGIN;
+-ALTER TABLE test_io_shared SET TABLESPACE regress_tblspace;
+--- SELECT from the table so that the data is read into shared buffers and
+--- context 'normal', object 'relation' reads are counted.
+-SELECT COUNT(*) FROM test_io_shared;
+- count
+--------
+- 100
+-(1 row)
+-
+-COMMIT;
+-SELECT pg_stat_force_next_flush();
+- pg_stat_force_next_flush
+---------------------------
+-
+-(1 row)
+-
+-SELECT sum(reads) AS io_sum_shared_after_reads
+- FROM pg_stat_io WHERE context = 'normal' AND object = 'relation' \gset
+-SELECT :io_sum_shared_after_reads > :io_sum_shared_before_reads;
+- ?column?
+-----------
+- t
+-(1 row)
+-
+ SELECT sum(hits) AS io_sum_shared_before_hits
+ FROM pg_stat_io WHERE context = 'normal' AND object = 'relation' \gset
+ -- Select from the table again to count hits.
+@@ -1415,6 +1384,7 @@ SELECT :io_sum_local_after_evictions > :io_sum_local_before_evictions,
+ -- local buffers, exercising a different codepath than standard local buffer
+ -- writes.
+ ALTER TABLE test_io_local SET TABLESPACE regress_tblspace;
++ERROR: tablespace "regress_tblspace" does not exist
+ SELECT pg_stat_force_next_flush();
+ pg_stat_force_next_flush
+ --------------------------
+
+ (1 row)
+
+@@ -1426,7 +1396,7 @@ SELECT sum(writes) AS io_sum_local_new_tblspc_writes
+ FROM pg_stat_io WHERE context = 'normal' AND object = 'relation' \gset
+ SELECT :io_sum_local_new_tblspc_writes > :io_sum_local_after_writes;
+ ?column?
+ ----------
+- t
++ f
+ (1 row)
+
+ RESET temp_buffers;
+diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out
+index b4c85613de..d32a9a69ad 100644
+--- a/src/test/regress/expected/stats_ext.out
++++ b/src/test/regress/expected/stats_ext.out
+@@ -70,7 +70,7 @@ DROP TABLE ext_stats_test;
+ CREATE TABLE ab1 (a INTEGER, b INTEGER, c INTEGER);
+ CREATE STATISTICS IF NOT EXISTS ab1_a_b_stats ON a, b FROM ab1;
+ COMMENT ON STATISTICS ab1_a_b_stats IS 'new comment';
+-CREATE ROLE regress_stats_ext;
++CREATE ROLE regress_stats_ext PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION AUTHORIZATION regress_stats_ext;
+ COMMENT ON STATISTICS ab1_a_b_stats IS 'changed comment';
+ ERROR: must be owner of statistics object ab1_a_b_stats
+@@ -3214,7 +3214,7 @@ set search_path to public, stts_s1;
+ stts_s1 | stts_foo | col1, col2 FROM stts_t3 | defined | defined | defined
+ (10 rows)
+
+-create role regress_stats_ext nosuperuser;
++create role regress_stats_ext nosuperuser PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ set role regress_stats_ext;
+ \dX
+ List of extended statistics
+@@ -3237,7 +3237,7 @@ drop schema stts_s1, stts_s2 cascade;
+ drop user regress_stats_ext;
+ reset search_path;
+ -- User with no access
+-CREATE USER regress_stats_user1;
++CREATE USER regress_stats_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ GRANT USAGE ON SCHEMA tststats TO regress_stats_user1;
+ SET SESSION AUTHORIZATION regress_stats_user1;
+ SELECT * FROM tststats.priv_test_tbl; -- Permission denied
+diff --git a/src/test/regress/expected/subscription.out b/src/test/regress/expected/subscription.out
+index b15eddbff3..e9ba4568eb 100644
+--- a/src/test/regress/expected/subscription.out
++++ b/src/test/regress/expected/subscription.out
+@@ -1,10 +1,10 @@
+ --
+ -- SUBSCRIPTION
+ --
+-CREATE ROLE regress_subscription_user LOGIN SUPERUSER;
+-CREATE ROLE regress_subscription_user2;
+-CREATE ROLE regress_subscription_user3 IN ROLE pg_create_subscription;
+-CREATE ROLE regress_subscription_user_dummy LOGIN NOSUPERUSER;
++CREATE ROLE regress_subscription_user LOGIN SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_subscription_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_subscription_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_create_subscription;
++CREATE ROLE regress_subscription_user_dummy LOGIN NOSUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION AUTHORIZATION 'regress_subscription_user';
+ -- fail - no publications
+ CREATE SUBSCRIPTION regress_testsub CONNECTION 'foo';
+diff --git a/src/test/regress/expected/test_setup.out b/src/test/regress/expected/test_setup.out
+index 5d9e6bf12b..c5fddfdca6 100644
+--- a/src/test/regress/expected/test_setup.out
++++ b/src/test/regress/expected/test_setup.out
+@@ -21,6 +21,7 @@ GRANT ALL ON SCHEMA public TO public;
+ -- Create a tablespace we can use in tests.
+ SET allow_in_place_tablespaces = true;
+ CREATE TABLESPACE regress_tblspace LOCATION '';
++ERROR: CREATE TABLESPACE is not supported on Neon
+ --
+ -- These tables have traditionally been referenced by many tests,
+ -- so create and populate them. Insert only non-error values here.
+@@ -111,7 +112,8 @@ CREATE TABLE onek (
+ string4 name
+ );
+ \set filename :abs_srcdir '/data/onek.data'
+-COPY onek FROM :'filename';
++\set command '\\copy onek FROM ' :'filename';
++:command
+ VACUUM ANALYZE onek;
+ CREATE TABLE onek2 AS SELECT * FROM onek;
+ VACUUM ANALYZE onek2;
+@@ -134,7 +136,8 @@ CREATE TABLE tenk1 (
+ string4 name
+ );
+ \set filename :abs_srcdir '/data/tenk.data'
+-COPY tenk1 FROM :'filename';
++\set command '\\copy tenk1 FROM ' :'filename';
++:command
+ VACUUM ANALYZE tenk1;
+ CREATE TABLE tenk2 AS SELECT * FROM tenk1;
+ VACUUM ANALYZE tenk2;
+@@ -144,20 +147,23 @@ CREATE TABLE person (
+ location point
+ );
+ \set filename :abs_srcdir '/data/person.data'
+-COPY person FROM :'filename';
++\set command '\\copy person FROM ' :'filename';
++:command
+ VACUUM ANALYZE person;
+ CREATE TABLE emp (
+ salary int4,
+ manager name
+ ) INHERITS (person);
+ \set filename :abs_srcdir '/data/emp.data'
+-COPY emp FROM :'filename';
++\set command '\\copy emp FROM ' :'filename';
++:command
+ VACUUM ANALYZE emp;
+ CREATE TABLE student (
+ gpa float8
+ ) INHERITS (person);
+ \set filename :abs_srcdir '/data/student.data'
+-COPY student FROM :'filename';
++\set command '\\copy student FROM ' :'filename';
++:command
+ VACUUM ANALYZE student;
+ CREATE TABLE stud_emp (
+ percent int4
+@@ -166,14 +172,16 @@ NOTICE: merging multiple inherited definitions of column "name"
+ NOTICE: merging multiple inherited definitions of column "age"
+ NOTICE: merging multiple inherited definitions of column "location"
+ \set filename :abs_srcdir '/data/stud_emp.data'
+-COPY stud_emp FROM :'filename';
++\set command '\\copy stud_emp FROM ' :'filename';
++:command
+ VACUUM ANALYZE stud_emp;
+ CREATE TABLE road (
+ name text,
+ thepath path
+ );
+ \set filename :abs_srcdir '/data/streets.data'
+-COPY road FROM :'filename';
++\set command '\\copy road FROM ' :'filename';
++:command
+ VACUUM ANALYZE road;
+ CREATE TABLE ihighway () INHERITS (road);
+ INSERT INTO ihighway
+diff --git a/src/test/regress/expected/tsearch.out b/src/test/regress/expected/tsearch.out
+index 9fad6c8b04..a1b8e82389 100644
+--- a/src/test/regress/expected/tsearch.out
++++ b/src/test/regress/expected/tsearch.out
+@@ -63,7 +63,8 @@ CREATE TABLE test_tsvector(
+ a tsvector
+ );
+ \set filename :abs_srcdir '/data/tsearch.data'
+-COPY test_tsvector FROM :'filename';
++\set command '\\copy test_tsvector FROM ' :'filename';
++:command
+ ANALYZE test_tsvector;
+ -- test basic text search behavior without indexes, then with
+ SELECT count(*) FROM test_tsvector WHERE a @@ 'wr|qh';
+diff --git a/src/test/regress/expected/updatable_views.out b/src/test/regress/expected/updatable_views.out
+index ba46c32029..eac3017bac 100644
+--- a/src/test/regress/expected/updatable_views.out
++++ b/src/test/regress/expected/updatable_views.out
+@@ -999,9 +999,9 @@ NOTICE: drop cascades to 2 other objects
+ DETAIL: drop cascades to view rw_view1
+ drop cascades to function rw_view1_aa(rw_view1)
+ -- permissions checks
+-CREATE USER regress_view_user1;
+-CREATE USER regress_view_user2;
+-CREATE USER regress_view_user3;
++CREATE USER regress_view_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_view_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_view_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION AUTHORIZATION regress_view_user1;
+ CREATE TABLE base_tbl(a int, b text, c float);
+ INSERT INTO base_tbl VALUES (1, 'Row 1', 1.0);
+@@ -3094,8 +3094,8 @@ DETAIL: View columns that are not columns of their base relation are not updata
+ drop view uv_iocu_view;
+ drop table uv_iocu_tab;
+ -- ON CONFLICT DO UPDATE permissions checks
+-create user regress_view_user1;
+-create user regress_view_user2;
++create user regress_view_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++create user regress_view_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ set session authorization regress_view_user1;
+ create table base_tbl(a int unique, b text, c float);
+ insert into base_tbl values (1,'xxx',1.0);
+diff --git a/src/test/regress/expected/update.out b/src/test/regress/expected/update.out
+index c809f88f54..d1d57852d4 100644
+--- a/src/test/regress/expected/update.out
++++ b/src/test/regress/expected/update.out
+@@ -602,7 +602,7 @@ DROP FUNCTION func_parted_mod_b();
+ -- RLS policies with update-row-movement
+ -----------------------------------------
+ ALTER TABLE range_parted ENABLE ROW LEVEL SECURITY;
+-CREATE USER regress_range_parted_user;
++CREATE USER regress_range_parted_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ GRANT ALL ON range_parted, mintab TO regress_range_parted_user;
+ CREATE POLICY seeall ON range_parted AS PERMISSIVE FOR SELECT USING (true);
+ CREATE POLICY policy_range_parted ON range_parted for UPDATE USING (true) WITH CHECK (c % 2 = 0);
+diff --git a/src/test/regress/expected/vacuum.out b/src/test/regress/expected/vacuum.out
+index 4aaf4f025d..40a339758a 100644
+--- a/src/test/regress/expected/vacuum.out
++++ b/src/test/regress/expected/vacuum.out
+@@ -433,7 +433,7 @@ CREATE TABLE vacowned (a int);
+ CREATE TABLE vacowned_parted (a int) PARTITION BY LIST (a);
+ CREATE TABLE vacowned_part1 PARTITION OF vacowned_parted FOR VALUES IN (1);
+ CREATE TABLE vacowned_part2 PARTITION OF vacowned_parted FOR VALUES IN (2);
+-CREATE ROLE regress_vacuum;
++CREATE ROLE regress_vacuum PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET ROLE regress_vacuum;
+ -- Simple table
+ VACUUM vacowned;
+diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule
+index 3d14bf4e4f..87f351b1d1 100644
+--- a/src/test/regress/parallel_schedule
++++ b/src/test/regress/parallel_schedule
+@@ -130,4 +130,4 @@ test: fast_default
+
+ # run tablespace test at the end because it drops the tablespace created during
+ # setup that other tests may use.
+-test: tablespace
++#test: tablespace
+diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql
+index f51726e8ed..8854104eff 100644
+--- a/src/test/regress/sql/aggregates.sql
++++ b/src/test/regress/sql/aggregates.sql
+@@ -15,7 +15,8 @@ CREATE TABLE aggtest (
+ );
+
+ \set filename :abs_srcdir '/data/agg.data'
+-COPY aggtest FROM :'filename';
++\set command '\\copy aggtest FROM ' :'filename';
++:command
+
+ ANALYZE aggtest;
+
+diff --git a/src/test/regress/sql/alter_generic.sql b/src/test/regress/sql/alter_generic.sql
+index de58d268d3..9d38df7f42 100644
+--- a/src/test/regress/sql/alter_generic.sql
++++ b/src/test/regress/sql/alter_generic.sql
+@@ -22,9 +22,9 @@ DROP ROLE IF EXISTS regress_alter_generic_user3;
+
+ RESET client_min_messages;
+
+-CREATE USER regress_alter_generic_user3;
+-CREATE USER regress_alter_generic_user2;
+-CREATE USER regress_alter_generic_user1 IN ROLE regress_alter_generic_user3;
++CREATE USER regress_alter_generic_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_alter_generic_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_alter_generic_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE regress_alter_generic_user3;
+
+ CREATE SCHEMA alt_nsp1;
+ CREATE SCHEMA alt_nsp2;
+@@ -316,7 +316,7 @@ DROP OPERATOR FAMILY alt_opf4 USING btree;
+
+ -- Should fail. Need to be SUPERUSER to do ALTER OPERATOR FAMILY .. ADD / DROP
+ BEGIN TRANSACTION;
+-CREATE ROLE regress_alter_generic_user5 NOSUPERUSER;
++CREATE ROLE regress_alter_generic_user5 PASSWORD NEON_PASSWORD_PLACEHOLDER NOSUPERUSER;
+ CREATE OPERATOR FAMILY alt_opf5 USING btree;
+ SET ROLE regress_alter_generic_user5;
+ ALTER OPERATOR FAMILY alt_opf5 USING btree ADD OPERATOR 1 < (int4, int2), FUNCTION 1 btint42cmp(int4, int2);
+@@ -326,7 +326,7 @@ ROLLBACK;
+
+ -- Should fail. Need rights to namespace for ALTER OPERATOR FAMILY .. ADD / DROP
+ BEGIN TRANSACTION;
+-CREATE ROLE regress_alter_generic_user6;
++CREATE ROLE regress_alter_generic_user6 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE SCHEMA alt_nsp6;
+ REVOKE ALL ON SCHEMA alt_nsp6 FROM regress_alter_generic_user6;
+ CREATE OPERATOR FAMILY alt_nsp6.alt_opf6 USING btree;
+diff --git a/src/test/regress/sql/alter_operator.sql b/src/test/regress/sql/alter_operator.sql
+index fd40370165..ca8055e06d 100644
+--- a/src/test/regress/sql/alter_operator.sql
++++ b/src/test/regress/sql/alter_operator.sql
+@@ -87,7 +87,7 @@ ALTER OPERATOR & (bit, bit) SET ("Restrict" = _int_contsel, "Join" = _int_contjo
+ --
+ -- Test permission check. Must be owner to ALTER OPERATOR.
+ --
+-CREATE USER regress_alter_op_user;
++CREATE USER regress_alter_op_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION AUTHORIZATION regress_alter_op_user;
+
+ ALTER OPERATOR === (boolean, boolean) SET (RESTRICT = NONE);
+diff --git a/src/test/regress/sql/alter_table.sql b/src/test/regress/sql/alter_table.sql
+index d2845abc97..a0719b8d0e 100644
+--- a/src/test/regress/sql/alter_table.sql
++++ b/src/test/regress/sql/alter_table.sql
+@@ -7,7 +7,7 @@ SET client_min_messages TO 'warning';
+ DROP ROLE IF EXISTS regress_alter_table_user1;
+ RESET client_min_messages;
+
+-CREATE USER regress_alter_table_user1;
++CREATE USER regress_alter_table_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+
+ --
+ -- add attribute
+@@ -2397,8 +2397,8 @@ DROP TABLE fail_part;
+ ALTER TABLE list_parted ATTACH PARTITION nonexistent FOR VALUES IN (1);
+
+ -- check ownership of the source table
+-CREATE ROLE regress_test_me;
+-CREATE ROLE regress_test_not_me;
++CREATE ROLE regress_test_me PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_test_not_me PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE TABLE not_owned_by_me (LIKE list_parted);
+ ALTER TABLE not_owned_by_me OWNER TO regress_test_not_me;
+ SET SESSION AUTHORIZATION regress_test_me;
+diff --git a/src/test/regress/sql/arrays.sql b/src/test/regress/sql/arrays.sql
+index e414fa560d..79a75a0e57 100644
+--- a/src/test/regress/sql/arrays.sql
++++ b/src/test/regress/sql/arrays.sql
+@@ -22,7 +22,8 @@ CREATE TABLE array_op_test (
+ );
+
+ \set filename :abs_srcdir '/data/array.data'
+-COPY array_op_test FROM :'filename';
++\set command '\\copy array_op_test FROM ' :'filename';
++:command
+ ANALYZE array_op_test;
+
+ --
+diff --git a/src/test/regress/sql/btree_index.sql b/src/test/regress/sql/btree_index.sql
+index 239f4a4755..f29d87bdff 100644
+--- a/src/test/regress/sql/btree_index.sql
++++ b/src/test/regress/sql/btree_index.sql
+@@ -26,16 +26,20 @@ CREATE TABLE bt_f8_heap (
+ );
+
+ \set filename :abs_srcdir '/data/desc.data'
+-COPY bt_i4_heap FROM :'filename';
++\set command '\\copy bt_i4_heap FROM ' :'filename';
++:command
+
+ \set filename :abs_srcdir '/data/hash.data'
+-COPY bt_name_heap FROM :'filename';
++\set command '\\copy bt_name_heap FROM ' :'filename';
++:command
+
+ \set filename :abs_srcdir '/data/desc.data'
+-COPY bt_txt_heap FROM :'filename';
++\set command '\\copy bt_txt_heap FROM ' :'filename';
++:command
+
+ \set filename :abs_srcdir '/data/hash.data'
+-COPY bt_f8_heap FROM :'filename';
++\set command '\\copy bt_f8_heap FROM ' :'filename';
++:command
+
+ ANALYZE bt_i4_heap;
+ ANALYZE bt_name_heap;
+diff --git a/src/test/regress/sql/cluster.sql b/src/test/regress/sql/cluster.sql
+index 6cb9c926c0..5e689e4062 100644
+--- a/src/test/regress/sql/cluster.sql
++++ b/src/test/regress/sql/cluster.sql
+@@ -108,7 +108,7 @@ WHERE pg_class.oid=indexrelid
+ CLUSTER pg_toast.pg_toast_826 USING pg_toast_826_index;
+
+ -- Verify that clustering all tables does in fact cluster the right ones
+-CREATE USER regress_clstr_user;
++CREATE USER regress_clstr_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE TABLE clstr_1 (a INT PRIMARY KEY);
+ CREATE TABLE clstr_2 (a INT PRIMARY KEY);
+ CREATE TABLE clstr_3 (a INT PRIMARY KEY);
+@@ -233,7 +233,7 @@ DROP TABLE clstrpart;
+ CREATE TABLE ptnowner(i int unique) PARTITION BY LIST (i);
+ CREATE INDEX ptnowner_i_idx ON ptnowner(i);
+ CREATE TABLE ptnowner1 PARTITION OF ptnowner FOR VALUES IN (1);
+-CREATE ROLE regress_ptnowner;
++CREATE ROLE regress_ptnowner PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE TABLE ptnowner2 PARTITION OF ptnowner FOR VALUES IN (2);
+ ALTER TABLE ptnowner1 OWNER TO regress_ptnowner;
+ ALTER TABLE ptnowner OWNER TO regress_ptnowner;
+diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql
+index 3db9e25913..c66d5aa2c2 100644
+--- a/src/test/regress/sql/collate.icu.utf8.sql
++++ b/src/test/regress/sql/collate.icu.utf8.sql
+@@ -353,7 +353,7 @@ reset enable_seqscan;
+
+ -- schema manipulation commands
+
+-CREATE ROLE regress_test_role;
++CREATE ROLE regress_test_role PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE SCHEMA test_schema;
+
+ -- We need to do this this way to cope with varying names for encodings:
+diff --git a/src/test/regress/sql/constraints.sql b/src/test/regress/sql/constraints.sql
+index e3e3bea709..fa86ddc326 100644
+--- a/src/test/regress/sql/constraints.sql
++++ b/src/test/regress/sql/constraints.sql
+@@ -243,12 +243,14 @@ CREATE TABLE COPY_TBL (x INT, y TEXT, z INT,
+ CHECK (x > 3 AND y <> 'check failed' AND x < 7 ));
+
+ \set filename :abs_srcdir '/data/constro.data'
+-COPY COPY_TBL FROM :'filename';
++\set command '\\copy COPY_TBL FROM ' :'filename';
++:command
+
+ SELECT * FROM COPY_TBL;
+
+ \set filename :abs_srcdir '/data/constrf.data'
+-COPY COPY_TBL FROM :'filename';
++\set command '\\copy COPY_TBL FROM ' :'filename';
++:command
+
+ SELECT * FROM COPY_TBL;
+
+@@ -599,7 +601,7 @@ DROP TABLE deferred_excl;
+
+ -- Comments
+ -- Setup a low-level role to enforce non-superuser checks.
+-CREATE ROLE regress_constraint_comments;
++CREATE ROLE regress_constraint_comments PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION AUTHORIZATION regress_constraint_comments;
+
+ CREATE TABLE constraint_comments_tbl (a int CONSTRAINT the_constraint CHECK (a > 0));
+@@ -621,7 +623,7 @@ COMMENT ON CONSTRAINT the_constraint ON DOMAIN constraint_comments_dom IS NULL;
+
+ -- unauthorized user
+ RESET SESSION AUTHORIZATION;
+-CREATE ROLE regress_constraint_comments_noaccess;
++CREATE ROLE regress_constraint_comments_noaccess PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION AUTHORIZATION regress_constraint_comments_noaccess;
+ COMMENT ON CONSTRAINT the_constraint ON constraint_comments_tbl IS 'no, the comment';
+ COMMENT ON CONSTRAINT the_constraint ON DOMAIN constraint_comments_dom IS 'no, another comment';
+diff --git a/src/test/regress/sql/conversion.sql b/src/test/regress/sql/conversion.sql
+index 9a65fca91f..58431a3056 100644
+--- a/src/test/regress/sql/conversion.sql
++++ b/src/test/regress/sql/conversion.sql
+@@ -12,7 +12,7 @@ CREATE FUNCTION test_enc_conversion(bytea, name, name, bool, validlen OUT int, r
+ AS :'regresslib', 'test_enc_conversion'
+ LANGUAGE C STRICT;
+
+-CREATE USER regress_conversion_user WITH NOCREATEDB NOCREATEROLE;
++CREATE USER regress_conversion_user WITH NOCREATEDB NOCREATEROLE PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION AUTHORIZATION regress_conversion_user;
+ CREATE CONVERSION myconv FOR 'LATIN1' TO 'UTF8' FROM iso8859_1_to_utf8;
+ --
+diff --git a/src/test/regress/sql/copy.sql b/src/test/regress/sql/copy.sql
+index 43d2e906dd..6c993d70f0 100644
+--- a/src/test/regress/sql/copy.sql
++++ b/src/test/regress/sql/copy.sql
+@@ -20,11 +20,13 @@ insert into copytest values('Mac',E'abc\rdef',3);
+ insert into copytest values(E'esc\\ape',E'a\\r\\\r\\\n\\nb',4);
+
+ \set filename :abs_builddir '/results/copytest.csv'
+-copy copytest to :'filename' csv;
++\set command '\\copy copytest to ' :'filename' csv;
++:command
+
+ create temp table copytest2 (like copytest);
+
+-copy copytest2 from :'filename' csv;
++\set command '\\copy copytest2 from ' :'filename' csv;
++:command
+
+ select * from copytest except select * from copytest2;
+
+@@ -32,9 +34,11 @@ truncate copytest2;
+
+ --- same test but with an escape char different from quote char
+
+-copy copytest to :'filename' csv quote '''' escape E'\\';
++\set command '\\copy copytest to ' :'filename' ' csv quote ' '\'\'\'\'' ' escape ' 'E\'' '\\\\\'';
++:command
+
+-copy copytest2 from :'filename' csv quote '''' escape E'\\';
++\set command '\\copy copytest2 from ' :'filename' ' csv quote ' '\'\'\'\'' ' escape ' 'E\'' '\\\\\'';
++:command
+
+ select * from copytest except select * from copytest2;
+
+@@ -86,16 +90,19 @@ insert into parted_copytest select x,2,'Two' from generate_series(1001,1010) x;
+ insert into parted_copytest select x,1,'One' from generate_series(1011,1020) x;
+
+ \set filename :abs_builddir '/results/parted_copytest.csv'
+-copy (select * from parted_copytest order by a) to :'filename';
++\set command '\\copy (select * from parted_copytest order by a) to ' :'filename';
++:command
+
+ truncate parted_copytest;
+
+-copy parted_copytest from :'filename';
++\set command '\\copy parted_copytest from ' :'filename';
++:command
+
+ -- Ensure COPY FREEZE errors for partitioned tables.
+ begin;
+ truncate parted_copytest;
+-copy parted_copytest from :'filename' (freeze);
++\set command '\\copy parted_copytest from ' :'filename' (freeze);
++:command
+ rollback;
+
+ select tableoid::regclass,count(*),sum(a) from parted_copytest
+@@ -115,7 +122,8 @@ create trigger part_ins_trig
+ for each row
+ execute procedure part_ins_func();
+
+-copy parted_copytest from :'filename';
++\set command '\\copy parted_copytest from ' :'filename';
++:command
+
+ select tableoid::regclass,count(*),sum(a) from parted_copytest
+ group by tableoid order by tableoid::regclass::name;
+@@ -124,7 +132,8 @@ truncate table parted_copytest;
+ create index on parted_copytest (b);
+ drop trigger part_ins_trig on parted_copytest_a2;
+
+-copy parted_copytest from stdin;
++\set command '\\copy parted_copytest from ' stdin;
++:command
+ 1 1 str1
+ 2 2 str2
+ \.
+@@ -191,8 +200,8 @@ bill 20 (11,10) 1000 sharon
+ -- Generate COPY FROM report with FILE, with some excluded tuples.
+ truncate tab_progress_reporting;
+ \set filename :abs_srcdir '/data/emp.data'
+-copy tab_progress_reporting from :'filename'
+- where (salary < 2000);
++\set command '\\copy tab_progress_reporting from ' :'filename' 'where (salary < 2000)';
++:command
+
+ drop trigger check_after_tab_progress_reporting on tab_progress_reporting;
+ drop function notice_after_tab_progress_reporting();
+@@ -311,7 +320,8 @@ CREATE TABLE parted_si_p_odd PARTITION OF parted_si FOR VALUES IN (1);
+ -- https://postgr.es/m/18130-7a86a7356a75209d%40postgresql.org
+ -- https://postgr.es/m/257696.1695670946%40sss.pgh.pa.us
+ \set filename :abs_srcdir '/data/desc.data'
+-COPY parted_si(id, data) FROM :'filename';
++\set command '\\COPY parted_si(id, data) FROM ' :'filename';
++:command
+
+ -- An earlier bug (see commit b1ecb9b3fcf) could end up using a buffer from
+ -- the wrong partition. This test is *not* guaranteed to trigger that bug, but
+diff --git a/src/test/regress/sql/copy2.sql b/src/test/regress/sql/copy2.sql
+index d759635068..d58e50dcc5 100644
+--- a/src/test/regress/sql/copy2.sql
++++ b/src/test/regress/sql/copy2.sql
+@@ -365,8 +365,8 @@ copy check_con_tbl from stdin;
+ select * from check_con_tbl;
+
+ -- test with RLS enabled.
+-CREATE ROLE regress_rls_copy_user; +-CREATE ROLE regress_rls_copy_user_colperms; ++CREATE ROLE regress_rls_copy_user PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_rls_copy_user_colperms PASSWORD NEON_PASSWORD_PLACEHOLDER; + CREATE TABLE rls_t1 (a int, b int, c int); + + COPY rls_t1 (a, b, c) from stdin; +diff --git a/src/test/regress/sql/create_function_sql.sql b/src/test/regress/sql/create_function_sql.sql +index 89e9af3a49..2b86fe2285 100644 +--- a/src/test/regress/sql/create_function_sql.sql ++++ b/src/test/regress/sql/create_function_sql.sql +@@ -6,7 +6,7 @@ + + -- All objects made in this test are in temp_func_test schema + +-CREATE USER regress_unpriv_user; ++CREATE USER regress_unpriv_user PASSWORD NEON_PASSWORD_PLACEHOLDER; + + CREATE SCHEMA temp_func_test; + GRANT ALL ON SCHEMA temp_func_test TO public; +diff --git a/src/test/regress/sql/create_index.sql b/src/test/regress/sql/create_index.sql +index d49ce9f300..47fa813bc8 100644 +--- a/src/test/regress/sql/create_index.sql ++++ b/src/test/regress/sql/create_index.sql +@@ -71,7 +71,8 @@ CREATE TABLE fast_emp4000 ( + ); + + \set filename :abs_srcdir '/data/rect.data' +-COPY slow_emp4000 FROM :'filename'; ++\set command '\\copy slow_emp4000 FROM ' :'filename'; ++:command + + INSERT INTO fast_emp4000 SELECT * FROM slow_emp4000; + +@@ -269,7 +270,8 @@ CREATE TABLE array_index_op_test ( + ); + + \set filename :abs_srcdir '/data/array.data' +-COPY array_index_op_test FROM :'filename'; ++\set command '\\copy array_index_op_test FROM ' :'filename'; ++:command + ANALYZE array_index_op_test; + + SELECT * FROM array_index_op_test WHERE i = '{NULL}' ORDER BY seqno; +@@ -1246,7 +1248,7 @@ END; + REINDEX SCHEMA CONCURRENTLY schema_to_reindex; + + -- Failure for unauthorized user +-CREATE ROLE regress_reindexuser NOLOGIN; ++CREATE ROLE regress_reindexuser NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; + SET SESSION ROLE regress_reindexuser; + REINDEX SCHEMA schema_to_reindex; + -- Permission failures with toast tables and indexes (pg_authid here) +diff --git a/src/test/regress/sql/create_procedure.sql b/src/test/regress/sql/create_procedure.sql +index 069a3727ce..faeeb3f744 100644 +--- a/src/test/regress/sql/create_procedure.sql ++++ b/src/test/regress/sql/create_procedure.sql +@@ -255,7 +255,7 @@ DROP PROCEDURE nonexistent(); + + -- privileges + +-CREATE USER regress_cp_user1; ++CREATE USER regress_cp_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; + GRANT INSERT ON cp_test TO regress_cp_user1; + REVOKE EXECUTE ON PROCEDURE ptest1(text) FROM PUBLIC; + SET ROLE regress_cp_user1; +diff --git a/src/test/regress/sql/create_role.sql b/src/test/regress/sql/create_role.sql +index 4491a28a8a..3045434865 100644 +--- a/src/test/regress/sql/create_role.sql ++++ b/src/test/regress/sql/create_role.sql +@@ -1,20 +1,20 @@ + -- ok, superuser can create users with any set of privileges +-CREATE ROLE regress_role_super SUPERUSER; +-CREATE ROLE regress_role_admin CREATEDB CREATEROLE REPLICATION BYPASSRLS; ++CREATE ROLE regress_role_super SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_role_admin CREATEDB CREATEROLE REPLICATION BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER; + GRANT CREATE ON DATABASE regression TO regress_role_admin WITH GRANT OPTION; +-CREATE ROLE regress_role_limited_admin CREATEROLE; +-CREATE ROLE regress_role_normal; ++CREATE ROLE regress_role_limited_admin CREATEROLE PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_role_normal PASSWORD NEON_PASSWORD_PLACEHOLDER; + + -- fail, CREATEROLE user can't give away 
role attributes without having them + SET SESSION AUTHORIZATION regress_role_limited_admin; +-CREATE ROLE regress_nosuch_superuser SUPERUSER; +-CREATE ROLE regress_nosuch_replication_bypassrls REPLICATION BYPASSRLS; +-CREATE ROLE regress_nosuch_replication REPLICATION; +-CREATE ROLE regress_nosuch_bypassrls BYPASSRLS; +-CREATE ROLE regress_nosuch_createdb CREATEDB; ++CREATE ROLE regress_nosuch_superuser SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_nosuch_replication_bypassrls REPLICATION BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_nosuch_replication REPLICATION PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_nosuch_bypassrls BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_nosuch_createdb CREATEDB PASSWORD NEON_PASSWORD_PLACEHOLDER; + + -- ok, can create a role without any special attributes +-CREATE ROLE regress_role_limited; ++CREATE ROLE regress_role_limited PASSWORD NEON_PASSWORD_PLACEHOLDER; + + -- fail, can't give it in any of the restricted attributes + ALTER ROLE regress_role_limited SUPERUSER; +@@ -25,10 +25,10 @@ DROP ROLE regress_role_limited; + + -- ok, can give away these role attributes if you have them + SET SESSION AUTHORIZATION regress_role_admin; +-CREATE ROLE regress_replication_bypassrls REPLICATION BYPASSRLS; +-CREATE ROLE regress_replication REPLICATION; +-CREATE ROLE regress_bypassrls BYPASSRLS; +-CREATE ROLE regress_createdb CREATEDB; ++CREATE ROLE regress_replication_bypassrls REPLICATION BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_replication REPLICATION PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_bypassrls BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_createdb CREATEDB PASSWORD NEON_PASSWORD_PLACEHOLDER; + + -- ok, can toggle these role attributes off and on if you have them + ALTER ROLE regress_replication NOREPLICATION; +@@ -43,52 +43,52 @@ ALTER ROLE regress_createdb SUPERUSER; + ALTER ROLE regress_createdb NOSUPERUSER; + + -- ok, having CREATEROLE is enough to create users with these privileges +-CREATE ROLE regress_createrole CREATEROLE NOINHERIT; ++CREATE ROLE regress_createrole CREATEROLE NOINHERIT PASSWORD NEON_PASSWORD_PLACEHOLDER; + GRANT CREATE ON DATABASE regression TO regress_createrole WITH GRANT OPTION; +-CREATE ROLE regress_login LOGIN; +-CREATE ROLE regress_inherit INHERIT; +-CREATE ROLE regress_connection_limit CONNECTION LIMIT 5; +-CREATE ROLE regress_encrypted_password ENCRYPTED PASSWORD 'foo'; +-CREATE ROLE regress_password_null PASSWORD NULL; ++CREATE ROLE regress_login LOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_inherit INHERIT PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_connection_limit CONNECTION LIMIT 5 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_encrypted_password ENCRYPTED PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_password_null PASSWORD NEON_PASSWORD_PLACEHOLDER; + + -- ok, backwards compatible noise words should be ignored +-CREATE ROLE regress_noiseword SYSID 12345; ++CREATE ROLE regress_noiseword SYSID 12345 PASSWORD NEON_PASSWORD_PLACEHOLDER; + + -- fail, cannot grant membership in superuser role +-CREATE ROLE regress_nosuch_super IN ROLE regress_role_super; ++CREATE ROLE regress_nosuch_super IN ROLE regress_role_super PASSWORD NEON_PASSWORD_PLACEHOLDER; + + -- fail, database owner cannot have members +-CREATE ROLE regress_nosuch_dbowner IN ROLE pg_database_owner; ++CREATE ROLE regress_nosuch_dbowner IN ROLE pg_database_owner PASSWORD 
NEON_PASSWORD_PLACEHOLDER; + + -- ok, can grant other users into a role + CREATE ROLE regress_inroles ROLE + regress_role_super, regress_createdb, regress_createrole, regress_login, +- regress_inherit, regress_connection_limit, regress_encrypted_password, regress_password_null; ++ regress_inherit, regress_connection_limit, regress_encrypted_password, regress_password_null PASSWORD NEON_PASSWORD_PLACEHOLDER; + + -- fail, cannot grant a role into itself +-CREATE ROLE regress_nosuch_recursive ROLE regress_nosuch_recursive; ++CREATE ROLE regress_nosuch_recursive ROLE regress_nosuch_recursive PASSWORD NEON_PASSWORD_PLACEHOLDER; + + -- ok, can grant other users into a role with admin option + CREATE ROLE regress_adminroles ADMIN + regress_role_super, regress_createdb, regress_createrole, regress_login, +- regress_inherit, regress_connection_limit, regress_encrypted_password, regress_password_null; ++ regress_inherit, regress_connection_limit, regress_encrypted_password, regress_password_null PASSWORD NEON_PASSWORD_PLACEHOLDER; + + -- fail, cannot grant a role into itself with admin option +-CREATE ROLE regress_nosuch_admin_recursive ADMIN regress_nosuch_admin_recursive; ++CREATE ROLE regress_nosuch_admin_recursive ADMIN regress_nosuch_admin_recursive PASSWORD NEON_PASSWORD_PLACEHOLDER; + + -- fail, regress_createrole does not have CREATEDB privilege + SET SESSION AUTHORIZATION regress_createrole; + CREATE DATABASE regress_nosuch_db; + + -- ok, regress_createrole can create new roles +-CREATE ROLE regress_plainrole; ++CREATE ROLE regress_plainrole PASSWORD NEON_PASSWORD_PLACEHOLDER; + + -- ok, roles with CREATEROLE can create new roles with it +-CREATE ROLE regress_rolecreator CREATEROLE; ++CREATE ROLE regress_rolecreator CREATEROLE PASSWORD NEON_PASSWORD_PLACEHOLDER; + + -- ok, roles with CREATEROLE can create new roles with different role + -- attributes, including CREATEROLE +-CREATE ROLE regress_hasprivs CREATEROLE LOGIN INHERIT CONNECTION LIMIT 5; ++CREATE ROLE regress_hasprivs CREATEROLE LOGIN INHERIT CONNECTION LIMIT 5 PASSWORD NEON_PASSWORD_PLACEHOLDER; + + -- ok, we should be able to modify a role we created + COMMENT ON ROLE regress_hasprivs IS 'some comment'; +@@ -123,7 +123,7 @@ REASSIGN OWNED BY regress_tenant TO regress_createrole; + + -- ok, create a role with a value for createrole_self_grant + SET createrole_self_grant = 'set, inherit'; +-CREATE ROLE regress_tenant2; ++CREATE ROLE regress_tenant2 PASSWORD NEON_PASSWORD_PLACEHOLDER; + GRANT CREATE ON DATABASE regression TO regress_tenant2; + + -- ok, regress_tenant2 can create objects within the database +@@ -150,16 +150,16 @@ ALTER TABLE tenant2_table OWNER TO regress_tenant2; + DROP TABLE tenant2_table; + + -- fail, CREATEROLE is not enough to create roles in privileged roles +-CREATE ROLE regress_read_all_data IN ROLE pg_read_all_data; +-CREATE ROLE regress_write_all_data IN ROLE pg_write_all_data; +-CREATE ROLE regress_monitor IN ROLE pg_monitor; +-CREATE ROLE regress_read_all_settings IN ROLE pg_read_all_settings; +-CREATE ROLE regress_read_all_stats IN ROLE pg_read_all_stats; +-CREATE ROLE regress_stat_scan_tables IN ROLE pg_stat_scan_tables; +-CREATE ROLE regress_read_server_files IN ROLE pg_read_server_files; +-CREATE ROLE regress_write_server_files IN ROLE pg_write_server_files; +-CREATE ROLE regress_execute_server_program IN ROLE pg_execute_server_program; +-CREATE ROLE regress_signal_backend IN ROLE pg_signal_backend; ++CREATE ROLE regress_read_all_data PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_read_all_data; 
++CREATE ROLE regress_write_all_data PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_write_all_data; ++CREATE ROLE regress_monitor PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_monitor; ++CREATE ROLE regress_read_all_settings PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_read_all_settings; ++CREATE ROLE regress_read_all_stats PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_read_all_stats; ++CREATE ROLE regress_stat_scan_tables PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_stat_scan_tables; ++CREATE ROLE regress_read_server_files PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_read_server_files; ++CREATE ROLE regress_write_server_files PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_write_server_files; ++CREATE ROLE regress_execute_server_program PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_execute_server_program; ++CREATE ROLE regress_signal_backend PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_signal_backend; + + -- fail, role still owns database objects + DROP ROLE regress_tenant; +diff --git a/src/test/regress/sql/create_schema.sql b/src/test/regress/sql/create_schema.sql +index 1b7064247a..be5b662ce1 100644 +--- a/src/test/regress/sql/create_schema.sql ++++ b/src/test/regress/sql/create_schema.sql +@@ -4,7 +4,7 @@ + + -- Schema creation with elements. + +-CREATE ROLE regress_create_schema_role SUPERUSER; ++CREATE ROLE regress_create_schema_role SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; + + -- Cases where schema creation fails as objects are qualified with a schema + -- that does not match with what's expected. +diff --git a/src/test/regress/sql/create_view.sql b/src/test/regress/sql/create_view.sql +index 3a78be1b0c..617d2dc8d6 100644 +--- a/src/test/regress/sql/create_view.sql ++++ b/src/test/regress/sql/create_view.sql +@@ -23,7 +23,8 @@ CREATE TABLE real_city ( + ); + + \set filename :abs_srcdir '/data/real_city.data' +-COPY real_city FROM :'filename'; ++\set command '\\copy real_city FROM ' :'filename'; ++:command + ANALYZE real_city; + + SELECT * +diff --git a/src/test/regress/sql/database.sql b/src/test/regress/sql/database.sql +index 0367c0e37a..a23b98c4bd 100644 +--- a/src/test/regress/sql/database.sql ++++ b/src/test/regress/sql/database.sql +@@ -1,8 +1,6 @@ + CREATE DATABASE regression_tbd + ENCODING utf8 LC_COLLATE "C" LC_CTYPE "C" TEMPLATE template0; + ALTER DATABASE regression_tbd RENAME TO regression_utf8; +-ALTER DATABASE regression_utf8 SET TABLESPACE regress_tblspace; +-ALTER DATABASE regression_utf8 RESET TABLESPACE; + ALTER DATABASE regression_utf8 CONNECTION_LIMIT 123; + + -- Test PgDatabaseToastTable. Doing this with GRANT would be slow. 
+diff --git a/src/test/regress/sql/dependency.sql b/src/test/regress/sql/dependency.sql +index 2559c62d0b..06c3aa1a36 100644 +--- a/src/test/regress/sql/dependency.sql ++++ b/src/test/regress/sql/dependency.sql +@@ -2,10 +2,10 @@ + -- DEPENDENCIES + -- + +-CREATE USER regress_dep_user; +-CREATE USER regress_dep_user2; +-CREATE USER regress_dep_user3; +-CREATE GROUP regress_dep_group; ++CREATE USER regress_dep_user PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE USER regress_dep_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE USER regress_dep_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE GROUP regress_dep_group PASSWORD NEON_PASSWORD_PLACEHOLDER; + + CREATE TABLE deptest (f1 serial primary key, f2 text); + +@@ -45,9 +45,9 @@ DROP TABLE deptest; + DROP USER regress_dep_user3; + + -- Test DROP OWNED +-CREATE USER regress_dep_user0; +-CREATE USER regress_dep_user1; +-CREATE USER regress_dep_user2; ++CREATE USER regress_dep_user0 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE USER regress_dep_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE USER regress_dep_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; + SET SESSION AUTHORIZATION regress_dep_user0; + -- permission denied + DROP OWNED BY regress_dep_user1; +diff --git a/src/test/regress/sql/drop_if_exists.sql b/src/test/regress/sql/drop_if_exists.sql +index ac6168b91f..4270062ec7 100644 +--- a/src/test/regress/sql/drop_if_exists.sql ++++ b/src/test/regress/sql/drop_if_exists.sql +@@ -86,9 +86,9 @@ DROP DOMAIN test_domain_exists; + --- role/user/group + --- + +-CREATE USER regress_test_u1; +-CREATE ROLE regress_test_r1; +-CREATE GROUP regress_test_g1; ++CREATE USER regress_test_u1 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_test_r1 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE GROUP regress_test_g1 PASSWORD NEON_PASSWORD_PLACEHOLDER; + + DROP USER regress_test_u2; + +diff --git a/src/test/regress/sql/equivclass.sql b/src/test/regress/sql/equivclass.sql +index 247b0a3105..bf018fd3a1 100644 +--- a/src/test/regress/sql/equivclass.sql ++++ b/src/test/regress/sql/equivclass.sql +@@ -230,7 +230,7 @@ set enable_mergejoin = off; + alter table ec1 enable row level security; + create policy p1 on ec1 using (f1 < '5'::int8alias1); + +-create user regress_user_ectest; ++create user regress_user_ectest PASSWORD NEON_PASSWORD_PLACEHOLDER; + grant select on ec0 to regress_user_ectest; + grant select on ec1 to regress_user_ectest; + +diff --git a/src/test/regress/sql/event_trigger.sql b/src/test/regress/sql/event_trigger.sql +index 1aeaddbe71..89a410ec4a 100644 +--- a/src/test/regress/sql/event_trigger.sql ++++ b/src/test/regress/sql/event_trigger.sql +@@ -86,7 +86,7 @@ create event trigger regress_event_trigger2 on ddl_command_start + comment on event trigger regress_event_trigger is 'test comment'; + + -- drop as non-superuser should fail +-create role regress_evt_user; ++create role regress_evt_user PASSWORD NEON_PASSWORD_PLACEHOLDER; + set role regress_evt_user; + create event trigger regress_event_trigger_noperms on ddl_command_start + execute procedure test_event_trigger(); +diff --git a/src/test/regress/sql/foreign_data.sql b/src/test/regress/sql/foreign_data.sql +index aa147b14a9..370e0dd570 100644 +--- a/src/test/regress/sql/foreign_data.sql ++++ b/src/test/regress/sql/foreign_data.sql +@@ -22,14 +22,14 @@ DROP ROLE IF EXISTS regress_foreign_data_user, regress_test_role, regress_test_r + + RESET client_min_messages; + +-CREATE ROLE regress_foreign_data_user LOGIN SUPERUSER; ++CREATE ROLE regress_foreign_data_user LOGIN SUPERUSER PASSWORD 
NEON_PASSWORD_PLACEHOLDER; + SET SESSION AUTHORIZATION 'regress_foreign_data_user'; + +-CREATE ROLE regress_test_role; +-CREATE ROLE regress_test_role2; +-CREATE ROLE regress_test_role_super SUPERUSER; +-CREATE ROLE regress_test_indirect; +-CREATE ROLE regress_unprivileged_role; ++CREATE ROLE regress_test_role PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_test_role2 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_test_role_super SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_test_indirect PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_unprivileged_role PASSWORD NEON_PASSWORD_PLACEHOLDER; + + CREATE FOREIGN DATA WRAPPER dummy; + COMMENT ON FOREIGN DATA WRAPPER dummy IS 'useless'; +diff --git a/src/test/regress/sql/foreign_key.sql b/src/test/regress/sql/foreign_key.sql +index 22e177f89b..7138d5e1d4 100644 +--- a/src/test/regress/sql/foreign_key.sql ++++ b/src/test/regress/sql/foreign_key.sql +@@ -1418,7 +1418,7 @@ ALTER TABLE fk_partitioned_fk ATTACH PARTITION fk_partitioned_fk_2 + -- leave these tables around intentionally + + -- test the case when the referenced table is owned by a different user +-create role regress_other_partitioned_fk_owner; ++create role regress_other_partitioned_fk_owner PASSWORD NEON_PASSWORD_PLACEHOLDER; + grant references on fk_notpartitioned_pk to regress_other_partitioned_fk_owner; + set role regress_other_partitioned_fk_owner; + create table other_partitioned_fk(a int, b int) partition by list (a); +diff --git a/src/test/regress/sql/generated.sql b/src/test/regress/sql/generated.sql +index 298f6b3aa8..f058913ae0 100644 +--- a/src/test/regress/sql/generated.sql ++++ b/src/test/regress/sql/generated.sql +@@ -263,7 +263,7 @@ ALTER TABLE gtest10a DROP COLUMN b; + INSERT INTO gtest10a (a) VALUES (1); + + -- privileges +-CREATE USER regress_user11; ++CREATE USER regress_user11 PASSWORD NEON_PASSWORD_PLACEHOLDER; + + CREATE TABLE gtest11s (a int PRIMARY KEY, b int, c int GENERATED ALWAYS AS (b * 2) STORED); + INSERT INTO gtest11s VALUES (1, 10), (2, 20); +diff --git a/src/test/regress/sql/guc.sql b/src/test/regress/sql/guc.sql +index dc79761955..a9ead75349 100644 +--- a/src/test/regress/sql/guc.sql ++++ b/src/test/regress/sql/guc.sql +@@ -188,7 +188,7 @@ PREPARE foo AS SELECT 1; + LISTEN foo_event; + SET vacuum_cost_delay = 13; + CREATE TEMP TABLE tmp_foo (data text) ON COMMIT DELETE ROWS; +-CREATE ROLE regress_guc_user; ++CREATE ROLE regress_guc_user PASSWORD NEON_PASSWORD_PLACEHOLDER; + SET SESSION AUTHORIZATION regress_guc_user; + -- look changes + SELECT pg_listening_channels(); +diff --git a/src/test/regress/sql/hash_index.sql b/src/test/regress/sql/hash_index.sql +index 527024f710..de49c0b85f 100644 +--- a/src/test/regress/sql/hash_index.sql ++++ b/src/test/regress/sql/hash_index.sql +@@ -26,10 +26,14 @@ CREATE TABLE hash_f8_heap ( + ); + + \set filename :abs_srcdir '/data/hash.data' +-COPY hash_i4_heap FROM :'filename'; +-COPY hash_name_heap FROM :'filename'; +-COPY hash_txt_heap FROM :'filename'; +-COPY hash_f8_heap FROM :'filename'; ++\set command '\\copy hash_i4_heap FROM ' :'filename'; ++:command ++\set command '\\copy hash_name_heap FROM ' :'filename'; ++:command ++\set command '\\copy hash_txt_heap FROM ' :'filename'; ++:command ++\set command '\\copy hash_f8_heap FROM ' :'filename'; ++:command + + -- the data in this file has a lot of duplicates in the index key + -- fields, leading to long bucket chains and lots of table expansion. 
+diff --git a/src/test/regress/sql/identity.sql b/src/test/regress/sql/identity.sql +index 91d2e443b4..241c93f373 100644 +--- a/src/test/regress/sql/identity.sql ++++ b/src/test/regress/sql/identity.sql +@@ -287,7 +287,7 @@ ALTER TABLE itest7 ALTER COLUMN a RESTART; + ALTER TABLE itest7 ALTER COLUMN a DROP IDENTITY; + + -- privileges +-CREATE USER regress_identity_user1; ++CREATE USER regress_identity_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; + CREATE TABLE itest8 (a int GENERATED ALWAYS AS IDENTITY, b text); + GRANT SELECT, INSERT ON itest8 TO regress_identity_user1; + SET ROLE regress_identity_user1; +diff --git a/src/test/regress/sql/inherit.sql b/src/test/regress/sql/inherit.sql +index fe699c54d5..bdd5993f45 100644 +--- a/src/test/regress/sql/inherit.sql ++++ b/src/test/regress/sql/inherit.sql +@@ -950,7 +950,7 @@ create index on permtest_parent (left(c, 3)); + insert into permtest_parent + select 1, 'a', left(fipshash(i::text), 5) from generate_series(0, 100) i; + analyze permtest_parent; +-create role regress_no_child_access; ++create role regress_no_child_access PASSWORD NEON_PASSWORD_PLACEHOLDER; + revoke all on permtest_grandchild from regress_no_child_access; + grant select on permtest_parent to regress_no_child_access; + set session authorization regress_no_child_access; +diff --git a/src/test/regress/sql/insert.sql b/src/test/regress/sql/insert.sql +index 2b086eeb6d..913d8a0aed 100644 +--- a/src/test/regress/sql/insert.sql ++++ b/src/test/regress/sql/insert.sql +@@ -513,7 +513,7 @@ drop table mlparted5; + create table key_desc (a int, b int) partition by list ((a+0)); + create table key_desc_1 partition of key_desc for values in (1) partition by range (b); + +-create user regress_insert_other_user; ++create user regress_insert_other_user PASSWORD NEON_PASSWORD_PLACEHOLDER; + grant select (a) on key_desc_1 to regress_insert_other_user; + grant insert on key_desc to regress_insert_other_user; + +@@ -597,7 +597,7 @@ insert into brtrigpartcon1 values (1, 'hi there'); + -- check that the message shows the appropriate column description in a + -- situation where the partitioned table is not the primary ModifyTable node + create table inserttest3 (f1 text default 'foo', f2 text default 'bar', f3 int); +-create role regress_coldesc_role; ++create role regress_coldesc_role PASSWORD NEON_PASSWORD_PLACEHOLDER; + grant insert on inserttest3 to regress_coldesc_role; + grant insert on brtrigpartcon to regress_coldesc_role; + revoke select on brtrigpartcon from regress_coldesc_role; +diff --git a/src/test/regress/sql/jsonb.sql b/src/test/regress/sql/jsonb.sql +index 6dae715afd..aa320ba7be 100644 +--- a/src/test/regress/sql/jsonb.sql ++++ b/src/test/regress/sql/jsonb.sql +@@ -6,7 +6,8 @@ CREATE TABLE testjsonb ( + ); + + \set filename :abs_srcdir '/data/jsonb.data' +-COPY testjsonb FROM :'filename'; ++\set command '\\copy testjsonb FROM ' :'filename'; ++:command + + -- Strings. + SELECT '""'::jsonb; -- OK. 
+diff --git a/src/test/regress/sql/largeobject.sql b/src/test/regress/sql/largeobject.sql +index a4aee02e3a..8839c9496a 100644 +--- a/src/test/regress/sql/largeobject.sql ++++ b/src/test/regress/sql/largeobject.sql +@@ -10,7 +10,7 @@ + SET bytea_output TO escape; + + -- Test ALTER LARGE OBJECT OWNER +-CREATE ROLE regress_lo_user; ++CREATE ROLE regress_lo_user PASSWORD NEON_PASSWORD_PLACEHOLDER; + SELECT lo_create(42); + ALTER LARGE OBJECT 42 OWNER TO regress_lo_user; + +@@ -189,7 +189,8 @@ SELECT lo_unlink(loid) from lotest_stash_values; + TRUNCATE lotest_stash_values; + + \set filename :abs_srcdir '/data/tenk.data' +-INSERT INTO lotest_stash_values (loid) SELECT lo_import(:'filename'); ++\lo_import :filename ++INSERT INTO lotest_stash_values (loid) VALUES (:LASTOID); + + BEGIN; + UPDATE lotest_stash_values SET fd=lo_open(loid, CAST(x'20000' | x'40000' AS integer)); +@@ -219,8 +220,8 @@ SELECT lo_close(fd) FROM lotest_stash_values; + END; + + \set filename :abs_builddir '/results/lotest.txt' +-SELECT lo_export(loid, :'filename') FROM lotest_stash_values; +- ++SELECT loid FROM lotest_stash_values \gset ++\lo_export :loid, :filename + \lo_import :filename + + \set newloid :LASTOID +diff --git a/src/test/regress/sql/lock.sql b/src/test/regress/sql/lock.sql +index b88488c6d0..78b31e6dd3 100644 +--- a/src/test/regress/sql/lock.sql ++++ b/src/test/regress/sql/lock.sql +@@ -19,7 +19,7 @@ CREATE VIEW lock_view3 AS SELECT * from lock_view2; + CREATE VIEW lock_view4 AS SELECT (select a from lock_tbl1a limit 1) from lock_tbl1; + CREATE VIEW lock_view5 AS SELECT * from lock_tbl1 where a in (select * from lock_tbl1a); + CREATE VIEW lock_view6 AS SELECT * from (select * from lock_tbl1) sub; +-CREATE ROLE regress_rol_lock1; ++CREATE ROLE regress_rol_lock1 PASSWORD NEON_PASSWORD_PLACEHOLDER; + ALTER ROLE regress_rol_lock1 SET search_path = lock_schema1; + GRANT USAGE ON SCHEMA lock_schema1 TO regress_rol_lock1; + +diff --git a/src/test/regress/sql/matview.sql b/src/test/regress/sql/matview.sql +index 235123de1e..58e73cec5d 100644 +--- a/src/test/regress/sql/matview.sql ++++ b/src/test/regress/sql/matview.sql +@@ -209,7 +209,7 @@ SELECT * FROM mvtest_mv_v; + DROP TABLE mvtest_v CASCADE; + + -- make sure running as superuser works when MV owned by another role (bug #11208) +-CREATE ROLE regress_user_mvtest; ++CREATE ROLE regress_user_mvtest PASSWORD NEON_PASSWORD_PLACEHOLDER; + SET ROLE regress_user_mvtest; + -- this test case also checks for ambiguity in the queries issued by + -- refresh_by_match_merge(), by choosing column names that intentionally +@@ -264,7 +264,7 @@ ROLLBACK; + + -- INSERT privileges if relation owner is not allowed to insert. 
+ CREATE SCHEMA matview_schema; +-CREATE USER regress_matview_user; ++CREATE USER regress_matview_user PASSWORD NEON_PASSWORD_PLACEHOLDER; + ALTER DEFAULT PRIVILEGES FOR ROLE regress_matview_user + REVOKE INSERT ON TABLES FROM regress_matview_user; + GRANT ALL ON SCHEMA matview_schema TO public; +diff --git a/src/test/regress/sql/merge.sql b/src/test/regress/sql/merge.sql +index 2a220a248f..91a404d51e 100644 +--- a/src/test/regress/sql/merge.sql ++++ b/src/test/regress/sql/merge.sql +@@ -2,9 +2,9 @@ + -- MERGE + -- + +-CREATE USER regress_merge_privs; +-CREATE USER regress_merge_no_privs; +-CREATE USER regress_merge_none; ++CREATE USER regress_merge_privs PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE USER regress_merge_no_privs PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE USER regress_merge_none PASSWORD NEON_PASSWORD_PLACEHOLDER; + + DROP TABLE IF EXISTS target; + DROP TABLE IF EXISTS source; +diff --git a/src/test/regress/sql/misc.sql b/src/test/regress/sql/misc.sql +index 165a2e175f..08d7096e2c 100644 +--- a/src/test/regress/sql/misc.sql ++++ b/src/test/regress/sql/misc.sql +@@ -74,22 +74,26 @@ DROP TABLE tmp; + -- copy + -- + \set filename :abs_builddir '/results/onek.data' +-COPY onek TO :'filename'; ++\set command '\\copy onek TO ' :'filename'; ++:command + + CREATE TEMP TABLE onek_copy (LIKE onek); + +-COPY onek_copy FROM :'filename'; ++\set command '\\copy onek_copy FROM ' :'filename'; ++:command + + SELECT * FROM onek EXCEPT ALL SELECT * FROM onek_copy; + + SELECT * FROM onek_copy EXCEPT ALL SELECT * FROM onek; + + \set filename :abs_builddir '/results/stud_emp.data' +-COPY BINARY stud_emp TO :'filename'; ++\set command '\\COPY BINARY stud_emp TO ' :'filename'; ++:command + + CREATE TEMP TABLE stud_emp_copy (LIKE stud_emp); + +-COPY BINARY stud_emp_copy FROM :'filename'; ++\set command '\\COPY BINARY stud_emp_copy FROM ' :'filename'; ++:command + + SELECT * FROM stud_emp_copy; + +diff --git a/src/test/regress/sql/misc_functions.sql b/src/test/regress/sql/misc_functions.sql +index b57f01f3e9..3e05aa6400 100644 +--- a/src/test/regress/sql/misc_functions.sql ++++ b/src/test/regress/sql/misc_functions.sql +@@ -82,7 +82,7 @@ SELECT pg_log_backend_memory_contexts(pg_backend_pid()); + SELECT pg_log_backend_memory_contexts(pid) FROM pg_stat_activity + WHERE backend_type = 'checkpointer'; + +-CREATE ROLE regress_log_memory; ++CREATE ROLE regress_log_memory PASSWORD NEON_PASSWORD_PLACEHOLDER; + + SELECT has_function_privilege('regress_log_memory', + 'pg_log_backend_memory_contexts(integer)', 'EXECUTE'); -- no +@@ -169,7 +169,7 @@ select count(*) > 0 from + -- + -- Test replication slot directory functions + -- +-CREATE ROLE regress_slot_dir_funcs; ++CREATE ROLE regress_slot_dir_funcs PASSWORD NEON_PASSWORD_PLACEHOLDER; + -- Not available by default. 
+ SELECT has_function_privilege('regress_slot_dir_funcs', + 'pg_ls_logicalsnapdir()', 'EXECUTE'); +diff --git a/src/test/regress/sql/object_address.sql b/src/test/regress/sql/object_address.sql +index 1a6c61f49d..1c31ac6a53 100644 +--- a/src/test/regress/sql/object_address.sql ++++ b/src/test/regress/sql/object_address.sql +@@ -7,7 +7,7 @@ SET client_min_messages TO 'warning'; + DROP ROLE IF EXISTS regress_addr_user; + RESET client_min_messages; + +-CREATE USER regress_addr_user; ++CREATE USER regress_addr_user PASSWORD NEON_PASSWORD_PLACEHOLDER; + + -- Test generic object addressing/identification functions + CREATE SCHEMA addr_nsp; +diff --git a/src/test/regress/sql/password.sql b/src/test/regress/sql/password.sql +index 53e86b0b6c..f07cf1ec54 100644 +--- a/src/test/regress/sql/password.sql ++++ b/src/test/regress/sql/password.sql +@@ -10,11 +10,11 @@ SET password_encryption = 'scram-sha-256'; -- ok + + -- consistency of password entries + SET password_encryption = 'md5'; +-CREATE ROLE regress_passwd1 PASSWORD 'role_pwd1'; +-CREATE ROLE regress_passwd2 PASSWORD 'role_pwd2'; ++CREATE ROLE regress_passwd1 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER; + SET password_encryption = 'scram-sha-256'; +-CREATE ROLE regress_passwd3 PASSWORD 'role_pwd3'; +-CREATE ROLE regress_passwd4 PASSWORD NULL; ++CREATE ROLE regress_passwd3 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd4 PASSWORD NEON_PASSWORD_PLACEHOLDER; + + -- check list of created entries + -- +@@ -42,14 +42,14 @@ ALTER ROLE regress_passwd2_new RENAME TO regress_passwd2; + SET password_encryption = 'md5'; + + -- encrypt with MD5 +-ALTER ROLE regress_passwd2 PASSWORD 'foo'; ++ALTER ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER; + -- already encrypted, use as they are + ALTER ROLE regress_passwd1 PASSWORD 'md5cd3578025fe2c3d7ed1b9a9b26238b70'; + ALTER ROLE regress_passwd3 PASSWORD 'SCRAM-SHA-256$4096:VLK4RMaQLCvNtQ==$6YtlR4t69SguDiwFvbVgVZtuz6gpJQQqUMZ7IQJK5yI=:ps75jrHeYU4lXCcXI4O8oIdJ3eO8o2jirjruw9phBTo='; + + SET password_encryption = 'scram-sha-256'; + -- create SCRAM secret +-ALTER ROLE regress_passwd4 PASSWORD 'foo'; ++ALTER ROLE regress_passwd4 PASSWORD NEON_PASSWORD_PLACEHOLDER; + -- already encrypted with MD5, use as it is + CREATE ROLE regress_passwd5 PASSWORD 'md5e73a4b11df52a6068f8b39f90be36023'; + +diff --git a/src/test/regress/sql/privileges.sql b/src/test/regress/sql/privileges.sql +index 3f68cafcd1..004b26831d 100644 +--- a/src/test/regress/sql/privileges.sql ++++ b/src/test/regress/sql/privileges.sql +@@ -24,18 +24,18 @@ RESET client_min_messages; + + -- test proper begins here + +-CREATE USER regress_priv_user1; +-CREATE USER regress_priv_user2; +-CREATE USER regress_priv_user3; +-CREATE USER regress_priv_user4; +-CREATE USER regress_priv_user5; +-CREATE USER regress_priv_user5; -- duplicate +-CREATE USER regress_priv_user6; +-CREATE USER regress_priv_user7; +-CREATE USER regress_priv_user8; +-CREATE USER regress_priv_user9; +-CREATE USER regress_priv_user10; +-CREATE ROLE regress_priv_role; ++CREATE USER regress_priv_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE USER regress_priv_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE USER regress_priv_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE USER regress_priv_user4 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE USER regress_priv_user5 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE USER regress_priv_user5 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- duplicate ++CREATE USER regress_priv_user6 
PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE USER regress_priv_user7 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE USER regress_priv_user8 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE USER regress_priv_user9 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE USER regress_priv_user10 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_priv_role PASSWORD NEON_PASSWORD_PLACEHOLDER; + + -- circular ADMIN OPTION grants should be disallowed + GRANT regress_priv_user1 TO regress_priv_user2 WITH ADMIN OPTION; +@@ -84,11 +84,11 @@ DROP ROLE regress_priv_user5; -- should fail, dependency + DROP ROLE regress_priv_user1, regress_priv_user5; -- ok, despite order + + -- recreate the roles we just dropped +-CREATE USER regress_priv_user1; +-CREATE USER regress_priv_user2; +-CREATE USER regress_priv_user3; +-CREATE USER regress_priv_user4; +-CREATE USER regress_priv_user5; ++CREATE USER regress_priv_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE USER regress_priv_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE USER regress_priv_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE USER regress_priv_user4 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE USER regress_priv_user5 PASSWORD NEON_PASSWORD_PLACEHOLDER; + + GRANT pg_read_all_data TO regress_priv_user6; + GRANT pg_write_all_data TO regress_priv_user7; +@@ -130,8 +130,8 @@ DROP USER regress_priv_user10; + DROP USER regress_priv_user9; + DROP USER regress_priv_user8; + +-CREATE GROUP regress_priv_group1; +-CREATE GROUP regress_priv_group2 WITH ADMIN regress_priv_user1 USER regress_priv_user2; ++CREATE GROUP regress_priv_group1 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE GROUP regress_priv_group2 WITH ADMIN regress_priv_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER USER regress_priv_user2; + + ALTER GROUP regress_priv_group1 ADD USER regress_priv_user4; + +@@ -1124,7 +1124,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT WITH GRANT OP + + -- security-restricted operations + \c - +-CREATE ROLE regress_sro_user; ++CREATE ROLE regress_sro_user PASSWORD NEON_PASSWORD_PLACEHOLDER; + + -- Check that index expressions and predicates are run as the table's owner + +@@ -1620,8 +1620,8 @@ DROP SCHEMA testns CASCADE; + -- Change owner of the schema & and rename of new schema owner + \c - + +-CREATE ROLE regress_schemauser1 superuser login; +-CREATE ROLE regress_schemauser2 superuser login; ++CREATE ROLE regress_schemauser1 superuser login PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_schemauser2 superuser login PASSWORD NEON_PASSWORD_PLACEHOLDER; + + SET SESSION ROLE regress_schemauser1; + CREATE SCHEMA testns; +@@ -1715,7 +1715,7 @@ DROP USER regress_priv_user8; -- does not exist + + + -- permissions with LOCK TABLE +-CREATE USER regress_locktable_user; ++CREATE USER regress_locktable_user PASSWORD NEON_PASSWORD_PLACEHOLDER; + CREATE TABLE lock_table (a int); + + -- LOCK TABLE and SELECT permission +@@ -1803,7 +1803,7 @@ DROP USER regress_locktable_user; + -- switch to superuser + \c - + +-CREATE ROLE regress_readallstats; ++CREATE ROLE regress_readallstats PASSWORD NEON_PASSWORD_PLACEHOLDER; + + SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- no + SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT'); -- no +@@ -1823,10 +1823,10 @@ RESET ROLE; + DROP ROLE regress_readallstats; + + -- test role grantor machinery +-CREATE ROLE regress_group; +-CREATE ROLE regress_group_direct_manager; +-CREATE ROLE regress_group_indirect_manager; +-CREATE ROLE 
regress_group_member; ++CREATE ROLE regress_group PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_group_direct_manager PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_group_indirect_manager PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_group_member PASSWORD NEON_PASSWORD_PLACEHOLDER; + + GRANT regress_group TO regress_group_direct_manager WITH INHERIT FALSE, ADMIN TRUE; + GRANT regress_group_direct_manager TO regress_group_indirect_manager; +@@ -1848,9 +1848,9 @@ DROP ROLE regress_group_indirect_manager; + DROP ROLE regress_group_member; + + -- test SET and INHERIT options with object ownership changes +-CREATE ROLE regress_roleoption_protagonist; +-CREATE ROLE regress_roleoption_donor; +-CREATE ROLE regress_roleoption_recipient; ++CREATE ROLE regress_roleoption_protagonist PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_roleoption_donor PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_roleoption_recipient PASSWORD NEON_PASSWORD_PLACEHOLDER; + CREATE SCHEMA regress_roleoption; + GRANT CREATE, USAGE ON SCHEMA regress_roleoption TO PUBLIC; + GRANT regress_roleoption_donor TO regress_roleoption_protagonist WITH INHERIT TRUE, SET FALSE; +diff --git a/src/test/regress/sql/psql.sql b/src/test/regress/sql/psql.sql +index f3bc6cd07e..f1a2f58069 100644 +--- a/src/test/regress/sql/psql.sql ++++ b/src/test/regress/sql/psql.sql +@@ -496,7 +496,7 @@ select 1 where false; + \pset expanded off + + CREATE SCHEMA tableam_display; +-CREATE ROLE regress_display_role; ++CREATE ROLE regress_display_role PASSWORD NEON_PASSWORD_PLACEHOLDER; + ALTER SCHEMA tableam_display OWNER TO regress_display_role; + SET search_path TO tableam_display; + CREATE ACCESS METHOD heap_psql TYPE TABLE HANDLER heap_tableam_handler; +@@ -1174,7 +1174,7 @@ select 1/(15-unique2) from tenk1 order by unique2 limit 19; + \unset FETCH_COUNT + + create schema testpart; +-create role regress_partitioning_role; ++create role regress_partitioning_role PASSWORD NEON_PASSWORD_PLACEHOLDER; + + alter schema testpart owner to regress_partitioning_role; + +@@ -1285,7 +1285,7 @@ reset work_mem; + + -- check \df+ + -- we have to use functions with a predictable owner name, so make a role +-create role regress_psql_user superuser; ++create role regress_psql_user superuser PASSWORD NEON_PASSWORD_PLACEHOLDER; + begin; + set session authorization regress_psql_user; + +@@ -1431,11 +1431,14 @@ CREATE TEMPORARY TABLE reload_output( + ); + + SELECT 1 AS a \g :g_out_file +-COPY reload_output(line) FROM :'g_out_file'; ++\set command '\\COPY reload_output(line) FROM ' :'g_out_file'; ++:command + SELECT 2 AS b\; SELECT 3 AS c\; SELECT 4 AS d \g :g_out_file +-COPY reload_output(line) FROM :'g_out_file'; ++\set command '\\COPY reload_output(line) FROM ' :'g_out_file'; ++:command + COPY (SELECT 'foo') TO STDOUT \; COPY (SELECT 'bar') TO STDOUT \g :g_out_file +-COPY reload_output(line) FROM :'g_out_file'; ++\set command '\\COPY reload_output(line) FROM ' :'g_out_file'; ++:command + + SELECT line FROM reload_output ORDER BY lineno; + TRUNCATE TABLE reload_output; +@@ -1452,17 +1455,20 @@ SELECT 1 AS a\; SELECT 2 AS b\; SELECT 3 AS c; + -- COPY TO file + -- The data goes to :g_out_file and the status to :o_out_file + \set QUIET false +-COPY (SELECT unique1 FROM onek ORDER BY unique1 LIMIT 10) TO :'g_out_file'; ++\set command '\\COPY (SELECT unique1 FROM onek ORDER BY unique1 LIMIT 10) TO ' :'g_out_file'; ++:command + -- DML command status + UPDATE onek SET unique1 = unique1 WHERE false; + \set QUIET true + \o + + -- 
Check the contents of the files generated. +-COPY reload_output(line) FROM :'g_out_file'; ++\set command '\\COPY reload_output(line) FROM ' :'g_out_file'; ++:command + SELECT line FROM reload_output ORDER BY lineno; + TRUNCATE TABLE reload_output; +-COPY reload_output(line) FROM :'o_out_file'; ++\set command '\\COPY reload_output(line) FROM ' :'o_out_file'; ++:command + SELECT line FROM reload_output ORDER BY lineno; + TRUNCATE TABLE reload_output; + +@@ -1475,10 +1481,12 @@ COPY (SELECT 'foo2') TO STDOUT \; COPY (SELECT 'bar2') TO STDOUT \g :g_out_file + \o + + -- Check the contents of the files generated. +-COPY reload_output(line) FROM :'g_out_file'; ++\set command '\\COPY reload_output(line) FROM ' :'g_out_file'; ++:command + SELECT line FROM reload_output ORDER BY lineno; + TRUNCATE TABLE reload_output; +-COPY reload_output(line) FROM :'o_out_file'; ++\set command '\\COPY reload_output(line) FROM ' :'o_out_file'; ++:command + SELECT line FROM reload_output ORDER BY lineno; + + DROP TABLE reload_output; +@@ -1825,10 +1833,10 @@ DROP FUNCTION psql_error; + \dX "no.such.database"."no.such.schema"."no.such.extended.statistics" + + -- check \drg and \du +-CREATE ROLE regress_du_role0; +-CREATE ROLE regress_du_role1; +-CREATE ROLE regress_du_role2; +-CREATE ROLE regress_du_admin; ++CREATE ROLE regress_du_role0 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_du_role1 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_du_role2 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_du_admin PASSWORD NEON_PASSWORD_PLACEHOLDER; + + GRANT regress_du_role0 TO regress_du_admin WITH ADMIN TRUE; + GRANT regress_du_role1 TO regress_du_admin WITH ADMIN TRUE; +diff --git a/src/test/regress/sql/publication.sql b/src/test/regress/sql/publication.sql +index d5051a5e74..b32d729271 100644 +--- a/src/test/regress/sql/publication.sql ++++ b/src/test/regress/sql/publication.sql +@@ -1,9 +1,9 @@ + -- + -- PUBLICATION + -- +-CREATE ROLE regress_publication_user LOGIN SUPERUSER; +-CREATE ROLE regress_publication_user2; +-CREATE ROLE regress_publication_user_dummy LOGIN NOSUPERUSER; ++CREATE ROLE regress_publication_user LOGIN SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_publication_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_publication_user_dummy LOGIN NOSUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; + SET SESSION AUTHORIZATION 'regress_publication_user'; + + -- suppress warning that depends on wal_level +@@ -801,7 +801,7 @@ DROP PUBLICATION testpub2; + DROP PUBLICATION testpub3; + + SET ROLE regress_publication_user; +-CREATE ROLE regress_publication_user3; ++CREATE ROLE regress_publication_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER; + GRANT regress_publication_user2 TO regress_publication_user3; + SET client_min_messages = 'ERROR'; + CREATE PUBLICATION testpub4 FOR TABLES IN SCHEMA pub_test; +diff --git a/src/test/regress/sql/regproc.sql b/src/test/regress/sql/regproc.sql +index de2aa881a8..41a675fd35 100644 +--- a/src/test/regress/sql/regproc.sql ++++ b/src/test/regress/sql/regproc.sql +@@ -4,7 +4,7 @@ + + /* If objects exist, return oids */ + +-CREATE ROLE regress_regrole_test; ++CREATE ROLE regress_regrole_test PASSWORD NEON_PASSWORD_PLACEHOLDER; + + -- without schemaname + +diff --git a/src/test/regress/sql/roleattributes.sql b/src/test/regress/sql/roleattributes.sql +index c961b2d730..0859b89c4f 100644 +--- a/src/test/regress/sql/roleattributes.sql ++++ b/src/test/regress/sql/roleattributes.sql +@@ -1,83 +1,83 @@ + -- default for superuser is 
false +-CREATE ROLE regress_test_def_superuser; ++CREATE ROLE regress_test_def_superuser PASSWORD NEON_PASSWORD_PLACEHOLDER; + +-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_superuser'; +-CREATE ROLE regress_test_superuser WITH SUPERUSER; +-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser'; ++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_superuser'; ++CREATE ROLE regress_test_superuser WITH SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; ++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser'; + ALTER ROLE regress_test_superuser WITH NOSUPERUSER; +-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser'; ++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser'; + ALTER ROLE regress_test_superuser WITH SUPERUSER; +-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser'; ++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser'; + + -- default for inherit is true +-CREATE ROLE regress_test_def_inherit; +-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_inherit'; +-CREATE ROLE regress_test_inherit WITH NOINHERIT; +-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit'; ++CREATE ROLE regress_test_def_inherit PASSWORD NEON_PASSWORD_PLACEHOLDER; ++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_inherit'; ++CREATE ROLE regress_test_inherit WITH NOINHERIT PASSWORD NEON_PASSWORD_PLACEHOLDER; ++SELECT rolname, rolsuper, rolinherit, 
rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit'; + ALTER ROLE regress_test_inherit WITH INHERIT; +-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit'; ++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit'; + ALTER ROLE regress_test_inherit WITH NOINHERIT; +-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit'; ++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit'; + + -- default for create role is false +-CREATE ROLE regress_test_def_createrole; +-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_createrole'; +-CREATE ROLE regress_test_createrole WITH CREATEROLE; +-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole'; ++CREATE ROLE regress_test_def_createrole PASSWORD NEON_PASSWORD_PLACEHOLDER; ++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_createrole'; ++CREATE ROLE regress_test_createrole WITH CREATEROLE PASSWORD NEON_PASSWORD_PLACEHOLDER; ++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole'; + ALTER ROLE regress_test_createrole WITH NOCREATEROLE; +-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole'; ++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole'; + ALTER ROLE regress_test_createrole WITH CREATEROLE; +-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, 
rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole'; ++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole'; + + -- default for create database is false +-CREATE ROLE regress_test_def_createdb; +-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_createdb'; +-CREATE ROLE regress_test_createdb WITH CREATEDB; +-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb'; ++CREATE ROLE regress_test_def_createdb PASSWORD NEON_PASSWORD_PLACEHOLDER; ++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_createdb'; ++CREATE ROLE regress_test_createdb WITH CREATEDB PASSWORD NEON_PASSWORD_PLACEHOLDER; ++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb'; + ALTER ROLE regress_test_createdb WITH NOCREATEDB; +-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb'; ++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb'; + ALTER ROLE regress_test_createdb WITH CREATEDB; +-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb'; ++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb'; + + -- default for can login is false for role +-CREATE ROLE regress_test_def_role_canlogin; +-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_role_canlogin'; +-CREATE ROLE regress_test_role_canlogin WITH LOGIN; +-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin'; ++CREATE ROLE regress_test_def_role_canlogin 
PASSWORD NEON_PASSWORD_PLACEHOLDER; ++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_role_canlogin'; ++CREATE ROLE regress_test_role_canlogin WITH LOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; ++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin'; + ALTER ROLE regress_test_role_canlogin WITH NOLOGIN; +-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin'; ++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin'; + ALTER ROLE regress_test_role_canlogin WITH LOGIN; +-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin'; ++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin'; + + -- default for can login is true for user +-CREATE USER regress_test_def_user_canlogin; +-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_user_canlogin'; +-CREATE USER regress_test_user_canlogin WITH NOLOGIN; +-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin'; ++CREATE USER regress_test_def_user_canlogin PASSWORD NEON_PASSWORD_PLACEHOLDER; ++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_user_canlogin'; ++CREATE USER regress_test_user_canlogin WITH NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; ++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin'; + ALTER USER regress_test_user_canlogin WITH LOGIN; +-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM 
pg_authid WHERE rolname = 'regress_test_user_canlogin'; ++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin'; + ALTER USER regress_test_user_canlogin WITH NOLOGIN; +-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin'; ++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin'; + + -- default for replication is false +-CREATE ROLE regress_test_def_replication; +-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_replication'; +-CREATE ROLE regress_test_replication WITH REPLICATION; +-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication'; ++CREATE ROLE regress_test_def_replication PASSWORD NEON_PASSWORD_PLACEHOLDER; ++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_replication'; ++CREATE ROLE regress_test_replication WITH REPLICATION PASSWORD NEON_PASSWORD_PLACEHOLDER; ++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication'; + ALTER ROLE regress_test_replication WITH NOREPLICATION; +-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication'; ++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication'; + ALTER ROLE regress_test_replication WITH REPLICATION; +-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication'; ++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication'; + + -- default for bypassrls 
is false +-CREATE ROLE regress_test_def_bypassrls; +-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_bypassrls'; +-CREATE ROLE regress_test_bypassrls WITH BYPASSRLS; +-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls'; ++CREATE ROLE regress_test_def_bypassrls PASSWORD NEON_PASSWORD_PLACEHOLDER; ++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_bypassrls'; ++CREATE ROLE regress_test_bypassrls WITH BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER; ++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls'; + ALTER ROLE regress_test_bypassrls WITH NOBYPASSRLS; +-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls'; ++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls'; + ALTER ROLE regress_test_bypassrls WITH BYPASSRLS; +-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls'; ++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:$:'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls'; + + -- clean up roles + DROP ROLE regress_test_def_superuser; +diff --git a/src/test/regress/sql/rowsecurity.sql b/src/test/regress/sql/rowsecurity.sql +index dec7340538..cdbc03a5cc 100644 +--- a/src/test/regress/sql/rowsecurity.sql ++++ b/src/test/regress/sql/rowsecurity.sql +@@ -20,13 +20,13 @@ DROP SCHEMA IF EXISTS regress_rls_schema CASCADE; + RESET client_min_messages; + + -- initial setup +-CREATE USER regress_rls_alice NOLOGIN; +-CREATE USER regress_rls_bob NOLOGIN; +-CREATE USER regress_rls_carol NOLOGIN; +-CREATE USER regress_rls_dave NOLOGIN; +-CREATE USER regress_rls_exempt_user BYPASSRLS NOLOGIN; +-CREATE ROLE regress_rls_group1 NOLOGIN; +-CREATE ROLE regress_rls_group2 NOLOGIN; ++CREATE USER regress_rls_alice NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE USER regress_rls_bob NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE USER regress_rls_carol NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE USER regress_rls_dave NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE USER regress_rls_exempt_user BYPASSRLS NOLOGIN 
PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_rls_group1 NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_rls_group2 NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER; + + GRANT regress_rls_group1 TO regress_rls_bob; + GRANT regress_rls_group2 TO regress_rls_carol; +@@ -2065,8 +2065,8 @@ SELECT count(*) = 0 FROM pg_depend + -- DROP OWNED BY testing + RESET SESSION AUTHORIZATION; + +-CREATE ROLE regress_rls_dob_role1; +-CREATE ROLE regress_rls_dob_role2; ++CREATE ROLE regress_rls_dob_role1 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_rls_dob_role2 PASSWORD NEON_PASSWORD_PLACEHOLDER; + + CREATE TABLE dob_t1 (c1 int); + CREATE TABLE dob_t2 (c1 int) PARTITION BY RANGE (c1); +diff --git a/src/test/regress/sql/rules.sql b/src/test/regress/sql/rules.sql +index 8b7e255dcd..c58d095c05 100644 +--- a/src/test/regress/sql/rules.sql ++++ b/src/test/regress/sql/rules.sql +@@ -1356,7 +1356,7 @@ DROP TABLE ruletest2; + -- Test non-SELECT rule on security invoker view. + -- Should use view owner's permissions. + -- +-CREATE USER regress_rule_user1; ++CREATE USER regress_rule_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; + + CREATE TABLE ruletest_t1 (x int); + CREATE TABLE ruletest_t2 (x int); +diff --git a/src/test/regress/sql/security_label.sql b/src/test/regress/sql/security_label.sql +index 98e6a5f211..68c868fef2 100644 +--- a/src/test/regress/sql/security_label.sql ++++ b/src/test/regress/sql/security_label.sql +@@ -10,8 +10,8 @@ DROP ROLE IF EXISTS regress_seclabel_user2; + + RESET client_min_messages; + +-CREATE USER regress_seclabel_user1 WITH CREATEROLE; +-CREATE USER regress_seclabel_user2; ++CREATE USER regress_seclabel_user1 WITH CREATEROLE PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE USER regress_seclabel_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; + + CREATE TABLE seclabel_tbl1 (a int, b text); + CREATE TABLE seclabel_tbl2 (x int, y text); +diff --git a/src/test/regress/sql/select_into.sql b/src/test/regress/sql/select_into.sql +index 689c448cc2..223ceb1d75 100644 +--- a/src/test/regress/sql/select_into.sql ++++ b/src/test/regress/sql/select_into.sql +@@ -20,7 +20,7 @@ DROP TABLE sitmp1; + -- SELECT INTO and INSERT permission, if owner is not allowed to insert. 
+ -- + CREATE SCHEMA selinto_schema; +-CREATE USER regress_selinto_user; ++CREATE USER regress_selinto_user PASSWORD NEON_PASSWORD_PLACEHOLDER; + ALTER DEFAULT PRIVILEGES FOR ROLE regress_selinto_user + REVOKE INSERT ON TABLES FROM regress_selinto_user; + GRANT ALL ON SCHEMA selinto_schema TO public; +diff --git a/src/test/regress/sql/select_views.sql b/src/test/regress/sql/select_views.sql +index e742f13699..7bd0255df8 100644 +--- a/src/test/regress/sql/select_views.sql ++++ b/src/test/regress/sql/select_views.sql +@@ -12,7 +12,7 @@ SELECT * FROM toyemp WHERE name = 'sharon'; + -- + -- Test for Leaky view scenario + -- +-CREATE ROLE regress_alice; ++CREATE ROLE regress_alice PASSWORD NEON_PASSWORD_PLACEHOLDER; + + CREATE FUNCTION f_leak (text) + RETURNS bool LANGUAGE 'plpgsql' COST 0.0000001 +diff --git a/src/test/regress/sql/sequence.sql b/src/test/regress/sql/sequence.sql +index 793f1415f6..ec07c1f193 100644 +--- a/src/test/regress/sql/sequence.sql ++++ b/src/test/regress/sql/sequence.sql +@@ -293,7 +293,7 @@ ROLLBACK; + + -- privileges tests + +-CREATE USER regress_seq_user; ++CREATE USER regress_seq_user PASSWORD NEON_PASSWORD_PLACEHOLDER; + + -- nextval + BEGIN; +diff --git a/src/test/regress/sql/stats.sql b/src/test/regress/sql/stats.sql +index 1e21e55c6d..2251f50c5e 100644 +--- a/src/test/regress/sql/stats.sql ++++ b/src/test/regress/sql/stats.sql +@@ -622,23 +622,6 @@ SELECT :io_sum_shared_after_writes > :io_sum_shared_before_writes; + SELECT current_setting('fsync') = 'off' + OR :io_sum_shared_after_fsyncs > :io_sum_shared_before_fsyncs; + +--- Change the tablespace so that the table is rewritten directly, then SELECT +--- from it to cause it to be read back into shared buffers. +-SELECT sum(reads) AS io_sum_shared_before_reads +- FROM pg_stat_io WHERE context = 'normal' AND object = 'relation' \gset +--- Do this in a transaction to prevent spurious failures due to concurrent accesses to our newly +--- rewritten table, e.g. by autovacuum. +-BEGIN; +-ALTER TABLE test_io_shared SET TABLESPACE regress_tblspace; +--- SELECT from the table so that the data is read into shared buffers and +--- context 'normal', object 'relation' reads are counted. +-SELECT COUNT(*) FROM test_io_shared; +-COMMIT; +-SELECT pg_stat_force_next_flush(); +-SELECT sum(reads) AS io_sum_shared_after_reads +- FROM pg_stat_io WHERE context = 'normal' AND object = 'relation' \gset +-SELECT :io_sum_shared_after_reads > :io_sum_shared_before_reads; +- + SELECT sum(hits) AS io_sum_shared_before_hits + FROM pg_stat_io WHERE context = 'normal' AND object = 'relation' \gset + -- Select from the table again to count hits. 
+diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql +index 1b80d3687b..4d8798b0b1 100644 +--- a/src/test/regress/sql/stats_ext.sql ++++ b/src/test/regress/sql/stats_ext.sql +@@ -50,7 +50,7 @@ DROP TABLE ext_stats_test; + CREATE TABLE ab1 (a INTEGER, b INTEGER, c INTEGER); + CREATE STATISTICS IF NOT EXISTS ab1_a_b_stats ON a, b FROM ab1; + COMMENT ON STATISTICS ab1_a_b_stats IS 'new comment'; +-CREATE ROLE regress_stats_ext; ++CREATE ROLE regress_stats_ext PASSWORD NEON_PASSWORD_PLACEHOLDER; + SET SESSION AUTHORIZATION regress_stats_ext; + COMMENT ON STATISTICS ab1_a_b_stats IS 'changed comment'; + DROP STATISTICS ab1_a_b_stats; +@@ -1607,7 +1607,7 @@ drop statistics stts_t1_expr_expr_stat; + set search_path to public, stts_s1; + \dX + +-create role regress_stats_ext nosuperuser; ++create role regress_stats_ext nosuperuser PASSWORD NEON_PASSWORD_PLACEHOLDER; + set role regress_stats_ext; + \dX + reset role; +@@ -1618,7 +1618,7 @@ drop user regress_stats_ext; + reset search_path; + + -- User with no access +-CREATE USER regress_stats_user1; ++CREATE USER regress_stats_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; + GRANT USAGE ON SCHEMA tststats TO regress_stats_user1; + SET SESSION AUTHORIZATION regress_stats_user1; + SELECT * FROM tststats.priv_test_tbl; -- Permission denied +diff --git a/src/test/regress/sql/subscription.sql b/src/test/regress/sql/subscription.sql +index 444e563ff3..1a538a98a0 100644 +--- a/src/test/regress/sql/subscription.sql ++++ b/src/test/regress/sql/subscription.sql +@@ -2,10 +2,10 @@ + -- SUBSCRIPTION + -- + +-CREATE ROLE regress_subscription_user LOGIN SUPERUSER; +-CREATE ROLE regress_subscription_user2; +-CREATE ROLE regress_subscription_user3 IN ROLE pg_create_subscription; +-CREATE ROLE regress_subscription_user_dummy LOGIN NOSUPERUSER; ++CREATE ROLE regress_subscription_user LOGIN SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_subscription_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_subscription_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_create_subscription; ++CREATE ROLE regress_subscription_user_dummy LOGIN NOSUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER; + SET SESSION AUTHORIZATION 'regress_subscription_user'; + + -- fail - no publications +diff --git a/src/test/regress/sql/test_setup.sql b/src/test/regress/sql/test_setup.sql +index 1b2d434683..b765c748b8 100644 +--- a/src/test/regress/sql/test_setup.sql ++++ b/src/test/regress/sql/test_setup.sql +@@ -135,7 +135,8 @@ CREATE TABLE onek ( + ); + + \set filename :abs_srcdir '/data/onek.data' +-COPY onek FROM :'filename'; ++\set command '\\copy onek FROM ' :'filename'; ++:command + VACUUM ANALYZE onek; + + CREATE TABLE onek2 AS SELECT * FROM onek; +@@ -161,7 +162,8 @@ CREATE TABLE tenk1 ( + ); + + \set filename :abs_srcdir '/data/tenk.data' +-COPY tenk1 FROM :'filename'; ++\set command '\\copy tenk1 FROM ' :'filename'; ++:command + VACUUM ANALYZE tenk1; + + CREATE TABLE tenk2 AS SELECT * FROM tenk1; +@@ -174,7 +176,8 @@ CREATE TABLE person ( + ); + + \set filename :abs_srcdir '/data/person.data' +-COPY person FROM :'filename'; ++\set command '\\copy person FROM ' :'filename'; ++:command + VACUUM ANALYZE person; + + CREATE TABLE emp ( +@@ -183,7 +186,8 @@ CREATE TABLE emp ( + ) INHERITS (person); + + \set filename :abs_srcdir '/data/emp.data' +-COPY emp FROM :'filename'; ++\set command '\\copy emp FROM ' :'filename'; ++:command + VACUUM ANALYZE emp; + + CREATE TABLE student ( +@@ -191,7 +195,8 @@ CREATE TABLE student ( + ) INHERITS 
(person); + + \set filename :abs_srcdir '/data/student.data' +-COPY student FROM :'filename'; ++\set command '\\copy student FROM ' :'filename'; ++:command + VACUUM ANALYZE student; + + CREATE TABLE stud_emp ( +@@ -199,7 +204,8 @@ CREATE TABLE stud_emp ( + ) INHERITS (emp, student); + + \set filename :abs_srcdir '/data/stud_emp.data' +-COPY stud_emp FROM :'filename'; ++\set command '\\copy stud_emp FROM ' :'filename'; ++:command + VACUUM ANALYZE stud_emp; + + CREATE TABLE road ( +@@ -208,7 +214,8 @@ CREATE TABLE road ( + ); + + \set filename :abs_srcdir '/data/streets.data' +-COPY road FROM :'filename'; ++\set command '\\copy road FROM ' :'filename'; ++:command + VACUUM ANALYZE road; + + CREATE TABLE ihighway () INHERITS (road); +diff --git a/src/test/regress/sql/tsearch.sql b/src/test/regress/sql/tsearch.sql +index fbd26cdba4..7ec2d78eee 100644 +--- a/src/test/regress/sql/tsearch.sql ++++ b/src/test/regress/sql/tsearch.sql +@@ -49,7 +49,8 @@ CREATE TABLE test_tsvector( + ); + + \set filename :abs_srcdir '/data/tsearch.data' +-COPY test_tsvector FROM :'filename'; ++\set command '\\copy test_tsvector FROM ' :'filename'; ++:command + + ANALYZE test_tsvector; + +diff --git a/src/test/regress/sql/updatable_views.sql b/src/test/regress/sql/updatable_views.sql +index 0a3176e25d..7744ef68f5 100644 +--- a/src/test/regress/sql/updatable_views.sql ++++ b/src/test/regress/sql/updatable_views.sql +@@ -425,9 +425,9 @@ DROP TABLE base_tbl CASCADE; + + -- permissions checks + +-CREATE USER regress_view_user1; +-CREATE USER regress_view_user2; +-CREATE USER regress_view_user3; ++CREATE USER regress_view_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE USER regress_view_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE USER regress_view_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER; + + SET SESSION AUTHORIZATION regress_view_user1; + CREATE TABLE base_tbl(a int, b text, c float); +@@ -1586,8 +1586,8 @@ drop view uv_iocu_view; + drop table uv_iocu_tab; + + -- ON CONFLICT DO UPDATE permissions checks +-create user regress_view_user1; +-create user regress_view_user2; ++create user regress_view_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++create user regress_view_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER; + + set session authorization regress_view_user1; + create table base_tbl(a int unique, b text, c float); +diff --git a/src/test/regress/sql/update.sql b/src/test/regress/sql/update.sql +index 7a7bee77b9..07b480cd59 100644 +--- a/src/test/regress/sql/update.sql ++++ b/src/test/regress/sql/update.sql +@@ -339,7 +339,7 @@ DROP FUNCTION func_parted_mod_b(); + ----------------------------------------- + + ALTER TABLE range_parted ENABLE ROW LEVEL SECURITY; +-CREATE USER regress_range_parted_user; ++CREATE USER regress_range_parted_user PASSWORD NEON_PASSWORD_PLACEHOLDER; + GRANT ALL ON range_parted, mintab TO regress_range_parted_user; + CREATE POLICY seeall ON range_parted AS PERMISSIVE FOR SELECT USING (true); + CREATE POLICY policy_range_parted ON range_parted for UPDATE USING (true) WITH CHECK (c % 2 = 0); +diff --git a/src/test/regress/sql/vacuum.sql b/src/test/regress/sql/vacuum.sql +index ae36b54641..5612b8e162 100644 +--- a/src/test/regress/sql/vacuum.sql ++++ b/src/test/regress/sql/vacuum.sql +@@ -335,7 +335,7 @@ CREATE TABLE vacowned (a int); + CREATE TABLE vacowned_parted (a int) PARTITION BY LIST (a); + CREATE TABLE vacowned_part1 PARTITION OF vacowned_parted FOR VALUES IN (1); + CREATE TABLE vacowned_part2 PARTITION OF vacowned_parted FOR VALUES IN (2); +-CREATE ROLE regress_vacuum; ++CREATE ROLE 
regress_vacuum PASSWORD NEON_PASSWORD_PLACEHOLDER; + SET ROLE regress_vacuum; + -- Simple table + VACUUM vacowned; diff --git a/test_runner/cloud_regress/test_cloud_regress.py b/test_runner/cloud_regress/test_cloud_regress.py new file mode 100644 index 000000000000..de71357232a8 --- /dev/null +++ b/test_runner/cloud_regress/test_cloud_regress.py @@ -0,0 +1,100 @@ +""" +Run the regression tests on the cloud instance of Neon +""" + +from pathlib import Path +from typing import Any + +import psycopg2 +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import RemotePostgres +from fixtures.pg_version import PgVersion + + +@pytest.fixture +def setup(remote_pg: RemotePostgres): + """ + Setup and teardown of the tests + """ + with psycopg2.connect(remote_pg.connstr()) as conn: + with conn.cursor() as cur: + log.info("Creating the extension") + cur.execute("CREATE EXTENSION IF NOT EXISTS regress_so") + conn.commit() + # TODO: Migrate to branches and remove this code + log.info("Looking for subscriptions in the regress database") + cur.execute( + "SELECT subname FROM pg_catalog.pg_subscription WHERE " + "subdbid = (SELECT oid FROM pg_catalog.pg_database WHERE datname='regression');" + ) + if cur.rowcount > 0: + with psycopg2.connect( + dbname="regression", + host=remote_pg.default_options["host"], + user=remote_pg.default_options["user"], + password=remote_pg.default_options["password"], + ) as regress_conn: + with regress_conn.cursor() as regress_cur: + for sub in cur: + regress_cur.execute(f"ALTER SUBSCRIPTION {sub[0]} DISABLE") + regress_cur.execute( + f"ALTER SUBSCRIPTION {sub[0]} SET (slot_name = NONE)" + ) + regress_cur.execute(f"DROP SUBSCRIPTION {sub[0]}") + regress_conn.commit() + + yield + # TODO: Migrate to branches and remove this code + log.info("Looking for extra roles...") + with psycopg2.connect(remote_pg.connstr()) as conn: + with conn.cursor() as cur: + cur.execute( + "SELECT rolname FROM pg_catalog.pg_roles WHERE oid > 16384 AND rolname <> 'neondb_owner'" + ) + roles: list[Any] = [] + for role in cur: + log.info("Role found: %s", role[0]) + roles.append(role[0]) + for role in roles: + cur.execute(f"DROP ROLE {role}") + conn.commit() + + +@pytest.mark.timeout(7200) +@pytest.mark.remote_cluster +def test_cloud_regress( + setup, + remote_pg: RemotePostgres, + pg_version: PgVersion, + pg_distrib_dir: Path, + base_dir: Path, + test_output_dir: Path, +): + """ + Run the regression tests + """ + regress_bin = ( + pg_distrib_dir / f"{pg_version.v_prefixed}/lib/postgresql/pgxs/src/test/regress/pg_regress" + ) + test_path = base_dir / f"vendor/postgres-{pg_version.v_prefixed}/src/test/regress" + + env_vars = { + "PGHOST": remote_pg.default_options["host"], + "PGPORT": str( + remote_pg.default_options["port"] if "port" in remote_pg.default_options else 5432 + ), + "PGUSER": remote_pg.default_options["user"], + "PGPASSWORD": remote_pg.default_options["password"], + "PGDATABASE": remote_pg.default_options["dbname"], + } + regress_cmd = [ + str(regress_bin), + f"--inputdir={test_path}", + f"--bindir={pg_distrib_dir}/{pg_version.v_prefixed}/bin", + "--dlpath=/usr/local/lib", + "--max-concurrent-tests=20", + f"--schedule={test_path}/parallel_schedule", + "--max-connections=5", + ] + remote_pg.pg_bin.run(regress_cmd, env=env_vars, cwd=test_output_dir) diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 80f1c9e4e317..10e8412b1963 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -236,7 +236,7 @@ def 
get_scale_for_db(size_mb: int) -> int: ATTACHMENT_NAME_REGEX: re.Pattern = re.compile( # type: ignore[type-arg] - r"regression\.diffs|.+\.(?:log|stderr|stdout|filediff|metrics|html|walredo)" + r"regression\.(diffs|out)|.+\.(?:log|stderr|stdout|filediff|metrics|html|walredo)" ) From 9490360df428aa7183034a396e8018607f3c4159 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 24 Sep 2024 10:03:41 +0100 Subject: [PATCH 47/77] storcon: improve initial shard scheduling (#9081) ## Problem Scheduling on tenant creation uses different heuristics compared to the scheduling done during background optimizations. This results in scenarios where shards are created and then immediately migrated by the optimizer. ## Summary of changes 1. Make scheduler aware of the type of the shard it is scheduling (attached vs secondary). We wish to have different heuristics. 2. For attached shards, include the attached shard count from the context in the node score calculation. This brings initial shard scheduling in line with what the optimization passes do. 3. Add a test for (2). This looks like a bigger change than required, but the refactoring serves as the basis for az-aware shard scheduling where we also need to make the distinction between attached and secondary shards. Closes https://github.com/neondatabase/neon/issues/8969 --- storage_controller/src/scheduler.rs | 203 +++++++++++++++++++++---- storage_controller/src/service.rs | 8 +- storage_controller/src/tenant_shard.rs | 82 ++++++++-- 3 files changed, 247 insertions(+), 46 deletions(-) diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index deb5f2722628..1cb1fb104d60 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -2,7 +2,7 @@ use crate::{node::Node, tenant_shard::TenantShard}; use itertools::Itertools; use pageserver_api::models::PageserverUtilization; use serde::Serialize; -use std::collections::HashMap; +use std::{collections::HashMap, fmt::Debug}; use utils::{http::error::ApiError, id::NodeId}; /// Scenarios in which we cannot find a suitable location for a tenant shard @@ -27,7 +27,7 @@ pub enum MaySchedule { } #[derive(Serialize)] -struct SchedulerNode { +pub(crate) struct SchedulerNode { /// How many shards are currently scheduled on this node, via their [`crate::tenant_shard::IntentState`]. shard_count: usize, /// How many shards are currently attached on this node, via their [`crate::tenant_shard::IntentState`]. @@ -38,6 +38,137 @@ struct SchedulerNode { may_schedule: MaySchedule, } +pub(crate) trait NodeSchedulingScore: Debug + Ord + Copy + Sized { + fn generate( + node_id: &NodeId, + node: &mut SchedulerNode, + context: &ScheduleContext, + ) -> Option; + fn is_overloaded(&self) -> bool; + fn node_id(&self) -> NodeId; +} + +pub(crate) trait ShardTag { + type Score: NodeSchedulingScore; +} + +pub(crate) struct AttachedShardTag {} +impl ShardTag for AttachedShardTag { + type Score = NodeAttachmentSchedulingScore; +} + +pub(crate) struct SecondaryShardTag {} +impl ShardTag for SecondaryShardTag { + type Score = NodeSecondarySchedulingScore; +} + +/// Scheduling score of a given node for shard attachments. +/// Lower scores indicate more suitable nodes. +/// Ordering is given by member declaration order (top to bottom). +#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)] +pub(crate) struct NodeAttachmentSchedulingScore { + /// The number of shards belonging to the tenant currently being + /// scheduled that are attached to this node. 
+ affinity_score: AffinityScore, + /// Size of [`ScheduleContext::attached_nodes`] for the current node. + /// This normally tracks the number of attached shards belonging to the + /// tenant being scheduled that are already on this node. + attached_shards_in_context: usize, + /// Utilisation score that combines shard count and disk utilisation + utilization_score: u64, + /// Total number of shards attached to this node. When nodes have identical utilisation, this + /// acts as an anti-affinity between attached shards. + total_attached_shard_count: usize, + /// Convenience to make selection deterministic in tests and empty systems + node_id: NodeId, +} + +impl NodeSchedulingScore for NodeAttachmentSchedulingScore { + fn generate( + node_id: &NodeId, + node: &mut SchedulerNode, + context: &ScheduleContext, + ) -> Option { + let utilization = match &mut node.may_schedule { + MaySchedule::Yes(u) => u, + MaySchedule::No => { + return None; + } + }; + + Some(Self { + affinity_score: context + .nodes + .get(node_id) + .copied() + .unwrap_or(AffinityScore::FREE), + attached_shards_in_context: context.attached_nodes.get(node_id).copied().unwrap_or(0), + utilization_score: utilization.cached_score(), + total_attached_shard_count: node.attached_shard_count, + node_id: *node_id, + }) + } + + fn is_overloaded(&self) -> bool { + PageserverUtilization::is_overloaded(self.utilization_score) + } + + fn node_id(&self) -> NodeId { + self.node_id + } +} + +/// Scheduling score of a given node for shard secondaries. +/// Lower scores indicate more suitable nodes. +/// Ordering is given by member declaration order (top to bottom). +#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)] +pub(crate) struct NodeSecondarySchedulingScore { + /// The number of shards belonging to the tenant currently being + /// scheduled that are attached to this node. + affinity_score: AffinityScore, + /// Utilisation score that combines shard count and disk utilisation + utilization_score: u64, + /// Total number of shards attached to this node. When nodes have identical utilisation, this + /// acts as an anti-affinity between attached shards. + total_attached_shard_count: usize, + /// Convenience to make selection deterministic in tests and empty systems + node_id: NodeId, +} + +impl NodeSchedulingScore for NodeSecondarySchedulingScore { + fn generate( + node_id: &NodeId, + node: &mut SchedulerNode, + context: &ScheduleContext, + ) -> Option { + let utilization = match &mut node.may_schedule { + MaySchedule::Yes(u) => u, + MaySchedule::No => { + return None; + } + }; + + Some(Self { + affinity_score: context + .nodes + .get(node_id) + .copied() + .unwrap_or(AffinityScore::FREE), + utilization_score: utilization.cached_score(), + total_attached_shard_count: node.attached_shard_count, + node_id: *node_id, + }) + } + + fn is_overloaded(&self) -> bool { + PageserverUtilization::is_overloaded(self.utilization_score) + } + + fn node_id(&self) -> NodeId { + self.node_id + } +} + impl PartialEq for SchedulerNode { fn eq(&self, other: &Self) -> bool { let may_schedule_matches = matches!( @@ -406,6 +537,28 @@ impl Scheduler { node.and_then(|(node_id, may_schedule)| if may_schedule { Some(node_id) } else { None }) } + /// Compute a schedulling score for each node that the scheduler knows of + /// minus a set of hard excluded nodes. 
+ fn compute_node_scores( + &mut self, + hard_exclude: &[NodeId], + context: &ScheduleContext, + ) -> Vec + where + Score: NodeSchedulingScore, + { + self.nodes + .iter_mut() + .filter_map(|(k, v)| { + if hard_exclude.contains(k) { + None + } else { + Score::generate(k, v, context) + } + }) + .collect() + } + /// hard_exclude: it is forbidden to use nodes in this list, typically becacuse they /// are already in use by this shard -- we use this to avoid picking the same node /// as both attached and secondary location. This is a hard constraint: if we cannot @@ -415,7 +568,7 @@ impl Scheduler { /// to their anti-affinity score. We use this to prefeer to avoid placing shards in /// the same tenant on the same node. This is a soft constraint: the context will never /// cause us to fail to schedule a shard. - pub(crate) fn schedule_shard( + pub(crate) fn schedule_shard( &mut self, hard_exclude: &[NodeId], context: &ScheduleContext, @@ -424,20 +577,7 @@ impl Scheduler { return Err(ScheduleError::NoPageservers); } - let mut scores: Vec<(NodeId, AffinityScore, u64, usize)> = self - .nodes - .iter_mut() - .filter_map(|(k, v)| match &mut v.may_schedule { - MaySchedule::No => None, - MaySchedule::Yes(_) if hard_exclude.contains(k) => None, - MaySchedule::Yes(utilization) => Some(( - *k, - context.nodes.get(k).copied().unwrap_or(AffinityScore::FREE), - utilization.cached_score(), - v.attached_shard_count, - )), - }) - .collect(); + let mut scores = self.compute_node_scores::(hard_exclude, context); // Exclude nodes whose utilization is critically high, if there are alternatives available. This will // cause us to violate affinity rules if it is necessary to avoid critically overloading nodes: for example @@ -445,20 +585,18 @@ impl Scheduler { // overloaded. let non_overloaded_scores = scores .iter() - .filter(|i| !PageserverUtilization::is_overloaded(i.2)) + .filter(|i| !i.is_overloaded()) .copied() .collect::>(); if !non_overloaded_scores.is_empty() { scores = non_overloaded_scores; } - // Sort by, in order of precedence: - // 1st: Affinity score. We should never pick a higher-score node if a lower-score node is available - // 2nd: Utilization score (this combines shard count and disk utilization) - // 3rd: Attached shard count. When nodes have identical utilization (e.g. when populating some - // empty nodes), this acts as an anti-affinity between attached shards. - // 4th: Node ID. This is a convenience to make selection deterministic in tests and empty systems. - scores.sort_by_key(|i| (i.1, i.2, i.3, i.0)); + // Sort the nodes by score. The one with the lowest scores will be the preferred node. + // Refer to [`NodeAttachmentSchedulingScore`] for attached locations and + // [`NodeSecondarySchedulingScore`] for secondary locations to understand how the nodes + // are ranked. + scores.sort(); if scores.is_empty() { // After applying constraints, no pageservers were left. 
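As an aside, not part of the diff: the new score structs lean on the fact that `#[derive(PartialOrd, Ord)]` compares struct fields lexicographically in declaration order, which is what lets the old explicit `sort_by_key` tuple above be replaced by a plain `scores.sort()`. A minimal standalone sketch of that mechanism, with made-up field values:

```rust
// Illustrative only -- not code from this patch. Derived `Ord` compares
// struct fields top to bottom, so declaration order encodes precedence.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
struct Score {
    affinity: u64,    // compared first: lower affinity wins outright ...
    utilization: u64, // ... then utilization breaks affinity ties ...
    node_id: u64,     // ... and node id keeps selection deterministic.
}

fn main() {
    let mut scores = vec![
        Score { affinity: 1, utilization: 9, node_id: 2 },
        Score { affinity: 0, utilization: 99, node_id: 3 },
        Score { affinity: 0, utilization: 5, node_id: 1 },
    ];
    scores.sort();
    // Lowest score wins: among the affinity-0 nodes, utilization decides.
    assert_eq!(scores.first().unwrap().node_id, 1);
}
```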
@@ -481,12 +619,12 @@ impl Scheduler { } // Lowest score wins - let node_id = scores.first().unwrap().0; + let node_id = scores.first().unwrap().node_id(); if !matches!(context.mode, ScheduleMode::Speculative) { tracing::info!( "scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?})", - scores.iter().map(|i| i.0 .0).collect::>() + scores.iter().map(|i| i.node_id().0).collect::>() ); } @@ -556,9 +694,9 @@ mod tests { let context = ScheduleContext::default(); - let scheduled = scheduler.schedule_shard(&[], &context)?; + let scheduled = scheduler.schedule_shard::(&[], &context)?; t1_intent.set_attached(&mut scheduler, Some(scheduled)); - let scheduled = scheduler.schedule_shard(&[], &context)?; + let scheduled = scheduler.schedule_shard::(&[], &context)?; t2_intent.set_attached(&mut scheduler, Some(scheduled)); assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 1); @@ -567,7 +705,8 @@ mod tests { assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 1); assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 1); - let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers(), &context)?; + let scheduled = + scheduler.schedule_shard::(&t1_intent.all_pageservers(), &context)?; t1_intent.push_secondary(&mut scheduler, scheduled); assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 1); @@ -621,7 +760,9 @@ mod tests { scheduler: &mut Scheduler, context: &ScheduleContext, ) { - let scheduled = scheduler.schedule_shard(&[], context).unwrap(); + let scheduled = scheduler + .schedule_shard::(&[], context) + .unwrap(); let mut intent = IntentState::new(); intent.set_attached(scheduler, Some(scheduled)); scheduled_intents.push(intent); diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 957f633feb41..5555505b81d9 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -26,7 +26,7 @@ use crate::{ ShardGenerationState, TenantFilter, }, reconciler::{ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder}, - scheduler::{MaySchedule, ScheduleContext, ScheduleError, ScheduleMode}, + scheduler::{AttachedShardTag, MaySchedule, ScheduleContext, ScheduleError, ScheduleMode}, tenant_shard::{ MigrateAttachment, ReconcileNeeded, ReconcilerStatus, ScheduleOptimization, ScheduleOptimizationAction, @@ -2629,7 +2629,8 @@ impl Service { let scheduler = &mut locked.scheduler; // Right now we only perform the operation on a single node without parallelization // TODO fan out the operation to multiple nodes for better performance - let node_id = scheduler.schedule_shard(&[], &ScheduleContext::default())?; + let node_id = + scheduler.schedule_shard::(&[], &ScheduleContext::default())?; let node = locked .nodes .get(&node_id) @@ -2815,7 +2816,8 @@ impl Service { // Pick an arbitrary node to use for remote deletions (does not have to be where the tenant // was attached, just has to be able to see the S3 content) - let node_id = scheduler.schedule_shard(&[], &ScheduleContext::default())?; + let node_id = + scheduler.schedule_shard::(&[], &ScheduleContext::default())?; let node = nodes .get(&node_id) .expect("Pageservers may not be deleted while lock is active"); diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index cdb0633e2bc6..1f5eb423be8b 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -8,7 +8,10 @@ use crate::{ metrics::{self, ReconcileCompleteLabelGroup, 
ReconcileOutcome}, persistence::TenantShardPersistence, reconciler::{ReconcileUnits, ReconcilerConfig}, - scheduler::{AffinityScore, MaySchedule, RefCountUpdate, ScheduleContext}, + scheduler::{ + AffinityScore, AttachedShardTag, MaySchedule, RefCountUpdate, ScheduleContext, + SecondaryShardTag, + }, service::ReconcileResultRequest, }; use pageserver_api::controller_api::{ @@ -335,19 +338,19 @@ pub(crate) enum ReconcileWaitError { Failed(TenantShardId, Arc), } -#[derive(Eq, PartialEq, Debug)] +#[derive(Eq, PartialEq, Debug, Clone)] pub(crate) struct ReplaceSecondary { old_node_id: NodeId, new_node_id: NodeId, } -#[derive(Eq, PartialEq, Debug)] +#[derive(Eq, PartialEq, Debug, Clone)] pub(crate) struct MigrateAttachment { pub(crate) old_attached_node_id: NodeId, pub(crate) new_attached_node_id: NodeId, } -#[derive(Eq, PartialEq, Debug)] +#[derive(Eq, PartialEq, Debug, Clone)] pub(crate) enum ScheduleOptimizationAction { // Replace one of our secondary locations with a different node ReplaceSecondary(ReplaceSecondary), @@ -355,7 +358,7 @@ pub(crate) enum ScheduleOptimizationAction { MigrateAttachment(MigrateAttachment), } -#[derive(Eq, PartialEq, Debug)] +#[derive(Eq, PartialEq, Debug, Clone)] pub(crate) struct ScheduleOptimization { // What was the reconcile sequence when we generated this optimization? The optimization // should only be applied if the shard's sequence is still at this value, in case other changes @@ -537,7 +540,8 @@ impl TenantShard { Ok((true, promote_secondary)) } else { // Pick a fresh node: either we had no secondaries or none were schedulable - let node_id = scheduler.schedule_shard(&self.intent.secondary, context)?; + let node_id = + scheduler.schedule_shard::(&self.intent.secondary, context)?; tracing::debug!("Selected {} as attached", node_id); self.intent.set_attached(scheduler, Some(node_id)); Ok((true, node_id)) @@ -613,7 +617,8 @@ impl TenantShard { let mut used_pageservers = vec![attached_node_id]; while self.intent.secondary.len() < secondary_count { - let node_id = scheduler.schedule_shard(&used_pageservers, context)?; + let node_id = scheduler + .schedule_shard::(&used_pageservers, context)?; self.intent.push_secondary(scheduler, node_id); used_pageservers.push(node_id); modified = true; @@ -626,7 +631,7 @@ impl TenantShard { modified = true; } else if self.intent.secondary.is_empty() { // Populate secondary by scheduling a fresh node - let node_id = scheduler.schedule_shard(&[], context)?; + let node_id = scheduler.schedule_shard::(&[], context)?; self.intent.push_secondary(scheduler, node_id); modified = true; } @@ -803,9 +808,10 @@ impl TenantShard { // Let the scheduler suggest a node, where it would put us if we were scheduling afresh // This implicitly limits the choice to nodes that are available, and prefers nodes // with lower utilization. - let Ok(candidate_node) = - scheduler.schedule_shard(&self.intent.all_pageservers(), schedule_context) - else { + let Ok(candidate_node) = scheduler.schedule_shard::( + &self.intent.all_pageservers(), + schedule_context, + ) else { // A scheduling error means we have no possible candidate replacements continue; }; @@ -1333,6 +1339,8 @@ impl TenantShard { #[cfg(test)] pub(crate) mod tests { + use std::{cell::RefCell, rc::Rc}; + use pageserver_api::{ controller_api::NodeAvailability, shard::{ShardCount, ShardNumber}, @@ -1637,12 +1645,14 @@ pub(crate) mod tests { // Optimize til quiescent: this emulates what Service::optimize_all does, when // called repeatedly in the background. 
+ // Returns the applied optimizations fn optimize_til_idle( nodes: &HashMap, scheduler: &mut Scheduler, shards: &mut [TenantShard], - ) { + ) -> Vec { let mut loop_n = 0; + let mut optimizations = Vec::default(); loop { let mut schedule_context = ScheduleContext::default(); let mut any_changed = false; @@ -1657,6 +1667,7 @@ pub(crate) mod tests { for shard in shards.iter_mut() { let optimization = shard.optimize_attachment(nodes, &schedule_context); if let Some(optimization) = optimization { + optimizations.push(optimization.clone()); shard.apply_optimization(scheduler, optimization); any_changed = true; break; @@ -1664,6 +1675,7 @@ pub(crate) mod tests { let optimization = shard.optimize_secondary(scheduler, &schedule_context); if let Some(optimization) = optimization { + optimizations.push(optimization.clone()); shard.apply_optimization(scheduler, optimization); any_changed = true; break; @@ -1678,6 +1690,8 @@ pub(crate) mod tests { loop_n += 1; assert!(loop_n < 1000); } + + optimizations } /// Test the balancing behavior of shard scheduling: that it achieves a balance, and @@ -1730,4 +1744,48 @@ pub(crate) mod tests { Ok(()) } + + /// Test that initial shard scheduling is optimal. By optimal we mean + /// that the optimizer cannot find a way to improve it. + /// + /// This test is an example of the scheduling issue described in + /// https://github.com/neondatabase/neon/issues/8969 + #[test] + fn initial_scheduling_is_optimal() -> anyhow::Result<()> { + use itertools::Itertools; + + let nodes = make_test_nodes(2); + + let mut scheduler = Scheduler::new([].iter()); + scheduler.node_upsert(nodes.get(&NodeId(1)).unwrap()); + scheduler.node_upsert(nodes.get(&NodeId(2)).unwrap()); + + let mut a = make_test_tenant(PlacementPolicy::Attached(1), ShardCount::new(4)); + let a_context = Rc::new(RefCell::new(ScheduleContext::default())); + + let mut b = make_test_tenant(PlacementPolicy::Attached(1), ShardCount::new(4)); + let b_context = Rc::new(RefCell::new(ScheduleContext::default())); + + let a_shards_with_context = a.iter_mut().map(|shard| (shard, a_context.clone())); + let b_shards_with_context = b.iter_mut().map(|shard| (shard, b_context.clone())); + + let schedule_order = a_shards_with_context.interleave(b_shards_with_context); + + for (shard, context) in schedule_order { + let context = &mut *context.borrow_mut(); + shard.schedule(&mut scheduler, context).unwrap(); + } + + let applied_to_a = optimize_til_idle(&nodes, &mut scheduler, &mut a); + assert_eq!(applied_to_a, vec![]); + + let applied_to_b = optimize_til_idle(&nodes, &mut scheduler, &mut b); + assert_eq!(applied_to_b, vec![]); + + for shard in a.iter_mut().chain(b.iter_mut()) { + shard.intent.clear(&mut scheduler); + } + + Ok(()) + } } From 2b65a2b53eb065a7f664564adbfa04e74d422b9c Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Tue, 24 Sep 2024 11:52:25 +0200 Subject: [PATCH 48/77] proxy: check if IP is allowed during webauth flow (#9101) neondatabase/cloud#12018 --- proxy/src/auth/backend.rs | 2 +- proxy/src/auth/backend/web.rs | 10 ++++++++++ proxy/src/console/messages.rs | 19 +++++++++++++++++++ 3 files changed, 30 insertions(+), 1 deletion(-) diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 5bc2f2ff65b4..4e9f4591addc 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -444,7 +444,7 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint, &()> { Self::Web(url, ()) => { info!("performing web authentication"); - let info = web::authenticate(ctx, &url, client).await?; + let info = 
web::authenticate(ctx, config, &url, client).await?; Backend::Web(url, info) } diff --git a/proxy/src/auth/backend/web.rs b/proxy/src/auth/backend/web.rs index 58a4bef62ea9..05f437355e64 100644 --- a/proxy/src/auth/backend/web.rs +++ b/proxy/src/auth/backend/web.rs @@ -1,5 +1,6 @@ use crate::{ auth, compute, + config::AuthenticationConfig, console::{self, provider::NodeInfo}, context::RequestMonitoring, error::{ReportableError, UserFacingError}, @@ -58,6 +59,7 @@ pub(crate) fn new_psql_session_id() -> String { pub(super) async fn authenticate( ctx: &RequestMonitoring, + auth_config: &'static AuthenticationConfig, link_uri: &reqwest::Url, client: &mut PqStream, ) -> auth::Result { @@ -89,6 +91,14 @@ pub(super) async fn authenticate( info!(parent: &span, "waiting for console's reply..."); let db_info = waiter.await.map_err(WebAuthError::from)?; + if auth_config.ip_allowlist_check_enabled { + if let Some(allowed_ips) = &db_info.allowed_ips { + if !auth::check_peer_addr_is_in_list(&ctx.peer_addr(), allowed_ips) { + return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr())); + } + } + } + client.write_message_noflush(&Be::NoticeResponse("Connecting to database."))?; // This config should be self-contained, because we won't diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs index 9b66333cd473..85683acb82df 100644 --- a/proxy/src/console/messages.rs +++ b/proxy/src/console/messages.rs @@ -284,6 +284,8 @@ pub(crate) struct DatabaseInfo { /// be inconvenient for debug with local PG instance. pub(crate) password: Option>, pub(crate) aux: MetricsAuxInfo, + #[serde(default)] + pub(crate) allowed_ips: Option>, } // Manually implement debug to omit sensitive info. @@ -294,6 +296,7 @@ impl fmt::Debug for DatabaseInfo { .field("port", &self.port) .field("dbname", &self.dbname) .field("user", &self.user) + .field("allowed_ips", &self.allowed_ips) .finish_non_exhaustive() } } @@ -432,6 +435,22 @@ mod tests { "aux": dummy_aux(), }))?; + // with allowed_ips + let dbinfo = serde_json::from_value::(json!({ + "host": "localhost", + "port": 5432, + "dbname": "postgres", + "user": "john_doe", + "password": "password", + "aux": dummy_aux(), + "allowed_ips": ["127.0.0.1"], + }))?; + + assert_eq!( + dbinfo.allowed_ips, + Some(vec![IpPattern::Single("127.0.0.1".parse()?)]) + ); + Ok(()) } From fc67f8dc6087a0b4f4f0bcd74f6e1dc25fab8cf3 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Tue, 24 Sep 2024 14:15:52 +0200 Subject: [PATCH 49/77] Update PostgreSQL 17 from 17rc1 to 17.0 (#9119) The PostgreSQL 17 vendor module is now based on postgres/postgres @ d7ec59a63d745ba74fba0e280bbf85dc6d1caa3e, presumably the final code change before the V17 tag. 
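As an aside (not part of this commit): `vendor/revisions.json` pairs each major version with a release tag and the pinned submodule commit, as the hunk below shows. A hypothetical sanity check for that convention -- the helper is invented here and assumes the `serde_json` crate -- might look like:

```rust
// Hypothetical check, not part of the repository: every entry in
// vendor/revisions.json should be a [tag, commit] pair whose commit
// is a full 40-hex-digit id, matching the convention in this commit.
use std::collections::HashMap;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let raw = std::fs::read_to_string("vendor/revisions.json")?;
    let revs: HashMap<String, Vec<String>> = serde_json::from_str(&raw)?;
    for (version, pair) in &revs {
        let [tag, sha] = pair.as_slice() else {
            return Err(format!("{version}: expected [tag, commit] pair").into());
        };
        if sha.len() != 40 || !sha.chars().all(|c| c.is_ascii_hexdigit()) {
            return Err(format!("{version}: {tag} is not pinned to a full commit id").into());
        }
    }
    Ok(())
}
```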
--- vendor/postgres-v17 | 2 +- vendor/revisions.json | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index 7b3e52c75ca3..68b5038f27e4 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit 7b3e52c75ca384de9c69477c158b1f5dcdcbb4be +Subproject commit 68b5038f27e493bde6ae552fe066f10cbdfe6a14 diff --git a/vendor/revisions.json b/vendor/revisions.json index bc7070744a26..896a75814e93 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,7 +1,7 @@ { "v17": [ - "17rc1", - "7b3e52c75ca384de9c69477c158b1f5dcdcbb4be" + "17.0", + "68b5038f27e493bde6ae552fe066f10cbdfe6a14" ], "v16": [ "16.4", From a65d4379309e29a23f9e3544988712b33a89a75a Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 24 Sep 2024 15:05:07 +0200 Subject: [PATCH 50/77] chore(#9077): cleanups & code dedup (#9082) Punted from https://github.com/neondatabase/neon/pull/9077 --- pageserver/src/metrics.rs | 33 +++++++++++++-------------------- pageserver/src/tenant/tasks.rs | 3 +-- 2 files changed, 14 insertions(+), 22 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 162e8d1836ff..366bd8290340 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -3208,45 +3208,38 @@ pub(crate) mod tenant_throttling { impl TimelineGet { pub(crate) fn new(tenant_shard_id: &TenantShardId) -> Self { + let per_tenant_label_values = &[ + KIND, + &tenant_shard_id.tenant_id.to_string(), + &tenant_shard_id.shard_slug().to_string(), + ]; TimelineGet { count_accounted_start: { GlobalAndPerTenantIntCounter { global: COUNT_ACCOUNTED_START.with_label_values(&[KIND]), - per_tenant: COUNT_ACCOUNTED_START_PER_TENANT.with_label_values(&[ - KIND, - &tenant_shard_id.tenant_id.to_string(), - &tenant_shard_id.shard_slug().to_string(), - ]), + per_tenant: COUNT_ACCOUNTED_START_PER_TENANT + .with_label_values(per_tenant_label_values), } }, count_accounted_finish: { GlobalAndPerTenantIntCounter { global: COUNT_ACCOUNTED_FINISH.with_label_values(&[KIND]), - per_tenant: COUNT_ACCOUNTED_FINISH_PER_TENANT.with_label_values(&[ - KIND, - &tenant_shard_id.tenant_id.to_string(), - &tenant_shard_id.shard_slug().to_string(), - ]), + per_tenant: COUNT_ACCOUNTED_FINISH_PER_TENANT + .with_label_values(per_tenant_label_values), } }, wait_time: { GlobalAndPerTenantIntCounter { global: WAIT_USECS.with_label_values(&[KIND]), - per_tenant: WAIT_USECS_PER_TENANT.with_label_values(&[ - KIND, - &tenant_shard_id.tenant_id.to_string(), - &tenant_shard_id.shard_slug().to_string(), - ]), + per_tenant: WAIT_USECS_PER_TENANT + .with_label_values(per_tenant_label_values), } }, count_throttled: { GlobalAndPerTenantIntCounter { global: WAIT_COUNT.with_label_values(&[KIND]), - per_tenant: WAIT_COUNT_PER_TENANT.with_label_values(&[ - KIND, - &tenant_shard_id.tenant_id.to_string(), - &tenant_shard_id.shard_slug().to_string(), - ]), + per_tenant: WAIT_COUNT_PER_TENANT + .with_label_values(per_tenant_label_values), } }, } diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 341febb30ab9..3f0f8a21c8a5 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -481,8 +481,7 @@ async fn ingest_housekeeping_loop(tenant: Arc, cancel: CancellationToken let allowed_rps = tenant.timeline_get_throttle.steady_rps(); let delta = now - prev; info!( - n_seconds=%format_args!("{:.3}", - delta.as_secs_f64()), + n_seconds=%format_args!("{:.3}", delta.as_secs_f64()), count_accounted = 
count_accounted_finish, // don't break existing log scraping count_throttled, sum_throttled_usecs, From b224a5a37734d05ffc88143750352eb318cba90d Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Tue, 24 Sep 2024 15:13:18 +0200 Subject: [PATCH 51/77] Move the patch to compute (#9120) ## Problem All the other patches were moved to the compute directory, and only one was left in the patches subdirectory in the root directory. ## Summary of changes The patch was moved to the compute directory, like the others. --- .github/workflows/cloud-regress.yml | 2 +- {patches => compute/patches}/cloud_regress_pg16.patch | 0 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cloud-regress.yml b/.github/workflows/cloud-regress.yml index de6babdde39a..ecafe183f8a8 100644 --- a/.github/workflows/cloud-regress.yml +++ b/.github/workflows/cloud-regress.yml @@ -42,7 +42,7 @@ jobs: - name: Patch the test run: | cd "vendor/postgres-v${DEFAULT_PG_VERSION}" - patch -p1 < "../../patches/cloud_regress_pg${DEFAULT_PG_VERSION}.patch" + patch -p1 < "../../compute/patches/cloud_regress_pg${DEFAULT_PG_VERSION}.patch" - name: Generate a random password id: pwgen diff --git a/patches/cloud_regress_pg16.patch b/compute/patches/cloud_regress_pg16.patch similarity index 100% rename from patches/cloud_regress_pg16.patch rename to compute/patches/cloud_regress_pg16.patch From 70fe0075192d5bc4cbfec5f472ca466d0df477b9 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 24 Sep 2024 16:41:59 +0300 Subject: [PATCH 52/77] test: Make test_hot_standby_feedback more forgiving of slow initialization (#9113) Don't start waiting for the index to appear in the secondary until it has been created in the primary. Before, if the "pgbench -i" step took more than 60 s, we would give up. There was a flaky test failure along those lines at: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-9105/10997477941/index.html#suites/950eff205b552e248417890b8b8f189e/73cf4b5648fa6f74/ Hopefully, this avoids such failures in the future. --- test_runner/regress/test_hot_standby.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py index 35e0c0decb26..be8f70bb7076 100644 --- a/test_runner/regress/test_hot_standby.py +++ b/test_runner/regress/test_hot_standby.py @@ -198,9 +198,6 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool): def run_pgbench(connstr: str, pg_bin: PgBin): log.info(f"Start a pgbench workload on pg {connstr}") - # s10 is about 150MB of data. In debug mode init takes about 15s on SSD. - pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s10", connstr]) - log.info("pgbench init done") pg_bin.run_capture(["pgbench", "-T60", connstr]) @@ -247,9 +244,15 @@ def test_hot_standby_feedback(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): log.info( f"primary connstr is {primary.connstr()}, secondary connstr {secondary.connstr()}" ) + + # s10 is about 150MB of data. In debug mode init takes about 15s on SSD. 
+ pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s10", primary.connstr()]) + log.info("pgbench init done in primary") + t = threading.Thread(target=run_pgbench, args=(primary.connstr(), pg_bin)) t.start() - # Wait until pgbench_accounts is created + filled on replica *and* + + # Wait until we see that the pgbench_accounts is created + filled on replica *and* # index is created. Otherwise index creation would conflict with # read queries and hs feedback won't save us. wait_until(60, 1.0, partial(pgbench_accounts_initialized, secondary)) From 589594c2e1447632b28d31ec69602782ce4634d7 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 23 Sep 2024 20:48:41 +0300 Subject: [PATCH 53/77] test: Skip fsync when initdb'ing the storage controller db After initdb, we configure it with "fsync=off" anyway. --- control_plane/src/storage_controller.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 2b714fbfbf10..0c0e67dff057 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -346,7 +346,14 @@ impl StorageController { let pg_log_path = pg_data_path.join("postgres.log"); if !tokio::fs::try_exists(&pg_data_path).await? { - let initdb_args = ["-D", pg_data_path.as_ref(), "--username", &username()]; + let initdb_args = [ + "-D", + pg_data_path.as_ref(), + "--username", + &username(), + "--no-sync", + "--no-instructions", + ]; tracing::info!( "Initializing storage controller database with args: {:?}", initdb_args From 2f7cecaf6a92e29df0a576b793820899e889ba81 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 23 Sep 2024 20:48:43 +0300 Subject: [PATCH 54/77] test: Poll pageserver availability more aggressively at test startup Even with the 100 ms interval, on my laptop the pageserver always becomes available on second attempt, so this saves about 900 ms at every test startup. --- test_runner/fixtures/neon_fixtures.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 55c1423ed0d5..8c178ae63a50 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2553,7 +2553,7 @@ def poll_node_status( desired_availability: Optional[PageserverAvailability], desired_scheduling_policy: Optional[PageserverSchedulingPolicy], max_attempts: int, - backoff: int, + backoff: float, ): """ Poll the node status until it reaches 'desired_scheduling_policy' and 'desired_availability' @@ -2948,7 +2948,7 @@ def start( self.id ): self.env.storage_controller.poll_node_status( - self.id, PageserverAvailability.ACTIVE, None, max_attempts=20, backoff=1 + self.id, PageserverAvailability.ACTIVE, None, max_attempts=200, backoff=0.1 ) return self From 4f67b0225bb946c32f5b9c8d1d96eafbb05295ca Mon Sep 17 00:00:00 2001 From: Yuchen Liang <70461588+yliang412@users.noreply.github.com> Date: Tue, 24 Sep 2024 12:41:38 -0400 Subject: [PATCH 55/77] pageserver: handle decompression outside vectored `read_blobs` (#8942) Part of #8130. ## Problem Currently, decompression is performed within the `read_blobs` implementation and the decompressed blob will be appended to the end of the `BytesMut` buffer. We will lose this flexibility of extending the buffer when we switch to using our own dio-aligned buffer (WIP in https://github.com/neondatabase/neon/pull/8730). 
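As an illustrative aside, not part of this patch: the `BufView` introduced below behaves like `std::borrow::Cow<'_, [u8]>` -- a borrowed view when the blob can be used in place, an owned buffer only when decompression has to materialize new bytes. A minimal sketch of that pattern, with `maybe_decompress` as an invented stand-in for the real decompression path:

```rust
// Sketch only -- `maybe_decompress` is a made-up stand-in, not this patch's API.
use std::borrow::Cow;

fn maybe_decompress(raw: &[u8], compressed: bool) -> Cow<'_, [u8]> {
    if compressed {
        // Decompression must allocate, so hand back an owned buffer.
        Cow::Owned(raw.iter().rev().copied().collect())
    } else {
        // Uncompressed blob: a zero-copy view straight into the read buffer.
        Cow::Borrowed(raw)
    }
}

fn main() {
    let buf = [1u8, 2, 3];
    assert!(matches!(maybe_decompress(&buf, false), Cow::Borrowed(_)));
    assert!(matches!(maybe_decompress(&buf, true), Cow::Owned(_)));
}
```

The real `BufView` adds a `bytes::Bytes`-backed variant on top of this, so sub-views of refcounted buffers also stay zero-copy.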
To facilitate the adoption of the aligned buffer, we need to refactor the code to perform decompression outside `read_blobs`. ## Summary of changes - `VectoredBlobReader::read_blobs` will return `VectoredBlob` without performing decompression and appending the decompressed blob. It becomes the caller's responsibility to decompress the buffer. - Added a new `BufView` type that functions as `Cow`. - Perform decompression within `VectoredBlob::read` so that people don't have to think explicitly about compression when using the reader interface. Signed-off-by: Yuchen Liang --- .../src/tenant/storage_layer/delta_layer.rs | 58 +++++-- .../src/tenant/storage_layer/image_layer.rs | 41 +++-- pageserver/src/tenant/vectored_blob_io.rs | 162 ++++++++++++++---- 3 files changed, 200 insertions(+), 61 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 34f1b15138ec..2b212cfed5d7 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -39,7 +39,7 @@ use crate::tenant::disk_btree::{ use crate::tenant::storage_layer::layer::S3_UPLOAD_LIMIT; use crate::tenant::timeline::GetVectoredError; use crate::tenant::vectored_blob_io::{ - BlobFlag, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, + BlobFlag, BufView, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, VectoredReadCoalesceMode, VectoredReadPlanner, }; use crate::tenant::PageReconstructError; @@ -1021,13 +1021,30 @@ impl DeltaLayerInner { continue; } }; - + let view = BufView::new_slice(&blobs_buf.buf); for meta in blobs_buf.blobs.iter().rev() { if Some(meta.meta.key) == ignore_key_with_err { continue; } + let blob_read = meta.read(&view).await; + let blob_read = match blob_read { + Ok(buf) => buf, + Err(e) => { + reconstruct_state.on_key_error( + meta.meta.key, + PageReconstructError::Other(anyhow!(e).context(format!( + "Failed to decompress blob from virtual file {}", + self.file.path, + ))), + ); + + ignore_key_with_err = Some(meta.meta.key); + continue; + } + }; + + let value = Value::des(&blob_read); - let value = Value::des(&blobs_buf.buf[meta.start..meta.end]); let value = match value { Ok(v) => v, Err(e) => { @@ -1243,21 +1260,21 @@ impl DeltaLayerInner { buf.reserve(read.size()); let res = reader.read_blobs(&read, buf, ctx).await?; + let view = BufView::new_slice(&res.buf); + for blob in res.blobs { let key = blob.meta.key; let lsn = blob.meta.lsn; - let data = &res.buf[blob.start..blob.end]; + + let data = blob.read(&view).await?; #[cfg(debug_assertions)] - Value::des(data) + Value::des(&data) .with_context(|| { format!( - "blob failed to deserialize for {}@{}, {}..{}: {:?}", - blob.meta.key, - blob.meta.lsn, - blob.start, - blob.end, - utils::Hex(data) + "blob failed to deserialize for {}: {:?}", + blob, + utils::Hex(&data) ) }) .unwrap(); @@ -1265,15 +1282,15 @@ impl DeltaLayerInner { // is it an image or will_init walrecord? 
// FIXME: this could be handled by threading the BlobRef to the // VectoredReadBuilder - let will_init = crate::repository::ValueBytes::will_init(data) + let will_init = crate::repository::ValueBytes::will_init(&data) .inspect_err(|_e| { #[cfg(feature = "testing")] - tracing::error!(data=?utils::Hex(data), err=?_e, %key, %lsn, "failed to parse will_init out of serialized value"); + tracing::error!(data=?utils::Hex(&data), err=?_e, %key, %lsn, "failed to parse will_init out of serialized value"); }) .unwrap_or(false); per_blob_copy.clear(); - per_blob_copy.extend_from_slice(data); + per_blob_copy.extend_from_slice(&data); let (tmp, res) = writer .put_value_bytes( @@ -1538,8 +1555,11 @@ impl<'a> DeltaLayerIterator<'a> { .read_blobs(&plan, buf, self.ctx) .await?; let frozen_buf = blobs_buf.buf.freeze(); + let view = BufView::new_bytes(frozen_buf); for meta in blobs_buf.blobs.iter() { - let value = Value::des(&frozen_buf[meta.start..meta.end])?; + let blob_read = meta.read(&view).await?; + let value = Value::des(&blob_read)?; + next_batch.push_back((meta.meta.key, meta.meta.lsn, value)); } self.key_values_batch = next_batch; @@ -1916,9 +1936,13 @@ pub(crate) mod test { let blobs_buf = vectored_blob_reader .read_blobs(&read, buf.take().expect("Should have a buffer"), &ctx) .await?; + let view = BufView::new_slice(&blobs_buf.buf); for meta in blobs_buf.blobs.iter() { - let value = &blobs_buf.buf[meta.start..meta.end]; - assert_eq!(value, entries_meta.index[&(meta.meta.key, meta.meta.lsn)]); + let value = meta.read(&view).await?; + assert_eq!( + &value[..], + &entries_meta.index[&(meta.meta.key, meta.meta.lsn)] + ); } buf = Some(blobs_buf.buf); diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 5de2582ab79f..940d169db096 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -36,7 +36,8 @@ use crate::tenant::disk_btree::{ }; use crate::tenant::timeline::GetVectoredError; use crate::tenant::vectored_blob_io::{ - BlobFlag, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, VectoredReadPlanner, + BlobFlag, BufView, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, + VectoredReadPlanner, }; use crate::tenant::PageReconstructError; use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; @@ -547,15 +548,15 @@ impl ImageLayerInner { let buf = BytesMut::with_capacity(buf_size); let blobs_buf = vectored_blob_reader.read_blobs(&read, buf, ctx).await?; - let frozen_buf = blobs_buf.buf.freeze(); + let view = BufView::new_bytes(frozen_buf); for meta in blobs_buf.blobs.iter() { - let img_buf = frozen_buf.slice(meta.start..meta.end); + let img_buf = meta.read(&view).await?; key_count += 1; writer - .put_image(meta.meta.key, img_buf, ctx) + .put_image(meta.meta.key, img_buf.into_bytes(), ctx) .await .context(format!("Storing key {}", meta.meta.key))?; } @@ -602,13 +603,28 @@ impl ImageLayerInner { match res { Ok(blobs_buf) => { let frozen_buf = blobs_buf.buf.freeze(); - + let view = BufView::new_bytes(frozen_buf); for meta in blobs_buf.blobs.iter() { - let img_buf = frozen_buf.slice(meta.start..meta.end); + let img_buf = meta.read(&view).await; + + let img_buf = match img_buf { + Ok(img_buf) => img_buf, + Err(e) => { + reconstruct_state.on_key_error( + meta.meta.key, + PageReconstructError::Other(anyhow!(e).context(format!( + "Failed to decompress blob from virtual file {}", + self.file.path, + ))), + ); + + continue; + } + }; 
reconstruct_state.update_key( &meta.meta.key, self.lsn, - Value::Image(img_buf), + Value::Image(img_buf.into_bytes()), ); } } @@ -1025,10 +1041,15 @@ impl<'a> ImageLayerIterator<'a> { let blobs_buf = vectored_blob_reader .read_blobs(&plan, buf, self.ctx) .await?; - let frozen_buf: Bytes = blobs_buf.buf.freeze(); + let frozen_buf = blobs_buf.buf.freeze(); + let view = BufView::new_bytes(frozen_buf); for meta in blobs_buf.blobs.iter() { - let img_buf = frozen_buf.slice(meta.start..meta.end); - next_batch.push_back((meta.meta.key, self.image_layer.lsn, Value::Image(img_buf))); + let img_buf = meta.read(&view).await?; + next_batch.push_back(( + meta.meta.key, + self.image_layer.lsn, + Value::Image(img_buf.into_bytes()), + )); } self.key_values_batch = next_batch; Ok(()) diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index 553edf6d8b34..aa37a45898bd 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -16,8 +16,9 @@ //! Note that the vectored blob api does *not* go through the page cache. use std::collections::BTreeMap; +use std::ops::Deref; -use bytes::BytesMut; +use bytes::{Bytes, BytesMut}; use pageserver_api::key::Key; use tokio::io::AsyncWriteExt; use tokio_epoll_uring::BoundedBuf; @@ -35,11 +36,123 @@ pub struct BlobMeta { pub lsn: Lsn, } -/// Blob offsets into [`VectoredBlobsBuf::buf`] +/// A view into the vectored blobs read buffer. +#[derive(Clone, Debug)] +pub(crate) enum BufView<'a> { + Slice(&'a [u8]), + Bytes(bytes::Bytes), +} + +impl<'a> BufView<'a> { + /// Creates a new slice-based view on the blob. + pub fn new_slice(slice: &'a [u8]) -> Self { + Self::Slice(slice) + } + + /// Creates a new [`bytes::Bytes`]-based view on the blob. + pub fn new_bytes(bytes: bytes::Bytes) -> Self { + Self::Bytes(bytes) + } + + /// Convert the view into `Bytes`. + /// + /// If using slice as the underlying storage, the copy will be an O(n) operation. + pub fn into_bytes(self) -> Bytes { + match self { + BufView::Slice(slice) => Bytes::copy_from_slice(slice), + BufView::Bytes(bytes) => bytes, + } + } + + /// Creates a sub-view of the blob based on the range. + fn view(&self, range: std::ops::Range<usize>) -> Self { + match self { + BufView::Slice(slice) => BufView::Slice(&slice[range]), + BufView::Bytes(bytes) => BufView::Bytes(bytes.slice(range)), + } + } +} + +impl<'a> Deref for BufView<'a> { + type Target = [u8]; + + fn deref(&self) -> &Self::Target { + match self { + BufView::Slice(slice) => slice, + BufView::Bytes(bytes) => bytes, + } + } +} + +impl<'a> AsRef<[u8]> for BufView<'a> { + fn as_ref(&self) -> &[u8] { + match self { + BufView::Slice(slice) => slice, + BufView::Bytes(bytes) => bytes.as_ref(), + } + } +} + +impl<'a> From<&'a [u8]> for BufView<'a> { + fn from(value: &'a [u8]) -> Self { + Self::new_slice(value) + } +} + +impl From<Bytes> for BufView<'_> { + fn from(value: Bytes) -> Self { + Self::new_bytes(value) + } +} + +/// Blob offsets into [`VectoredBlobsBuf::buf`]. The byte range is potentially compressed, +/// subject to [`VectoredBlob::compression_bits`]. pub struct VectoredBlob { - pub start: usize, - pub end: usize, + /// Blob metadata. pub meta: BlobMeta, + /// Start offset. start: usize, + /// End offset. end: usize, + /// Compression used on the blob. compression_bits: u8, +} + +impl VectoredBlob { + /// Reads a decompressed view of the blob. 
+ pub(crate) async fn read<'a>(&self, buf: &BufView<'a>) -> Result<BufView<'a>, std::io::Error> { + let view = buf.view(self.start..self.end); + + match self.compression_bits { + BYTE_UNCOMPRESSED => Ok(view), + BYTE_ZSTD => { + let mut decompressed_vec = Vec::new(); + let mut decoder = + async_compression::tokio::write::ZstdDecoder::new(&mut decompressed_vec); + decoder.write_all(&view).await?; + decoder.flush().await?; + // Zero-copy conversion from `Vec<u8>` to `Bytes` + Ok(BufView::new_bytes(Bytes::from(decompressed_vec))) + } + bits => { + let error = std::io::Error::new( + std::io::ErrorKind::InvalidData, + format!("Failed to decompress blob for {}@{}, {}..{}: invalid compression byte {bits:x}", self.meta.key, self.meta.lsn, self.start, self.end), + ); + Err(error) + } + } + } +} + +impl std::fmt::Display for VectoredBlob { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{}@{}, {}..{}", + self.meta.key, self.meta.lsn, self.start, self.end + ) + } } /// Return type of [`VectoredBlobReader::read_blobs`] @@ -514,7 +627,7 @@ impl<'a> VectoredBlobReader<'a> { ); } - let mut buf = self + let buf = self .file .read_exact_at(buf.slice(0..read.size()), read.start, ctx) .await? @@ -529,9 +642,6 @@ // of a blob is implicit: the start of the next blob if one exists // or the end of the read. - // Some scratch space, put here for reusing the allocation - let mut decompressed_vec = Vec::new(); - for (blob_start, meta) in blobs_at { let blob_start_in_buf = blob_start - start_offset; let first_len_byte = buf[blob_start_in_buf as usize]; @@ -557,35 +667,14 @@ impl<'a> VectoredBlobReader<'a> { ) }; - let start_raw = blob_start_in_buf + size_length; - let end_raw = start_raw + blob_size; - let (start, end); - if compression_bits == BYTE_UNCOMPRESSED { - start = start_raw as usize; - end = end_raw as usize; - } else if compression_bits == BYTE_ZSTD { - let mut decoder = - async_compression::tokio::write::ZstdDecoder::new(&mut decompressed_vec); - decoder - .write_all(&buf[start_raw as usize..end_raw as usize]) - .await?; - decoder.flush().await?; - start = buf.len(); - buf.extend_from_slice(&decompressed_vec); - end = buf.len(); - decompressed_vec.clear(); - } else { - let error = std::io::Error::new( - std::io::ErrorKind::InvalidData, - format!("invalid compression byte {compression_bits:x}"), - ); - return Err(error); - } + let start = (blob_start_in_buf + size_length) as usize; + let end = start + blob_size as usize; metas.push(VectoredBlob { start, end, meta: *meta, + compression_bits, }); } @@ -1020,8 +1109,13 @@ mod tests { let result = vectored_blob_reader.read_blobs(&read, buf, &ctx).await?; assert_eq!(result.blobs.len(), 1); let read_blob = &result.blobs[0]; - let read_buf = &result.buf[read_blob.start..read_blob.end]; - assert_eq!(blob, read_buf, "mismatch for idx={idx} at offset={offset}"); + let view = BufView::new_slice(&result.buf); + let read_buf = read_blob.read(&view).await?; + assert_eq!( + &blob[..], + &read_buf[..], + "mismatch for idx={idx} at offset={offset}" + ); buf = result.buf; } Ok(()) From c47f355ec1d35401d227f02518c24bb19d051085 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 24 Sep 2024 19:28:56 +0200 Subject: [PATCH 56/77] Catch Cancelled and don't print a warning for it (#9121) In the `imitate_synthetic_size_calculation_worker` function, we might obtain the `Cancelled` error variant instead of hitting the cancellation-token-based path. 
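The race looks roughly like this (a hedged sketch with simplified names, not the actual worker code):

```rust
use tokio_util::sync::CancellationToken;

// Simplified stand-in for CalculateSyntheticSizeError.
#[derive(Debug)]
enum CalcError {
    Cancelled,
    Other(String),
}

// Shutdown can surface on either branch: the token arm may win the select,
// or the work itself may notice the shutdown first and return `Cancelled`.
async fn imitate_worker(
    cancel: CancellationToken,
    gather: impl std::future::Future<Output = Result<(), CalcError>>,
) {
    tokio::select! {
        _ = cancel.cancelled() => { /* clean shutdown via the token */ }
        res = gather => match res {
            Ok(()) => {}
            Err(CalcError::Cancelled) => { /* clean shutdown observed by the work */ }
            Err(e) => tracing::warn!("synthetic size calculation failed: {e:?}"),
        }
    }
}
```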
Therefore, catch `Cancelled` and handle it analogously to the cancellation case. Fixes #8886. --- pageserver/src/tenant/timeline/eviction_task.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 2f6cb4d73a69..26c2861b9308 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -30,8 +30,8 @@ use crate::{ pgdatadir_mapping::CollectKeySpaceError, task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, tenant::{ - storage_layer::LayerVisibilityHint, tasks::BackgroundLoopKind, timeline::EvictionError, - LogicalSizeCalculationCause, Tenant, + size::CalculateSyntheticSizeError, storage_layer::LayerVisibilityHint, + tasks::BackgroundLoopKind, timeline::EvictionError, LogicalSizeCalculationCause, Tenant, }, }; @@ -557,6 +557,8 @@ impl Timeline { gather_result = gather => { match gather_result { Ok(_) => {}, + // It can happen sometimes that we hit this instead of the cancellation token firing above + Err(CalculateSyntheticSizeError::Cancelled) => {} Err(e) => { // We don't care about the result, but, if it failed, we should log it, // since consumption metric might be hitting the cached value and From 523cf71721128ad6f58bfce3952fb33fe0086a8c Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 24 Sep 2024 19:11:31 +0100 Subject: [PATCH 57/77] Fix compiler warnings on macOS (#9128) ## Problem Compilation of neon extension on macOS produces a warning ``` pgxn/neon/neon_perf_counters.c:50:1: error: non-void function does not return a value [-Werror,-Wreturn-type] ``` ## Summary of changes - Change the return type of `NeonPerfCountersShmemInit` to void --- pgxn/neon/neon_perf_counters.c | 2 +- pgxn/neon/neon_perf_counters.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pgxn/neon/neon_perf_counters.c b/pgxn/neon/neon_perf_counters.c index 3e86d5b26276..de653826c019 100644 --- a/pgxn/neon/neon_perf_counters.c +++ b/pgxn/neon/neon_perf_counters.c @@ -32,7 +32,7 @@ NeonPerfCountersShmemSize(void) return size; } -bool +void NeonPerfCountersShmemInit(void) { bool found; diff --git a/pgxn/neon/neon_perf_counters.h b/pgxn/neon/neon_perf_counters.h index ae35e8c3a515..02163ada5571 100644 --- a/pgxn/neon/neon_perf_counters.h +++ b/pgxn/neon/neon_perf_counters.h @@ -105,7 +105,7 @@ extern neon_per_backend_counters *neon_per_backend_counters_shared; extern void inc_getpage_wait(uint64 latency); extern Size NeonPerfCountersShmemSize(void); -extern bool NeonPerfCountersShmemInit(void); +extern void NeonPerfCountersShmemInit(void); #endif /* NEON_PERF_COUNTERS_H */ From af5c54ed14f34dfee477659af39628c0d7ec3502 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 24 Sep 2024 23:38:16 +0300 Subject: [PATCH 58/77] test: Make test_lfc_resize more robust (#9117) 1. Increase statement_timeout. It defaults to 120 s, which is not quite enough on slow or busy systems with debug build. On my laptop, the index creation takes about 100 s. On buildfarm, we've seen failures, e.g: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-9084/10997888708/index.html#suites/821f97908a487f1d7d3a2a4dd1571e99/db1834bddfe8c5b9/ 2. Keep twiddling the LFC size through the whole test. Before, we would do it for the first 10 seconds, but that only covers a small part of the pgbench initialization phase. 
Change the loop so that the pgbench run time determines how long the test runs, and we keep changing the LFC for the whole time. In passing, also fix a bogus test description that was copy-pasted from a completely unrelated test. --- test_runner/regress/test_lfc_resize.py | 45 +++++++++++++++++++------- 1 file changed, 33 insertions(+), 12 deletions(-) diff --git a/test_runner/regress/test_lfc_resize.py b/test_runner/regress/test_lfc_resize.py index cb0b30d9c6e8..0f791e924707 100644 --- a/test_runner/regress/test_lfc_resize.py +++ b/test_runner/regress/test_lfc_resize.py @@ -10,11 +10,11 @@ from fixtures.neon_fixtures import NeonEnv, PgBin -# -# Test branching, when a transaction is in prepared state -# @pytest.mark.timeout(600) def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin): + """ + Test resizing the Local File Cache + """ env = neon_simple_env endpoint = env.endpoints.create_start( "main", @@ -32,27 +32,48 @@ def run_pgbench(connstr: str): pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr]) pg_bin.run_capture(["pgbench", "-c10", f"-T{n_resize}", "-Mprepared", "-S", connstr]) - thread = threading.Thread(target=run_pgbench, args=(endpoint.connstr(),), daemon=True) + # Initializing the pgbench database can be very slow, especially on debug builds. + connstr = endpoint.connstr(options="-cstatement_timeout=300s") + + thread = threading.Thread(target=run_pgbench, args=(connstr,), daemon=True) thread.start() conn = endpoint.connect() cur = conn.cursor() - for _ in range(n_resize): + # For as long as pgbench is running, twiddle the LFC size once a second. + # Note that we launch this immediately, already while the "pgbench -i" + # initialization step is still running. That's quite a different workload + # than the actual pgbench benchmark run, so this gives us coverage of both. + while thread.is_alive(): size = random.randint(1, 512) cur.execute(f"alter system set neon.file_cache_size_limit='{size}MB'") cur.execute("select pg_reload_conf()") time.sleep(1) + thread.join() + # At the end, set it at 100 MB, and perform a final check that the disk usage + # of the file is in that ballpark. + # + # We retry the check a few times, because it might take a while for the + # system to react to changing the setting and shrinking the file. 
cur.execute("alter system set neon.file_cache_size_limit='100MB'") cur.execute("select pg_reload_conf()") + nretries = 10 + while True: + lfc_file_path = f"{endpoint.pg_data_dir_path()}/file.cache" + lfc_file_size = os.path.getsize(lfc_file_path) + res = subprocess.run( + ["ls", "-sk", lfc_file_path], check=True, text=True, capture_output=True + ) + lfc_file_blocks = re.findall("([0-9A-F]+)", res.stdout)[0] + log.info(f"Size of LFC file {lfc_file_size}, blocks {lfc_file_blocks}") + assert lfc_file_size <= 512 * 1024 * 1024 - thread.join() + if int(lfc_file_blocks) <= 128 * 1024 or nretries == 0: + break + + nretries = nretries - 1 + time.sleep(1) - lfc_file_path = f"{endpoint.pg_data_dir_path()}/file.cache" - lfc_file_size = os.path.getsize(lfc_file_path) - res = subprocess.run(["ls", "-sk", lfc_file_path], check=True, text=True, capture_output=True) - lfc_file_blocks = re.findall("([0-9A-F]+)", res.stdout)[0] - log.info(f"Size of LFC file {lfc_file_size}, blocks {lfc_file_blocks}") - assert lfc_file_size <= 512 * 1024 * 1024 assert int(lfc_file_blocks) <= 128 * 1024 From 5cbf5b45ae337cc643812a2e6bb76e6eb79142e4 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 24 Sep 2024 23:58:54 +0300 Subject: [PATCH 59/77] Remove TenantState::Loading (#9118) The last real use was removed in commit de90bf4663. It was still used in a few unit tests, but they can use Attaching too. --- libs/pageserver_api/src/models.rs | 27 +++++---------------------- pageserver/src/tenant.rs | 21 +++++---------------- 2 files changed, 10 insertions(+), 38 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index c9be53f0b0c0..45abda0ad85d 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -37,14 +37,11 @@ use bytes::{Buf, BufMut, Bytes, BytesMut}; /// ```mermaid /// stateDiagram-v2 /// -/// [*] --> Loading: spawn_load() /// [*] --> Attaching: spawn_attach() /// -/// Loading --> Activating: activate() /// Attaching --> Activating: activate() /// Activating --> Active: infallible /// -/// Loading --> Broken: load() failure /// Attaching --> Broken: attach() failure /// /// Active --> Stopping: set_stopping(), part of shutdown & detach @@ -68,10 +65,6 @@ use bytes::{Buf, BufMut, Bytes, BytesMut}; )] #[serde(tag = "slug", content = "data")] pub enum TenantState { - /// This tenant is being loaded from local disk. - /// - /// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass. - Loading, /// This tenant is being attached to the pageserver. /// /// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass. @@ -121,8 +114,6 @@ impl TenantState { // But, our attach task might still be fetching the remote timelines, etc. // So, return `Maybe` while Attaching, making Console wait for the attach task to finish. Self::Attaching | Self::Activating(ActivatingFrom::Attaching) => Maybe, - // tenant mgr startup distinguishes attaching from loading via marker file. - Self::Loading | Self::Activating(ActivatingFrom::Loading) => Attached, // We only reach Active after successful load / attach. // So, call atttachment status Attached. Self::Active => Attached, @@ -191,10 +182,11 @@ impl LsnLease { } /// The only [`TenantState`] variants we could be `TenantState::Activating` from. +/// +/// XXX: We used to have more variants here, but now it's just one, which makes this rather +/// useless. Remove, once we've checked that there's no client code left that looks at this. 
#[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub enum ActivatingFrom { - /// Arrived to [`TenantState::Activating`] from [`TenantState::Loading`] - Loading, /// Arrived to [`TenantState::Activating`] from [`TenantState::Attaching`] Attaching, } @@ -1562,11 +1554,8 @@ mod tests { #[test] fn tenantstatus_activating_serde() { - let states = [ - TenantState::Activating(ActivatingFrom::Loading), - TenantState::Activating(ActivatingFrom::Attaching), - ]; - let expected = "[{\"slug\":\"Activating\",\"data\":\"Loading\"},{\"slug\":\"Activating\",\"data\":\"Attaching\"}]"; + let states = [TenantState::Activating(ActivatingFrom::Attaching)]; + let expected = "[{\"slug\":\"Activating\",\"data\":\"Attaching\"}]"; let actual = serde_json::to_string(&states).unwrap(); @@ -1581,13 +1570,7 @@ mod tests { fn tenantstatus_activating_strum() { // tests added, because we use these for metrics let examples = [ - (line!(), TenantState::Loading, "Loading"), (line!(), TenantState::Attaching, "Attaching"), - ( - line!(), - TenantState::Activating(ActivatingFrom::Loading), - "Activating", - ), ( line!(), TenantState::Activating(ActivatingFrom::Attaching), diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 5ed63734f494..53cbaea621eb 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1968,9 +1968,6 @@ impl Tenant { TenantState::Activating(_) | TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping { .. } => { panic!("caller is responsible for calling activate() only on Loading / Attaching tenants, got {state:?}", state = current_state); } - TenantState::Loading => { - *current_state = TenantState::Activating(ActivatingFrom::Loading); - } TenantState::Attaching => { *current_state = TenantState::Activating(ActivatingFrom::Attaching); } @@ -2151,7 +2148,7 @@ impl Tenant { async fn set_stopping( &self, progress: completion::Barrier, - allow_transition_from_loading: bool, + _allow_transition_from_loading: bool, allow_transition_from_attaching: bool, ) -> Result<(), SetStoppingError> { let mut rx = self.state.subscribe(); @@ -2166,7 +2163,6 @@ impl Tenant { ); false } - TenantState::Loading => allow_transition_from_loading, TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping { .. } => true, }) .await @@ -2185,13 +2181,6 @@ impl Tenant { *current_state = TenantState::Stopping { progress }; true } - TenantState::Loading => { - if !allow_transition_from_loading { - unreachable!("3we ensured above that we're done with activation, and, there is no re-activation") - }; - *current_state = TenantState::Stopping { progress }; - true - } TenantState::Active => { // FIXME: due to time-of-check vs time-of-use issues, it can happen that new timelines // are created after the transition to Stopping. That's harmless, as the Timelines @@ -2247,7 +2236,7 @@ impl Tenant { // The load & attach routines own the tenant state until it has reached `Active`. // So, wait until it's done. 
rx.wait_for(|state| match state { - TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => { + TenantState::Activating(_) | TenantState::Attaching => { info!( "waiting for {} to turn Active|Broken|Stopping", <&'static str>::from(state) @@ -2267,7 +2256,7 @@ impl Tenant { let reason = reason.to_string(); self.state.send_modify(|current_state| { match *current_state { - TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => { + TenantState::Activating(_) | TenantState::Attaching => { unreachable!("we ensured above that we're done with activation, and, there is no re-activation") } TenantState::Active => { @@ -2311,7 +2300,7 @@ impl Tenant { loop { let current_state = receiver.borrow_and_update().clone(); match current_state { - TenantState::Loading | TenantState::Attaching | TenantState::Activating(_) => { + TenantState::Attaching | TenantState::Activating(_) => { // in these states, there's a chance that we can reach ::Active self.activate_now(); match timeout_cancellable(timeout, &self.cancel, receiver.changed()).await { @@ -4144,7 +4133,7 @@ pub(crate) mod harness { let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager)); let tenant = Arc::new(Tenant::new( - TenantState::Loading, + TenantState::Attaching, self.conf, AttachedTenantConf::try_from(LocationConf::attached_single( TenantConfOpt::from(self.tenant_conf.clone()), From 938b163b42d614ecc747931e35380b27bf6e1e62 Mon Sep 17 00:00:00 2001 From: Damian972 <25445518+Damian972@users.noreply.github.com> Date: Wed, 25 Sep 2024 00:05:23 +0200 Subject: [PATCH 60/77] chore(docker-compose): fix typo in readme (#9133) Typo in the readme inside docker-compose folder ## Summary of changes - Update the readme --- docker-compose/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker-compose/README.md b/docker-compose/README.md index bd47805a6791..648e4ca030c6 100644 --- a/docker-compose/README.md +++ b/docker-compose/README.md @@ -2,8 +2,8 @@ # Example docker compose configuration The configuration in this directory is used for testing Neon docker images: it is -not intended for deploying a usable system. To run a development environment where -you can experiment with a minature Neon system, use `cargo neon` rather than container images. +not intended for deploying a usable system. To run a development environment where +you can experiment with a miniature Neon system, use `cargo neon` rather than container images. This configuration does not start the storage controller, because the controller needs a way to reconfigure running computes, and no such thing exists in this setup. From 5f2f31e87933be05bd93a239ddc66764ff877546 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 24 Sep 2024 18:33:03 -0400 Subject: [PATCH 61/77] fix(test): storage scrubber should only log to stdout with info (#9067) As @koivunej mentioned in the storage channel, for regress test, we don't need to create a log file for the scrubber, and we should reduce noisy logs. 
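The fixture drives this purely through environment variables; on the Rust side the gate can be as simple as the following sketch (the variable name comes from this patch; the surrounding logging-init code is assumed, not the real scrubber setup):

```rust
use std::env;

// Sketch: honor PAGESERVER_DISABLE_FILE_LOGGING=1 by skipping the file sink,
// leaving stdout logging (filtered via RUST_LOG) as the only output.
fn file_logging_enabled() -> bool {
    !matches!(env::var("PAGESERVER_DISABLE_FILE_LOGGING").as_deref(), Ok("1"))
}
```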
## Summary of changes * Disable log file creation for storage scrubber * Only log at info level --------- Signed-off-by: Alex Chi Z --- test_runner/fixtures/neon_fixtures.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 8c178ae63a50..201eb1087ded 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -4617,7 +4617,8 @@ def scrubber_cli( "REGION": s3_storage.bucket_region, "BUCKET": s3_storage.bucket_name, "BUCKET_PREFIX": s3_storage.prefix_in_bucket, - "RUST_LOG": "DEBUG", + "RUST_LOG": "INFO", + "PAGESERVER_DISABLE_FILE_LOGGING": "1", } env.update(s3_storage.access_env_vars()) @@ -4637,10 +4638,8 @@ def scrubber_cli( (output_path, stdout, status_code) = subprocess_capture( self.log_dir, args, - echo_stderr=True, - echo_stdout=True, env=env, - check=False, + check=True, capture_stdout=True, timeout=timeout, ) From a26cc29d92c8626750a994cb7d50fb796214f51e Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 25 Sep 2024 10:16:06 +0100 Subject: [PATCH 62/77] storcon: add tags to scheduler logs (#9127) We log something at info level each time we schedule a shard to a non-secondary location. Might as well have context for it. --- storage_controller/src/tenant_shard.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 1f5eb423be8b..eccde0e3ab60 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -548,6 +548,11 @@ impl TenantShard { } } + #[instrument(skip_all, fields( + tenant_id=%self.tenant_shard_id.tenant_id, + shard_id=%self.tenant_shard_id.shard_slug(), + sequence=%self.sequence + ))] pub(crate) fn schedule( &mut self, scheduler: &mut Scheduler, From 7dcfcccf7cf03e65dfea6b7b5884f34da1686660 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Wed, 25 Sep 2024 14:38:35 +0200 Subject: [PATCH 63/77] Re-export git-version from utils and remove as direct dep (#9138) --- Cargo.lock | 10 +--------- control_plane/Cargo.toml | 1 - libs/utils/Cargo.toml | 1 + libs/utils/src/lib.rs | 6 +++++- pageserver/Cargo.toml | 1 - pageserver/compaction/Cargo.toml | 1 - pageserver/ctl/Cargo.toml | 1 - proxy/Cargo.toml | 1 - safekeeper/Cargo.toml | 1 - storage_broker/Cargo.toml | 1 - storage_controller/Cargo.toml | 1 - storage_scrubber/Cargo.toml | 1 - 12 files changed, 7 insertions(+), 19 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e4dbd8b33398..d0702e09d412 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1321,7 +1321,6 @@ dependencies = [ "clap", "comfy-table", "compute_api", - "git-version", "humantime", "humantime-serde", "hyper 0.14.30", @@ -3578,7 +3577,6 @@ dependencies = [ "anyhow", "camino", "clap", - "git-version", "humantime", "pageserver", "pageserver_api", @@ -3617,7 +3615,6 @@ dependencies = [ "enumset", "fail", "futures", - "git-version", "hex", "hex-literal", "humantime", @@ -3737,7 +3734,6 @@ dependencies = [ "clap", "criterion", "futures", - "git-version", "hex-literal", "itertools 0.10.5", "once_cell", @@ -4307,7 +4303,6 @@ dependencies = [ "fallible-iterator", "framed-websockets", "futures", - "git-version", "hashbrown 0.14.5", "hashlink", "hex", @@ -5139,7 +5134,6 @@ dependencies = [ "desim", "fail", "futures", - "git-version", "hex", "humantime", "hyper 0.14.30", @@ -5702,7 +5696,6 @@ dependencies = [ "futures", "futures-core", "futures-util", - "git-version", "humantime", "hyper 0.14.30", 
"metrics", @@ -5730,7 +5723,6 @@ dependencies = [ "diesel_migrations", "fail", "futures", - "git-version", "hex", "humantime", "hyper 0.14.30", @@ -5783,7 +5775,6 @@ dependencies = [ "either", "futures", "futures-util", - "git-version", "hex", "humantime", "itertools 0.10.5", @@ -6715,6 +6706,7 @@ dependencies = [ "criterion", "fail", "futures", + "git-version", "hex", "hex-literal", "humantime", diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index c185d20484a4..df87c181bf47 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -9,7 +9,6 @@ anyhow.workspace = true camino.workspace = true clap.workspace = true comfy-table.workspace = true -git-version.workspace = true humantime.workspace = true nix.workspace = true once_cell.workspace = true diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index f199b155540f..7d284a6fc567 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -19,6 +19,7 @@ bincode.workspace = true bytes.workspace = true camino.workspace = true chrono.workspace = true +git-version.workspace = true hex = { workspace = true, features = ["serde"] } humantime.workspace = true hyper = { workspace = true, features = ["full"] } diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 03fb36caf8b6..aacc1e1dd5e8 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -92,6 +92,10 @@ pub mod toml_edit_ext; pub mod circuit_breaker; +// Re-export used in macro. Avoids adding git-version as dep in target crates. +#[doc(hidden)] +pub use git_version; + /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages /// /// we have several cases: @@ -131,7 +135,7 @@ macro_rules! project_git_version { ($const_identifier:ident) => { // this should try GIT_VERSION first only then git_version::git_version! const $const_identifier: &::core::primitive::str = { - const __COMMIT_FROM_GIT: &::core::primitive::str = git_version::git_version! { + const __COMMIT_FROM_GIT: &::core::primitive::str = $crate::git_version::git_version! { prefix = "", fallback = "unknown", args = ["--abbrev=40", "--always", "--dirty=-modified"] // always use full sha diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 0eb48d6823b8..f1fc3a86fe4b 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -27,7 +27,6 @@ crc32c.workspace = true either.workspace = true fail.workspace = true futures.workspace = true -git-version.workspace = true hex.workspace = true humantime.workspace = true humantime-serde.workspace = true diff --git a/pageserver/compaction/Cargo.toml b/pageserver/compaction/Cargo.toml index 52b58fc298ce..d4f89ac38a19 100644 --- a/pageserver/compaction/Cargo.toml +++ b/pageserver/compaction/Cargo.toml @@ -12,7 +12,6 @@ anyhow.workspace = true async-stream.workspace = true clap = { workspace = true, features = ["string"] } futures.workspace = true -git-version.workspace = true itertools.workspace = true once_cell.workspace = true pageserver_api.workspace = true diff --git a/pageserver/ctl/Cargo.toml b/pageserver/ctl/Cargo.toml index 9592002de131..a753f806a076 100644 --- a/pageserver/ctl/Cargo.toml +++ b/pageserver/ctl/Cargo.toml @@ -10,7 +10,6 @@ license.workspace = true anyhow.workspace = true camino.workspace = true clap = { workspace = true, features = ["string"] } -git-version.workspace = true humantime.workspace = true pageserver = { path = ".." 
} pageserver_api.workspace = true diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 6703eb06eb29..501ce050e071 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -29,7 +29,6 @@ dashmap.workspace = true env_logger.workspace = true framed-websockets.workspace = true futures.workspace = true -git-version.workspace = true hashbrown.workspace = true hashlink.workspace = true hex.workspace = true diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index daf21c70b045..67f32b3cc08b 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -21,7 +21,6 @@ chrono.workspace = true clap = { workspace = true, features = ["derive"] } crc32c.workspace = true fail.workspace = true -git-version.workspace = true hex.workspace = true humantime.workspace = true hyper.workspace = true diff --git a/storage_broker/Cargo.toml b/storage_broker/Cargo.toml index 82ec0aa272e3..5359f586e49d 100644 --- a/storage_broker/Cargo.toml +++ b/storage_broker/Cargo.toml @@ -15,7 +15,6 @@ const_format.workspace = true futures.workspace = true futures-core.workspace = true futures-util.workspace = true -git-version.workspace = true humantime.workspace = true hyper = { workspace = true, features = ["full"] } once_cell.workspace = true diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index a96d64e09670..9ed0501026dc 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -20,7 +20,6 @@ chrono.workspace = true clap.workspace = true fail.workspace = true futures.workspace = true -git-version.workspace = true hex.workspace = true hyper.workspace = true humantime.workspace = true diff --git a/storage_scrubber/Cargo.toml b/storage_scrubber/Cargo.toml index f9987662b9f5..a1b5b0b12f19 100644 --- a/storage_scrubber/Cargo.toml +++ b/storage_scrubber/Cargo.toml @@ -8,7 +8,6 @@ aws-sdk-s3.workspace = true either.workspace = true anyhow.workspace = true -git-version.workspace = true hex.workspace = true humantime.workspace = true serde.workspace = true From 2cf47b1477d281a868deab4914aceb53e37a22e9 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 25 Sep 2024 14:31:04 +0100 Subject: [PATCH 64/77] storcon: do az aware scheduling (#9083) ## Problem Storage controller didn't previously consider AZ locality between compute and pageservers when scheduling nodes. Control plane has this feature, and, since we are migrating tenants away from it, we need feature parity to avoid perf degradations. ## Summary of changes The change itself is fairly simple: 1. Thread az info into the scheduler 2. Add an extra member to the scheduling scores Step (2) deserves some more discussion. Let's break it down by the shard type being scheduled: **Attached Shards** We wish for attached shards of a tenant to end up in the preferred AZ of the tenant since that is where the compute is likely to be. The AZ member for `NodeAttachmentSchedulingScore` has been placed below the affinity score (so it's got the second biggest weight for picking the node). The rationale for going below the affinity score is to avoid having all shards of a single tenant placed on the same node in 2-node regions, since that would mean that one tenant can drive the general workload of an entire pageserver. I'm not 100% sure this is the right decision, so open to discussing hoisting the AZ up to first place. **Secondary Shards** We wish for secondary shards of a tenant to be scheduled in a different AZ from the preferred one for HA purposes. 
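Both placements rely on the same mechanism: the score structs derive `Ord`, and Rust's derived ordering is lexicographic over the fields in declaration order, so whichever member is declared first dominates the comparison. A toy sketch (hypothetical names, not the real score structs):

```rust
// Derived `Ord` compares `az_rank` first and only falls back to `affinity`
// on ties, mirroring how field position sets the weight of each criterion.
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug)]
struct ToyScore {
    az_rank: u8,   // 0 = node is in the desired AZ relationship
    affinity: u32, // shards of this tenant already on the node
}

fn main() {
    let good_az = ToyScore { az_rank: 0, affinity: 10 };
    let bad_az = ToyScore { az_rank: 1, affinity: 0 };
    // Lower scores are preferred; the AZ field wins despite the higher affinity.
    assert!(good_az < bad_az);
}
```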
The AZ member for `NodeSecondarySchedulingScore` has been placed first, so nodes in different AZs from the preferred one will always be considered first. On small clusters, this can mean that all the secondaries of a tenant are scheduled to the same pageserver, but secondaries don't use up as many resources as the attached location, so IMO the argument made for attached shards doesn't hold. Related: https://github.com/neondatabase/neon/issues/8848 --- control_plane/storcon_cli/src/main.rs | 6 +- libs/pageserver_api/src/controller_api.rs | 14 +- pageserver/src/control_plane_client.rs | 6 +- storage_controller/src/node.rs | 16 +- storage_controller/src/persistence.rs | 7 +- storage_controller/src/scheduler.rs | 230 ++++++++++++++++++++-- storage_controller/src/service.rs | 16 +- storage_controller/src/tenant_shard.rs | 215 +++++++++++++++++--- 8 files changed, 445 insertions(+), 65 deletions(-) diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 651fcda8db52..73d89699edb6 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -4,8 +4,8 @@ use std::{str::FromStr, time::Duration}; use clap::{Parser, Subcommand}; use pageserver_api::{ controller_api::{ - NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse, ShardSchedulingPolicy, - TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest, + AvailabilityZone, NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse, + ShardSchedulingPolicy, TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest, }, models::{ EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary, @@ -339,7 +339,7 @@ async fn main() -> anyhow::Result<()> { listen_pg_port, listen_http_addr, listen_http_port, - availability_zone_id, + availability_zone_id: AvailabilityZone(availability_zone_id), }), ) .await?; diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index 40b7dbbbc2af..0ea30ce54f78 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -1,4 +1,5 @@ use std::collections::{HashMap, HashSet}; +use std::fmt::Display; use std::str::FromStr; use std::time::{Duration, Instant}; @@ -57,7 +58,7 @@ pub struct NodeRegisterRequest { pub listen_http_addr: String, pub listen_http_port: u16, - pub availability_zone_id: String, + pub availability_zone_id: AvailabilityZone, } #[derive(Serialize, Deserialize)] @@ -74,10 +75,19 @@ pub struct TenantPolicyRequest { pub scheduling: Option<ShardSchedulingPolicy>, } +#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Hash)] +pub struct AvailabilityZone(pub String); + +impl Display for AvailabilityZone { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0) + } +} + #[derive(Serialize, Deserialize)] pub struct ShardsPreferredAzsRequest { #[serde(flatten)] - pub preferred_az_ids: HashMap<TenantShardId, String>, + pub preferred_az_ids: HashMap<TenantShardId, AvailabilityZone>, } #[derive(Serialize, Deserialize)] diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs index f6d1c35a8ce1..d0a967b9207e 100644 --- a/pageserver/src/control_plane_client.rs +++ b/pageserver/src/control_plane_client.rs @@ -2,7 +2,7 @@ use std::collections::HashMap; use futures::Future; use pageserver_api::{ - controller_api::NodeRegisterRequest, + controller_api::{AvailabilityZone, NodeRegisterRequest}, shard::TenantShardId, upcall_api::{ ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, @@ -148,10 
+148,10 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient { .and_then(|jv| jv.as_str().map(|str| str.to_owned())); match az_id_from_metadata { - Some(az_id) => Some(az_id), + Some(az_id) => Some(AvailabilityZone(az_id)), None => { tracing::warn!("metadata.json does not contain an 'availability_zone_id' field"); - conf.availability_zone.clone() + conf.availability_zone.clone().map(AvailabilityZone) } } }; diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs index cb9ce10d230a..4cc9b0070dc7 100644 --- a/storage_controller/src/node.rs +++ b/storage_controller/src/node.rs @@ -2,8 +2,8 @@ use std::{str::FromStr, time::Duration}; use pageserver_api::{ controller_api::{ - NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, NodeSchedulingPolicy, - TenantLocateResponseShard, + AvailabilityZone, NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, + NodeSchedulingPolicy, TenantLocateResponseShard, }, shard::TenantShardId, }; @@ -36,7 +36,7 @@ pub(crate) struct Node { listen_pg_addr: String, listen_pg_port: u16, - availability_zone_id: String, + availability_zone_id: AvailabilityZone, // This cancellation token means "stop any RPCs in flight to this node, and don't start // any more". It is not related to process shutdown. @@ -64,8 +64,8 @@ impl Node { } #[allow(unused)] - pub(crate) fn get_availability_zone_id(&self) -> &str { - self.availability_zone_id.as_str() + pub(crate) fn get_availability_zone_id(&self) -> &AvailabilityZone { + &self.availability_zone_id } pub(crate) fn get_scheduling(&self) -> NodeSchedulingPolicy { @@ -181,7 +181,7 @@ impl Node { listen_http_port: u16, listen_pg_addr: String, listen_pg_port: u16, - availability_zone_id: String, + availability_zone_id: AvailabilityZone, ) -> Self { Self { id, @@ -204,7 +204,7 @@ impl Node { listen_http_port: self.listen_http_port as i32, listen_pg_addr: self.listen_pg_addr.clone(), listen_pg_port: self.listen_pg_port as i32, - availability_zone_id: self.availability_zone_id.clone(), + availability_zone_id: self.availability_zone_id.0.clone(), } } @@ -219,7 +219,7 @@ impl Node { listen_http_port: np.listen_http_port as u16, listen_pg_addr: np.listen_pg_addr, listen_pg_port: np.listen_pg_port as u16, - availability_zone_id: np.availability_zone_id, + availability_zone_id: AvailabilityZone(np.availability_zone_id), cancel: CancellationToken::new(), } } diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 1dc1040d9637..14cc51240d10 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -9,6 +9,7 @@ use diesel::pg::PgConnection; use diesel::prelude::*; use diesel::Connection; use itertools::Itertools; +use pageserver_api::controller_api::AvailabilityZone; use pageserver_api::controller_api::MetadataHealthRecord; use pageserver_api::controller_api::ShardSchedulingPolicy; use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy}; @@ -667,8 +668,8 @@ impl Persistence { pub(crate) async fn set_tenant_shard_preferred_azs( &self, - preferred_azs: Vec<(TenantShardId, String)>, - ) -> DatabaseResult<Vec<(TenantShardId, String)>> { + preferred_azs: Vec<(TenantShardId, AvailabilityZone)>, + ) -> DatabaseResult<Vec<(TenantShardId, AvailabilityZone)>> { use crate::schema::tenant_shards::dsl::*; self.with_measured_conn(DatabaseOperation::SetPreferredAzs, move |conn| { @@ -679,7 +680,7 @@ impl Persistence { .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) 
.filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) - .set(preferred_az_id.eq(preferred_az)) + .set(preferred_az_id.eq(preferred_az.0.clone())) .execute(conn)?; if updated == 1 { diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index 1cb1fb104d60..2414d95eb89b 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -1,6 +1,6 @@ use crate::{node::Node, tenant_shard::TenantShard}; use itertools::Itertools; -use pageserver_api::models::PageserverUtilization; +use pageserver_api::{controller_api::AvailabilityZone, models::PageserverUtilization}; use serde::Serialize; use std::{collections::HashMap, fmt::Debug}; use utils::{http::error::ApiError, id::NodeId}; @@ -32,6 +32,8 @@ pub(crate) struct SchedulerNode { shard_count: usize, /// How many shards are currently attached on this node, via their [`crate::tenant_shard::IntentState`]. attached_shard_count: usize, + /// Availability zone id in which the node resides + az: AvailabilityZone, /// Whether this node is currently elegible to have new shards scheduled (this is derived /// from a node's availability state and scheduling policy). @@ -42,6 +44,7 @@ pub(crate) trait NodeSchedulingScore: Debug + Ord + Copy + Sized { fn generate( node_id: &NodeId, node: &mut SchedulerNode, + preferred_az: &Option, context: &ScheduleContext, ) -> Option; fn is_overloaded(&self) -> bool; @@ -62,6 +65,72 @@ impl ShardTag for SecondaryShardTag { type Score = NodeSecondarySchedulingScore; } +#[derive(PartialEq, Eq, Debug, Clone, Copy)] +enum AzMatch { + Yes, + No, + Unknown, +} + +impl AzMatch { + fn new(node_az: &AvailabilityZone, shard_preferred_az: Option<&AvailabilityZone>) -> Self { + match shard_preferred_az { + Some(preferred_az) if preferred_az == node_az => Self::Yes, + Some(_preferred_az) => Self::No, + None => Self::Unknown, + } + } +} + +#[derive(PartialEq, Eq, Debug, Clone, Copy)] +struct AttachmentAzMatch(AzMatch); + +impl Ord for AttachmentAzMatch { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + // Lower scores indicate a more suitable node. + // Note that we prefer a node for which we don't have + // info to a node which we are certain doesn't match the + // preferred AZ of the shard. + let az_match_score = |az_match: &AzMatch| match az_match { + AzMatch::Yes => 0, + AzMatch::Unknown => 1, + AzMatch::No => 2, + }; + + az_match_score(&self.0).cmp(&az_match_score(&other.0)) + } +} + +impl PartialOrd for AttachmentAzMatch { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +#[derive(PartialEq, Eq, Debug, Clone, Copy)] +struct SecondaryAzMatch(AzMatch); + +impl Ord for SecondaryAzMatch { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + // Lower scores indicate a more suitable node. + // For secondary locations we wish to avoid the preferred AZ + // of the shard. + let az_match_score = |az_match: &AzMatch| match az_match { + AzMatch::No => 0, + AzMatch::Unknown => 1, + AzMatch::Yes => 2, + }; + + az_match_score(&self.0).cmp(&az_match_score(&other.0)) + } +} + +impl PartialOrd for SecondaryAzMatch { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + /// Scheduling score of a given node for shard attachments. /// Lower scores indicate more suitable nodes. /// Ordering is given by member declaration order (top to bottom). 
@@ -70,6 +139,10 @@ pub(crate) struct NodeAttachmentSchedulingScore { /// The number of shards belonging to the tenant currently being /// scheduled that are attached to this node. affinity_score: AffinityScore, + /// Flag indicating whether this node matches the preferred AZ + /// of the shard. For equal affinity scores, nodes in the matching AZ + /// are considered first. + az_match: AttachmentAzMatch, /// Size of [`ScheduleContext::attached_nodes`] for the current node. /// This normally tracks the number of attached shards belonging to the /// tenant being scheduled that are already on this node. @@ -87,6 +160,7 @@ impl NodeSchedulingScore for NodeAttachmentSchedulingScore { fn generate( node_id: &NodeId, node: &mut SchedulerNode, + preferred_az: &Option<AvailabilityZone>, context: &ScheduleContext, ) -> Option<Self> { let utilization = match &mut node.may_schedule { @@ -102,6 +176,7 @@ impl NodeSchedulingScore for NodeAttachmentSchedulingScore { .get(node_id) .copied() .unwrap_or(AffinityScore::FREE), + az_match: AttachmentAzMatch(AzMatch::new(&node.az, preferred_az.as_ref())), attached_shards_in_context: context.attached_nodes.get(node_id).copied().unwrap_or(0), utilization_score: utilization.cached_score(), total_attached_shard_count: node.attached_shard_count, @@ -123,6 +198,11 @@ impl NodeSchedulingScore for NodeAttachmentSchedulingScore { /// Ordering is given by member declaration order (top to bottom). #[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)] pub(crate) struct NodeSecondarySchedulingScore { + /// Flag indicating whether this node matches the preferred AZ + /// of the shard. For secondary locations we wish to avoid nodes in + /// the preferred AZ of the shard, since that's where the attached location + /// should be scheduled and having the secondary in the same AZ is bad for HA. + az_match: SecondaryAzMatch, /// The number of shards belonging to the tenant currently being 
affinity_score: AffinityScore, @@ -139,6 +219,7 @@ impl NodeSchedulingScore for NodeSecondarySchedulingScore { fn generate( node_id: &NodeId, node: &mut SchedulerNode, + preferred_az: &Option<AvailabilityZone>, context: &ScheduleContext, ) -> Option<Self> { let utilization = match &mut node.may_schedule { @@ -149,6 +230,7 @@ }; Some(Self { + az_match: SecondaryAzMatch(AzMatch::new(&node.az, preferred_az.as_ref())), affinity_score: context .nodes .get(node_id) @@ -179,6 +261,7 @@ impl PartialEq for SchedulerNode { may_schedule_matches && self.shard_count == other.shard_count && self.attached_shard_count == other.attached_shard_count + && self.az == other.az } } @@ -293,6 +376,7 @@ impl Scheduler { shard_count: 0, attached_shard_count: 0, may_schedule: node.may_schedule(), + az: node.get_availability_zone_id().clone(), }, ); } @@ -319,6 +403,7 @@ impl Scheduler { shard_count: 0, attached_shard_count: 0, may_schedule: node.may_schedule(), + az: node.get_availability_zone_id().clone(), }, ); } @@ -497,6 +582,7 @@ impl Scheduler { shard_count: 0, attached_shard_count: 0, may_schedule: node.may_schedule(), + az: node.get_availability_zone_id().clone(), }); } } @@ -542,6 +628,7 @@ fn compute_node_scores<Score>( &mut self, hard_exclude: &[NodeId], + preferred_az: &Option<AvailabilityZone>, context: &ScheduleContext, ) -> Vec<Score> where { self.nodes .iter_mut() .filter_map(|(k, v)| { if hard_exclude.contains(k) { None } else { - Score::generate(k, v, context) + Score::generate(k, v, preferred_az, context) } }) .collect() } @@ -571,13 +658,15 @@ pub(crate) fn schedule_shard<Tag: ShardTag>( &mut self, hard_exclude: &[NodeId], + preferred_az: &Option<AvailabilityZone>, context: &ScheduleContext, ) -> Result<NodeId, ScheduleError> { if self.nodes.is_empty() { return Err(ScheduleError::NoPageservers); } - let mut scores = self.compute_node_scores::<Tag::Score>(hard_exclude, context); + let mut scores = + self.compute_node_scores::<Tag::Score>(hard_exclude, preferred_az, context); // Exclude nodes whose utilization is critically high, if there are alternatives available. This will // cause us to violate affinity rules if it is necessary to avoid critically overloading nodes: for example @@ -634,6 +723,12 @@ Ok(node_id) } + /// Selects any available node. This is suitable for performing background work (e.g. S3 + /// deletions). + pub(crate) fn any_available_node(&mut self) -> Result<NodeId, ScheduleError> { + self.schedule_shard::<AttachedShardTag>(&[], &None, &ScheduleContext::default()) + } + /// Unit test access to internal state #[cfg(test)] pub(crate) fn get_node_shard_count(&self, node_id: NodeId) -> usize { @@ -650,13 +745,22 @@ pub(crate) mod test_utils { use crate::node::Node; - use pageserver_api::{controller_api::NodeAvailability, models::utilization::test_utilization}; + use pageserver_api::{ + controller_api::{AvailabilityZone, NodeAvailability}, + models::utilization::test_utilization, + }; use std::collections::HashMap; use utils::id::NodeId; + /// Test helper: synthesize the requested number of nodes, all in active state. /// /// Node IDs start at one. - pub(crate) fn make_test_nodes(n: u64) -> HashMap<NodeId, Node> { + /// + /// The `azs` argument specifies the list of availability zones which will be assigned + /// to nodes in round-robin fashion. If empty, a default AZ is assigned. 
+ pub(crate) fn make_test_nodes(n: u64, azs: &[AvailabilityZone]) -> HashMap<NodeId, Node> { + let mut az_iter = azs.iter().cycle(); + (1..n + 1) .map(|i| { (NodeId(i), { @@ -666,7 +770,10 @@ pub(crate) mod test_utils { 80 + i as u16, format!("pghost-{i}"), 5432 + i as u16, - "test-az".to_string(), + az_iter + .next() + .cloned() + .unwrap_or(AvailabilityZone("test-az".to_string())), ); node.set_availability(NodeAvailability::Active(test_utilization::simple(0, 0))); assert!(node.is_available()); @@ -686,7 +793,7 @@ mod tests { use crate::tenant_shard::IntentState; #[test] fn scheduler_basic() -> anyhow::Result<()> { - let nodes = test_utils::make_test_nodes(2); + let nodes = test_utils::make_test_nodes(2, &[]); let mut scheduler = Scheduler::new(nodes.values()); let mut t1_intent = IntentState::new(); @@ -694,9 +801,9 @@ let context = ScheduleContext::default(); - let scheduled = scheduler.schedule_shard::<AttachedShardTag>(&[], &context)?; + let scheduled = scheduler.schedule_shard::<AttachedShardTag>(&[], &None, &context)?; t1_intent.set_attached(&mut scheduler, Some(scheduled)); - let scheduled = scheduler.schedule_shard::<AttachedShardTag>(&[], &context)?; + let scheduled = scheduler.schedule_shard::<AttachedShardTag>(&[], &None, &context)?; t2_intent.set_attached(&mut scheduler, Some(scheduled)); assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 1); @@ -705,8 +812,11 @@ assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 1); assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 1); - let scheduled = - scheduler.schedule_shard::<SecondaryShardTag>(&t1_intent.all_pageservers(), &context)?; + let scheduled = scheduler.schedule_shard::<SecondaryShardTag>( + &t1_intent.all_pageservers(), + &None, + &context, + )?; t1_intent.push_secondary(&mut scheduler, scheduled); assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 1); @@ -746,7 +856,7 @@ #[test] /// Test the PageserverUtilization's contribution to scheduling algorithm fn scheduler_utilization() { - let mut nodes = test_utils::make_test_nodes(3); + let mut nodes = test_utils::make_test_nodes(3, &[]); let mut scheduler = Scheduler::new(nodes.values()); // Need to keep these alive because they contribute to shard counts via RAII @@ -761,7 +871,7 @@ context: &ScheduleContext, ) { let scheduled = scheduler - .schedule_shard::<AttachedShardTag>(&[], context) + .schedule_shard::<AttachedShardTag>(&[], &None, context) .unwrap(); let mut intent = IntentState::new(); intent.set_attached(scheduler, Some(scheduled)); @@ -870,4 +980,98 @@ intent.clear(&mut scheduler); } } + + #[test] + /// A simple test that showcases AZ-aware scheduling and its interaction with + /// affinity scores. 
+ fn az_scheduling() { + let az_a_tag = AvailabilityZone("az-a".to_string()); + let az_b_tag = AvailabilityZone("az-b".to_string()); + + let nodes = test_utils::make_test_nodes(3, &[az_a_tag.clone(), az_b_tag.clone()]); + let mut scheduler = Scheduler::new(nodes.values()); + + // Need to keep these alive because they contribute to shard counts via RAII + let mut scheduled_intents = Vec::new(); + + let mut context = ScheduleContext::default(); + + fn assert_scheduler_chooses<Tag: ShardTag>( + expect_node: NodeId, + preferred_az: Option<AvailabilityZone>, + scheduled_intents: &mut Vec<IntentState>, + scheduler: &mut Scheduler, + context: &mut ScheduleContext, + ) { + let scheduled = scheduler + .schedule_shard::<Tag>(&[], &preferred_az, context) + .unwrap(); + let mut intent = IntentState::new(); + intent.set_attached(scheduler, Some(scheduled)); + scheduled_intents.push(intent); + assert_eq!(scheduled, expect_node); + + context.avoid(&[scheduled]); + } + + assert_scheduler_chooses::<AttachedShardTag>( + NodeId(1), + Some(az_a_tag.clone()), + &mut scheduled_intents, + &mut scheduler, + &mut context, + ); + + // Node 2 and 3 have affinity score equal to 0, but node 3 + // is in "az-a" so we prefer that. + assert_scheduler_chooses::<AttachedShardTag>( + NodeId(3), + Some(az_a_tag.clone()), + &mut scheduled_intents, + &mut scheduler, + &mut context, + ); + + // Node 2 is not in "az-a", but it has the lowest affinity so we prefer that. + assert_scheduler_chooses::<AttachedShardTag>( + NodeId(2), + Some(az_a_tag.clone()), + &mut scheduled_intents, + &mut scheduler, + &mut context, + ); + + // Avoid nodes in "az-a" for the secondary location. + assert_scheduler_chooses::<SecondaryShardTag>( + NodeId(2), + Some(az_a_tag.clone()), + &mut scheduled_intents, + &mut scheduler, + &mut context, + ); + + // Avoid nodes in "az-b" for the secondary location. + // Nodes 1 and 3 are identically loaded, so prefer the lowest node id. + assert_scheduler_chooses::<SecondaryShardTag>( + NodeId(1), + Some(az_b_tag.clone()), + &mut scheduled_intents, + &mut scheduler, + &mut context, + ); + + // Avoid nodes in "az-b" for the secondary location. + // Node 3 has lower affinity score than 1, so prefer that. + assert_scheduler_chooses::<SecondaryShardTag>( + NodeId(3), + Some(az_b_tag.clone()), + &mut scheduled_intents, + &mut scheduler, + &mut context, + ); + + for mut intent in scheduled_intents { + intent.clear(&mut scheduler); + } + } } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 5555505b81d9..6a11e9650ce9 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -26,7 +26,7 @@ use crate::{ ShardGenerationState, TenantFilter, }, reconciler::{ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder}, - scheduler::{AttachedShardTag, MaySchedule, ScheduleContext, ScheduleError, ScheduleMode}, + scheduler::{MaySchedule, ScheduleContext, ScheduleError, ScheduleMode}, tenant_shard::{ MigrateAttachment, ReconcileNeeded, ReconcilerStatus, ScheduleOptimization, ScheduleOptimizationAction, @@ -1265,6 +1265,8 @@ impl Service { #[cfg(feature = "testing")] { + use pageserver_api::controller_api::AvailabilityZone; + // Hack: insert scheduler state for all nodes referenced by shards, as compatibility // tests only store the shards, not the nodes. The nodes will be loaded shortly // after when pageservers start up and register. 
@@ -1282,7 +1284,7 @@ impl Service { 123, "".to_string(), 123, - "test_az".to_string(), + AvailabilityZone("test_az".to_string()), ); scheduler.node_upsert(&node); @@ -2099,7 +2101,7 @@ impl Service { let az_id = locked .nodes .get(&resp.node_id) - .map(|n| n.get_availability_zone_id().to_string())?; + .map(|n| n.get_availability_zone_id().clone())?; Some((resp.shard_id, az_id)) }) @@ -2629,8 +2631,7 @@ impl Service { let scheduler = &mut locked.scheduler; // Right now we only perform the operation on a single node without parallelization // TODO fan out the operation to multiple nodes for better performance - let node_id = - scheduler.schedule_shard::(&[], &ScheduleContext::default())?; + let node_id = scheduler.any_available_node()?; let node = locked .nodes .get(&node_id) @@ -2816,8 +2817,7 @@ impl Service { // Pick an arbitrary node to use for remote deletions (does not have to be where the tenant // was attached, just has to be able to see the S3 content) - let node_id = - scheduler.schedule_shard::(&[], &ScheduleContext::default())?; + let node_id = scheduler.any_available_node()?; let node = nodes .get(&node_id) .expect("Pageservers may not be deleted while lock is active"); @@ -4481,7 +4481,7 @@ impl Service { let az_id = locked .nodes .get(node_id) - .map(|n| n.get_availability_zone_id().to_string())?; + .map(|n| n.get_availability_zone_id().clone())?; Some((*tid, az_id)) }) diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index eccde0e3ab60..afc89eae0073 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -15,7 +15,7 @@ use crate::{ service::ReconcileResultRequest, }; use pageserver_api::controller_api::{ - NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy, + AvailabilityZone, NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy, }; use pageserver_api::{ models::{LocationConfig, LocationConfigMode, TenantConfig}, @@ -146,7 +146,7 @@ pub(crate) struct TenantShard { // We should attempt to schedule this shard in the provided AZ to // decrease chances of cross-AZ compute. 
- preferred_az_id: Option, + preferred_az_id: Option, } #[derive(Default, Clone, Debug, Serialize)] @@ -540,8 +540,11 @@ impl TenantShard { Ok((true, promote_secondary)) } else { // Pick a fresh node: either we had no secondaries or none were schedulable - let node_id = - scheduler.schedule_shard::(&self.intent.secondary, context)?; + let node_id = scheduler.schedule_shard::( + &self.intent.secondary, + &self.preferred_az_id, + context, + )?; tracing::debug!("Selected {} as attached", node_id); self.intent.set_attached(scheduler, Some(node_id)); Ok((true, node_id)) @@ -622,8 +625,11 @@ impl TenantShard { let mut used_pageservers = vec![attached_node_id]; while self.intent.secondary.len() < secondary_count { - let node_id = scheduler - .schedule_shard::(&used_pageservers, context)?; + let node_id = scheduler.schedule_shard::( + &used_pageservers, + &self.preferred_az_id, + context, + )?; self.intent.push_secondary(scheduler, node_id); used_pageservers.push(node_id); modified = true; @@ -636,7 +642,11 @@ impl TenantShard { modified = true; } else if self.intent.secondary.is_empty() { // Populate secondary by scheduling a fresh node - let node_id = scheduler.schedule_shard::(&[], context)?; + let node_id = scheduler.schedule_shard::( + &[], + &self.preferred_az_id, + context, + )?; self.intent.push_secondary(scheduler, node_id); modified = true; } @@ -815,6 +825,7 @@ impl TenantShard { // with lower utilization. let Ok(candidate_node) = scheduler.schedule_shard::( &self.intent.all_pageservers(), + &self.preferred_az_id, schedule_context, ) else { // A scheduling error means we have no possible candidate replacements @@ -1313,7 +1324,7 @@ impl TenantShard { pending_compute_notification: false, delayed_reconcile: false, scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(), - preferred_az_id: tsp.preferred_az_id, + preferred_az_id: tsp.preferred_az_id.map(AvailabilityZone), }) } @@ -1329,15 +1340,15 @@ impl TenantShard { config: serde_json::to_string(&self.config).unwrap(), splitting: SplitState::default(), scheduling_policy: serde_json::to_string(&self.scheduling_policy).unwrap(), - preferred_az_id: self.preferred_az_id.clone(), + preferred_az_id: self.preferred_az_id.as_ref().map(|az| az.0.clone()), } } - pub(crate) fn preferred_az(&self) -> Option<&str> { - self.preferred_az_id.as_deref() + pub(crate) fn preferred_az(&self) -> Option<&AvailabilityZone> { + self.preferred_az_id.as_ref() } - pub(crate) fn set_preferred_az(&mut self, preferred_az_id: String) { + pub(crate) fn set_preferred_az(&mut self, preferred_az_id: AvailabilityZone) { self.preferred_az_id = Some(preferred_az_id); } } @@ -1350,6 +1361,7 @@ pub(crate) mod tests { controller_api::NodeAvailability, shard::{ShardCount, ShardNumber}, }; + use rand::{rngs::StdRng, SeedableRng}; use utils::id::TenantId; use crate::scheduler::test_utils::make_test_nodes; @@ -1378,7 +1390,11 @@ pub(crate) mod tests { ) } - fn make_test_tenant(policy: PlacementPolicy, shard_count: ShardCount) -> Vec { + fn make_test_tenant( + policy: PlacementPolicy, + shard_count: ShardCount, + preferred_az: Option, + ) -> Vec { let tenant_id = TenantId::generate(); (0..shard_count.count()) @@ -1390,7 +1406,7 @@ pub(crate) mod tests { shard_number, shard_count, }; - TenantShard::new( + let mut ts = TenantShard::new( tenant_shard_id, ShardIdentity::new( shard_number, @@ -1399,7 +1415,13 @@ pub(crate) mod tests { ) .unwrap(), policy.clone(), - ) + ); + + if let Some(az) = &preferred_az { + ts.set_preferred_az(az.clone()); + } + + ts }) .collect() } 
@@ -1410,7 +1432,7 @@ pub(crate) mod tests { fn tenant_ha_scheduling() -> anyhow::Result<()> { // Start with three nodes. Our tenant will only use two. The third one is // expected to remain unused. - let mut nodes = make_test_nodes(3); + let mut nodes = make_test_nodes(3, &[]); let mut scheduler = Scheduler::new(nodes.values()); let mut context = ScheduleContext::default(); @@ -1462,7 +1484,7 @@ pub(crate) mod tests { #[test] fn intent_from_observed() -> anyhow::Result<()> { - let nodes = make_test_nodes(3); + let nodes = make_test_nodes(3, &[]); let mut scheduler = Scheduler::new(nodes.values()); let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1)); @@ -1512,7 +1534,7 @@ pub(crate) mod tests { #[test] fn scheduling_mode() -> anyhow::Result<()> { - let nodes = make_test_nodes(3); + let nodes = make_test_nodes(3, &[]); let mut scheduler = Scheduler::new(nodes.values()); let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1)); @@ -1537,7 +1559,7 @@ pub(crate) mod tests { #[test] fn optimize_attachment() -> anyhow::Result<()> { - let nodes = make_test_nodes(3); + let nodes = make_test_nodes(3, &[]); let mut scheduler = Scheduler::new(nodes.values()); let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1)); @@ -1604,7 +1626,7 @@ pub(crate) mod tests { #[test] fn optimize_secondary() -> anyhow::Result<()> { - let nodes = make_test_nodes(4); + let nodes = make_test_nodes(4, &[]); let mut scheduler = Scheduler::new(nodes.values()); let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1)); @@ -1703,14 +1725,14 @@ pub(crate) mod tests { /// that it converges. #[test] fn optimize_add_nodes() -> anyhow::Result<()> { - let nodes = make_test_nodes(4); + let nodes = make_test_nodes(4, &[]); // Only show the scheduler a couple of nodes let mut scheduler = Scheduler::new([].iter()); scheduler.node_upsert(nodes.get(&NodeId(1)).unwrap()); scheduler.node_upsert(nodes.get(&NodeId(2)).unwrap()); - let mut shards = make_test_tenant(PlacementPolicy::Attached(1), ShardCount::new(4)); + let mut shards = make_test_tenant(PlacementPolicy::Attached(1), ShardCount::new(4), None); let mut schedule_context = ScheduleContext::default(); for shard in &mut shards { assert!(shard @@ -1759,16 +1781,16 @@ pub(crate) mod tests { fn initial_scheduling_is_optimal() -> anyhow::Result<()> { use itertools::Itertools; - let nodes = make_test_nodes(2); + let nodes = make_test_nodes(2, &[]); let mut scheduler = Scheduler::new([].iter()); scheduler.node_upsert(nodes.get(&NodeId(1)).unwrap()); scheduler.node_upsert(nodes.get(&NodeId(2)).unwrap()); - let mut a = make_test_tenant(PlacementPolicy::Attached(1), ShardCount::new(4)); + let mut a = make_test_tenant(PlacementPolicy::Attached(1), ShardCount::new(4), None); let a_context = Rc::new(RefCell::new(ScheduleContext::default())); - let mut b = make_test_tenant(PlacementPolicy::Attached(1), ShardCount::new(4)); + let mut b = make_test_tenant(PlacementPolicy::Attached(1), ShardCount::new(4), None); let b_context = Rc::new(RefCell::new(ScheduleContext::default())); let a_shards_with_context = a.iter_mut().map(|shard| (shard, a_context.clone())); @@ -1793,4 +1815,147 @@ pub(crate) mod tests { Ok(()) } + + #[test] + fn random_az_shard_scheduling() -> anyhow::Result<()> { + use rand::seq::SliceRandom; + + for seed in 0..50 { + eprintln!("Running test with seed {seed}"); + let mut rng = StdRng::seed_from_u64(seed); + + let az_a_tag = AvailabilityZone("az-a".to_string()); + let az_b_tag = 
AvailabilityZone("az-b".to_string());
+        let azs = [az_a_tag, az_b_tag];
+        let nodes = make_test_nodes(4, &azs);
+        let mut shards_per_az: HashMap<AvailabilityZone, u32> = HashMap::new();
+
+        let mut scheduler = Scheduler::new([].iter());
+        for node in nodes.values() {
+            scheduler.node_upsert(node);
+        }
+
+        let mut shards = Vec::default();
+        let mut contexts = Vec::default();
+        let mut az_picker = azs.iter().cycle().cloned();
+        for i in 0..100 {
+            let az = az_picker.next().unwrap();
+            let shard_count = i % 4 + 1;
+            *shards_per_az.entry(az.clone()).or_default() += shard_count;
+
+            let tenant_shards = make_test_tenant(
+                PlacementPolicy::Attached(1),
+                ShardCount::new(shard_count.try_into().unwrap()),
+                Some(az),
+            );
+            let context = Rc::new(RefCell::new(ScheduleContext::default()));
+
+            contexts.push(context.clone());
+            let with_ctx = tenant_shards
+                .into_iter()
+                .map(|shard| (shard, context.clone()));
+            for shard_with_ctx in with_ctx {
+                shards.push(shard_with_ctx);
+            }
+        }
+
+        shards.shuffle(&mut rng);
+
+        #[derive(Default, Debug)]
+        struct NodeStats {
+            attachments: u32,
+            secondaries: u32,
+        }
+
+        let mut node_stats: HashMap<NodeId, NodeStats> = HashMap::default();
+        let mut attachments_in_wrong_az = 0;
+        let mut secondaries_in_wrong_az = 0;
+
+        for (shard, context) in &mut shards {
+            let context = &mut *context.borrow_mut();
+            shard.schedule(&mut scheduler, context).unwrap();
+
+            let attached_node = shard.intent.get_attached().unwrap();
+            let stats = node_stats.entry(attached_node).or_default();
+            stats.attachments += 1;
+
+            let secondary_node = *shard.intent.get_secondary().first().unwrap();
+            let stats = node_stats.entry(secondary_node).or_default();
+            stats.secondaries += 1;
+
+            let attached_node_az = nodes
+                .get(&attached_node)
+                .unwrap()
+                .get_availability_zone_id();
+            let secondary_node_az = nodes
+                .get(&secondary_node)
+                .unwrap()
+                .get_availability_zone_id();
+            let preferred_az = shard.preferred_az().unwrap();
+
+            if attached_node_az != preferred_az {
+                eprintln!(
+                    "{} attachment was scheduled in AZ {} but preferred AZ {}",
+                    shard.tenant_shard_id, attached_node_az, preferred_az
+                );
+                attachments_in_wrong_az += 1;
+            }
+
+            if secondary_node_az == preferred_az {
+                eprintln!(
+                    "{} secondary was scheduled in AZ {} which matches preference",
+                    shard.tenant_shard_id, secondary_node_az
+                );
+                secondaries_in_wrong_az += 1;
+            }
+        }
+
+        let mut violations = Vec::default();
+
+        if attachments_in_wrong_az > 0 {
+            violations.push(format!(
+                "{} attachments scheduled to the incorrect AZ",
+                attachments_in_wrong_az
+            ));
+        }
+
+        if secondaries_in_wrong_az > 0 {
+            violations.push(format!(
+                "{} secondaries scheduled to the incorrect AZ",
+                secondaries_in_wrong_az
+            ));
+        }
+
+        eprintln!(
+            "attachments_in_wrong_az={} secondaries_in_wrong_az={}",
+            attachments_in_wrong_az, secondaries_in_wrong_az
+        );
+
+        for (node_id, stats) in &node_stats {
+            let node_az = nodes.get(node_id).unwrap().get_availability_zone_id();
+            let ideal_attachment_load = shards_per_az.get(node_az).unwrap() / 2;
+            let allowed_attachment_load =
+                (ideal_attachment_load - 1)..(ideal_attachment_load + 2);
+
+            if !allowed_attachment_load.contains(&stats.attachments) {
+                violations.push(format!(
+                    "Found {} attachments on node {}, but expected {}",
+                    stats.attachments, node_id, ideal_attachment_load
+                ));
+            }
+
+            eprintln!(
+                "{}: attachments={} secondaries={} ideal_attachment_load={}",
+                node_id, stats.attachments, stats.secondaries, ideal_attachment_load
+            );
+        }
+
+        assert!(violations.is_empty(), "{violations:?}");
+
+        for (mut shard, _ctx) in shards {
+            shard.intent.clear(&mut scheduler);
+        }
+    }
+    Ok(())
+}
 }

From 4b711caf5edb808a6bfbe69dc6a1cbe9a7ff70a6 Mon Sep 17 00:00:00 2001
From: John Spray
Date: Wed, 25 Sep 2024 14:56:39 +0100
Subject: [PATCH 65/77] storage controller: make proxying of GETs to
 pageservers more robust (#9065)

## Problem

These commits are split off from
https://github.com/neondatabase/neon/pull/8971/commits where I was
fixing this to make a better scale test pass -- Vlad also independently
recognized these issues with cloudbench in
https://github.com/neondatabase/neon/issues/9062.

1. The storage controller proxies GET requests to pageservers based on
   their intent, not the ground truth of where they're really attached.
2. Proxied requests can race with tenant scheduling, resulting in 404
   responses if the request hits the wrong pageserver.

Closes: https://github.com/neondatabase/neon/issues/9062

## Summary of changes

1. If a shard has a running reconciler, use the database's
   generation_pageserver to decide which node to proxy the request to.
2. If such a request gets a 404 response and its scheduled node has
   changed since the request was dispatched, return a 503 so that the
   client retries against the new location.
---
 storage_controller/src/http.rs                | 23 ++++--
 storage_controller/src/reconciler.rs          |  4 +
 storage_controller/src/service.rs             | 74 ++++++++++++-----
 .../regress/test_storage_controller.py        | 82 +++++++++++++++++++
 4 files changed, 157 insertions(+), 26 deletions(-)

diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs
index 95e4a469ac95..4dd8badd0391 100644
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -515,7 +515,7 @@ async fn handle_tenant_timeline_passthrough(
     tracing::info!("Proxying request for tenant {} ({})", tenant_id, path);

     // Find the node that holds shard zero
-    let (node, tenant_shard_id) = service.tenant_shard0_node(tenant_id)?;
+    let (node, tenant_shard_id) = service.tenant_shard0_node(tenant_id).await?;

     // Callers will always pass an unsharded tenant ID. Before proxying, we must
     // rewrite this to a shard-aware shard zero ID.
@@ -545,10 +545,10 @@ async fn handle_tenant_timeline_passthrough(
     let _timer = latency.start_timer(labels.clone());

     let client = mgmt_api::Client::new(node.base_url(), service.get_config().jwt_token.as_deref());
-    let resp = client.get_raw(path).await.map_err(|_e|
-    // FIXME: give APiError a proper Unavailable variant. We return 503 here because
-    // if we can't successfully send a request to the pageserver, we aren't available.
-    ApiError::ShuttingDown)?;
+    let resp = client.get_raw(path).await.map_err(|e|
+    // We return 503 here because if we can't successfully send a request to the pageserver,
+    // either we aren't available or the pageserver is unavailable.
+    ApiError::ResourceUnavailable(format!("Error sending pageserver API request to {node}: {e}").into()))?;

     if !resp.status().is_success() {
         let error_counter = &METRICS_REGISTRY
@@ -557,6 +557,19 @@ async fn handle_tenant_timeline_passthrough(
         error_counter.inc(labels);
     }

+    // Transform 404 into 503 if we raced with a migration
+    if resp.status() == reqwest::StatusCode::NOT_FOUND {
+        // Look up node again: if we migrated it will be different
+        let (new_node, _tenant_shard_id) = service.tenant_shard0_node(tenant_id).await?;
+        if new_node.get_id() != node.get_id() {
+            // Rather than retry here, send the client a 503 to prompt a retry: this matches
+            // the pageserver's use of 503, and all clients calling this API should retry on 503.
+            return Err(ApiError::ResourceUnavailable(
+                format!("Pageserver {node} returned 404, was migrated to {new_node}").into(),
+            ));
+        }
+    }
+
     // We have a reqwest::Response, would like an http::Response
     let mut builder = hyper::Response::builder().status(map_reqwest_hyper_status(resp.status())?);
     for (k, v) in resp.headers() {
diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs
index 750bcd7c0138..93b1c80566bd 100644
--- a/storage_controller/src/reconciler.rs
+++ b/storage_controller/src/reconciler.rs
@@ -541,6 +541,8 @@ impl Reconciler {
         }
     }

+    pausable_failpoint!("reconciler-live-migrate-pre-generation-inc");
+
     // Increment generation before attaching to new pageserver
     self.generation = Some(
         self.persistence
@@ -617,6 +619,8 @@ impl Reconciler {
         },
     );

+    pausable_failpoint!("reconciler-live-migrate-post-detach");
+
     tracing::info!("🔁 Switching to AttachedSingle mode on node {dest_ps}",);
     let dest_final_conf = build_location_config(
         &self.shard,
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 6a11e9650ce9..a5e012968475 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -3508,34 +3508,66 @@ impl Service {
     /// When you need to send an HTTP request to the pageserver that holds shard0 of a tenant, this
     /// function looks up and returns node. If the tenant isn't found, returns Err(ApiError::NotFound)
-    pub(crate) fn tenant_shard0_node(
+    pub(crate) async fn tenant_shard0_node(
         &self,
         tenant_id: TenantId,
     ) -> Result<(Node, TenantShardId), ApiError> {
-        let locked = self.inner.read().unwrap();
-        let Some((tenant_shard_id, shard)) = locked
-            .tenants
-            .range(TenantShardId::tenant_range(tenant_id))
-            .next()
-        else {
-            return Err(ApiError::NotFound(
-                anyhow::anyhow!("Tenant {tenant_id} not found").into(),
-            ));
+        // Look up in-memory state and maybe use the node from there.
+        {
+            let locked = self.inner.read().unwrap();
+            let Some((tenant_shard_id, shard)) = locked
+                .tenants
+                .range(TenantShardId::tenant_range(tenant_id))
+                .next()
+            else {
+                return Err(ApiError::NotFound(
+                    anyhow::anyhow!("Tenant {tenant_id} not found").into(),
+                ));
+            };
+
+            let Some(intent_node_id) = shard.intent.get_attached() else {
+                tracing::warn!(
+                    tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
+                    "Shard not scheduled (policy {:?}), cannot generate pass-through URL",
+                    shard.policy
+                );
+                return Err(ApiError::Conflict(
+                    "Cannot call timeline API on non-attached tenant".to_string(),
+                ));
+            };
+
+            if shard.reconciler.is_none() {
+                // Optimization: while no reconcile is in flight, we may trust our in-memory state
+                // to tell us which pageserver to use. Otherwise we will fall through and hit the database
+                let Some(node) = locked.nodes.get(intent_node_id) else {
+                    // This should never happen
+                    return Err(ApiError::InternalServerError(anyhow::anyhow!(
+                        "Shard refers to nonexistent node"
+                    )));
+                };
+                return Ok((node.clone(), *tenant_shard_id));
+            }
        };

-        // TODO: should use the ID last published to compute_hook, rather than the intent: the intent might
-        // point to somewhere we haven't attached yet.
-        let Some(node_id) = shard.intent.get_attached() else {
-            tracing::warn!(
-                tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
-                "Shard not scheduled (policy {:?}), cannot generate pass-through URL",
-                shard.policy
-            );
-            return Err(ApiError::Conflict(
-                "Cannot call timeline API on non-attached tenant".to_string(),
+        // Look up the latest attached pageserver location from the database
+        // generation state: this will reflect the progress of any ongoing migration.
+        // Note that it is not guaranteed to _stay_ here, our caller must still handle
+        // the case where they call through to the pageserver and get a 404.
+        let db_result = self.persistence.tenant_generations(tenant_id).await?;
+        let Some(ShardGenerationState {
+            tenant_shard_id,
+            generation: _,
+            generation_pageserver: Some(node_id),
+        }) = db_result.first()
+        else {
+            // This can happen if we raced with a tenant deletion or a shard split. On a retry
+            // the caller will either succeed (shard split case), get a proper 404 (deletion case),
+            // or a conflict response (case where tenant was detached in background)
+            return Err(ApiError::ResourceUnavailable(
+                format!("Shard {tenant_id} not found in database, or is not attached").into(),
            ));
        };
-
+        let locked = self.inner.read().unwrap();
        let Some(node) = locked.nodes.get(node_id) else {
            // This should never happen
            return Err(ApiError::InternalServerError(anyhow::anyhow!(
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index 4106efd4f9cc..3861f0b82274 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -4,6 +4,7 @@
 import time
 from collections import defaultdict
 from datetime import datetime, timezone
+from enum import Enum
 from typing import Any, Dict, List, Optional, Set, Tuple, Union

 import pytest
@@ -2466,6 +2467,87 @@ def has_hit_migration_failpoint():
         raise


+class MigrationFailpoints(Enum):
+    # While only the origin is attached
+    PRE_GENERATION_INC = "reconciler-live-migrate-pre-generation-inc"
+    # While both locations are attached
+    POST_NOTIFY = "reconciler-live-migrate-post-notify"
+    # While only the destination is attached
+    POST_DETACH = "reconciler-live-migrate-post-detach"
+
+
+@pytest.mark.parametrize(
+    "migration_failpoint",
+    [
+        MigrationFailpoints.PRE_GENERATION_INC,
+        MigrationFailpoints.POST_NOTIFY,
+        MigrationFailpoints.POST_DETACH,
+    ],
+)
+def test_storage_controller_proxy_during_migration(
+    neon_env_builder: NeonEnvBuilder, migration_failpoint: MigrationFailpoints
+):
+    """
+    If we send a proxied GET request to the controller during a migration, it should route
+    the request to whichever pageserver was most recently issued a generation.
+
+    Reproducer for https://github.com/neondatabase/neon/issues/9062
+    """
+    neon_env_builder.num_pageservers = 2
+    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+    env.neon_cli.create_tenant(tenant_id, timeline_id)
+
+    # Activate a failpoint that will cause live migration to get stuck _after_ the generation has been issued
+    # to the new pageserver: this should result in requests routed to the new pageserver.
+ env.storage_controller.configure_failpoints((migration_failpoint.value, "pause")) + + origin_pageserver = env.get_tenant_pageserver(tenant_id) + dest_ps_id = [p.id for p in env.pageservers if p.id != origin_pageserver.id][0] + + try: + with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: + migrate_fut = executor.submit( + env.storage_controller.tenant_shard_migrate, + TenantShardId(tenant_id, 0, 0), + dest_ps_id, + ) + + def has_hit_migration_failpoint(): + expr = f"at failpoint {str(migration_failpoint.value)}" + log.info(expr) + assert env.storage_controller.log_contains(expr) + + wait_until(10, 1, has_hit_migration_failpoint) + + # This request should be routed to whichever pageserver holds the highest generation + tenant_info = env.storage_controller.pageserver_api().tenant_status( + tenant_id, + ) + + if migration_failpoint in ( + MigrationFailpoints.POST_NOTIFY, + MigrationFailpoints.POST_DETACH, + ): + # We expect request to land on the destination + assert tenant_info["generation"] == 2 + elif migration_failpoint == MigrationFailpoints.PRE_GENERATION_INC: + # We expect request to land on the origin + assert tenant_info["generation"] == 1 + + # Eventually migration completes + env.storage_controller.configure_failpoints((migration_failpoint.value, "off")) + migrate_fut.result() + except: + # Always disable 'pause' failpoints, even on failure, to avoid hanging in shutdown + env.storage_controller.configure_failpoints((migration_failpoint.value, "off")) + raise + + @run_only_on_default_postgres("this is like a 'unit test' against storcon db") def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_configs() From 518f598e2d7e056c711694592c50737136eb8f38 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Wed, 25 Sep 2024 16:24:09 +0200 Subject: [PATCH 66/77] docs(rfc): Independent compute release flow (#8881) Related to https://github.com/neondatabase/cloud/issues/11698 --- docs/rfcs/038-independent-compute-release.md | 343 +++++++++++++++++++ 1 file changed, 343 insertions(+) create mode 100644 docs/rfcs/038-independent-compute-release.md diff --git a/docs/rfcs/038-independent-compute-release.md b/docs/rfcs/038-independent-compute-release.md new file mode 100644 index 000000000000..3deaf1e6fdfb --- /dev/null +++ b/docs/rfcs/038-independent-compute-release.md @@ -0,0 +1,343 @@ +# Independent compute release + +Created at: 2024-08-30. Author: Alexey Kondratov (@ololobus) + +## Summary + +This document proposes an approach to fully independent compute release flow. It attempts to +cover the following features: + +- Process is automated as much as possible to minimize human errors. +- Compute<->storage protocol compatibility is ensured. +- A transparent release history is available with an easy rollback strategy. +- Although not in the scope of this document, there is a viable way to extend the proposed release + flow to achieve the canary and/or blue-green deployment strategies. + +## Motivation + +Previously, the compute release was tightly coupled to the storage release. This meant that once +some storage nodes got restarted with a newer version, all new compute starts using these nodes +automatically got a new version. Thus, two releases happen in parallel, which increases the blast +radius and makes ownership fuzzy. + +Now, we practice a manual v0 independent compute release flow -- after getting a new compute release +image and tag, we pin it region by region using Admin UI. 
It's better, but it still has its own flaws:
+
+1. It's a simple but fairly manual process, as you need to click through a few pages.
+2. It's prone to human errors, e.g., you could mistype or copy the wrong compute tag.
+3. We now require an additional approval in the Admin UI, which partially solves (2),
+   but also makes the whole process pretty annoying, as you constantly need to go back
+   and forth between two people.
+
+## Non-goals
+
+It's not the goal of this document to propose a design for some general-purpose release tool like Helm.
+The document considers how the current compute fleet is orchestrated at Neon. Even if we later
+decide to split the control plane further (e.g., introduce a separate compute controller), the proposed
+release process shouldn't change much, i.e., the releases table and API will reside in
+one of the parts.
+
+Achieving the canary and/or blue-green deployment strategies is out of the scope of this document. They
+were kept in mind, though, so it's expected that the proposed approach will lay down the foundation
+for implementing them in future iterations.
+
+## Impacted components
+
+Compute, control plane, CI, observability (some Grafana dashboards may require changes).
+
+## Prior art
+
+A very close example is how Helm tracks [release history](https://helm.sh/docs/helm/helm_history/).
+
+In the code:
+
+- [Release](https://github.com/helm/helm/blob/2b30cf4b61d587d3f7594102bb202b787b9918db/pkg/release/release.go#L20-L43)
+- [Release info](https://github.com/helm/helm/blob/2b30cf4b61d587d3f7594102bb202b787b9918db/pkg/release/info.go#L24-L40)
+- [Release status](https://github.com/helm/helm/blob/2b30cf4b61d587d3f7594102bb202b787b9918db/pkg/release/status.go#L18-L42)
+
+TL;DR it has several important attributes:
+
+- Revision -- unique release ID/primary key. It is not the same as the application version,
+  because the same version can be deployed several times, e.g., after a newer version rollback.
+- App version -- version of the application chart/code.
+- Config -- set of overrides to the default config of the application.
+- Status -- current status of the release in the history.
+- Timestamps -- tracks when a release was created and deployed.
+
+## Proposed implementation
+
+### Separate release branch
+
+We will use a separate release branch, `release-compute`, to have a clean history for releases and commits.
+In order to avoid confusion with storage releases, we will use a different prefix for compute [git release
+tags](https://github.com/neondatabase/neon/releases) -- `release-compute-XXXX`. We will use the same tag for
+Docker images as well. The `neondatabase/compute-node-v16:release-compute-XXXX` tag looks longer and a bit redundant,
+but it's better to have image and git tags in sync.
+
+Currently, the control plane relies on the numeric compute and storage release versions to decide on compute->storage
+compatibility. Once we implement this proposal, we should drop this code, as release numbers will be completely
+independent. The only constraint we want is that the version must increase monotonically within the same release branch.
+
+### Compute config/settings manifest
+
+We will create a new sub-directory `compute` and file `compute/manifest.yaml` with a structure:
+
+```yaml
+pg_settings:
+  # Common settings for primaries and secondaries of all versions.
+  common:
+    wal_log_hints: "off"
+    max_wal_size: "1024"
+
+  per_version:
+    14:
+      # Common settings for both replica and primary of version PG 14
+      common:
+        shared_preload_libraries: "neon,pg_stat_statements,extension_x"
+    15:
+      common:
+        shared_preload_libraries: "neon,pg_stat_statements,extension_x"
+      # Settings that should be applied only to replicas
+      replica:
+        # Available only starting with Postgres 15
+        recovery_prefetch: "off"
+    # ...
+    17:
+      common:
+        # For example, if third-party `extension_x` is not yet available for PG 17
+        shared_preload_libraries: "neon,pg_stat_statements"
+      replica:
+        recovery_prefetch: "off"
+```
+
+**N.B.** Setting value should be a string with `on|off` for booleans and a number (as a string)
+without units for all numeric settings. That's how the control plane currently operates.
+
+The priority of settings will be (a higher number is a higher priority):
+
+1. Any static and hard-coded settings in the control plane
+2. `pg_settings->common`
+3. Per-version `common`
+4. Per-version `replica`
+5. Any per-user/project/endpoint overrides in the control plane
+6. Any dynamic setting calculated based on the compute size
+
+**N.B.** For simplicity, we do not do any custom logic for `shared_preload_libraries`, so it's completely
+overridden if specified on some level. Make sure that you include all necessary extensions in it when you
+do any overrides.
+
+**N.B.** There is a tricky question about what to do with the custom compute image pinning we sometimes
+do for particular projects and customers. That's usually some ad-hoc work, and images are based on
+the latest compute image, so it's relatively safe to assume that we could use settings from the latest compute
+release. If for some reason that's not true, and further overrides are needed, it's also possible to do that
+on the project level together with pinning the image, so it's the on-call/engineer/support's responsibility to
+ensure that compute starts with the specified custom image. The only real risk is that the compute image will get
+stale and settings from new releases will drift away, so eventually it will get something incompatible,
+but i) this is an operational issue, as we do not want stale images anyway, and ii) base settings
+receive something really new so rarely that the chance of this happening is very low. If we want to solve it completely,
+then together with pinning the image we could also pin the matching release revision in the control plane.
+
+The compute team will own the content of `compute/manifest.yaml`.
+
+### Control plane: releases table
+
+In order to store information about releases, the control plane will use a table `compute_releases` with the following
+schema:
+
+```sql
+CREATE TABLE compute_releases (
+    -- Unique release ID
+    -- N.B. Revision won't be synchronized across all regions, because all control planes are technically independent
+    -- services. We have the same situation with Helm releases as well because they could be deployed and rolled back
+    -- independently in different clusters.
+    revision BIGSERIAL PRIMARY KEY,
+    -- Numeric version of the compute image, e.g. 9057
+    version BIGINT NOT NULL,
+    -- Compute image tag, e.g. `release-9057`
+    tag TEXT NOT NULL,
+    -- Current release status. Currently, it will be a simple enum
+    -- * `deployed` -- release is deployed and used for new compute starts.
+    --   Exactly one release can have this status at a time.
+    -- * `superseded` -- release has been replaced by a newer one.
+    --   But we can always extend it in the future when we need more statuses
+    --   for more complex deployment strategies.
+    status TEXT NOT NULL,
+    -- Any additional metadata for compute in the corresponding release
+    manifest JSONB NOT NULL,
+    -- Timestamp when release record was created in the control plane database
+    created_at TIMESTAMP NOT NULL DEFAULT now(),
+    -- Timestamp when release deployment was finished
+    deployed_at TIMESTAMP
+);
+```
+
+We keep track of the old releases not only for the sake of audit, but also because usually ~30% of
+running computes were started with an image from one of the previous releases. Yet, when users want to reconfigure
+them without restarting, the control plane needs to know what settings are applicable to them, so we need
+information about the previous releases to be readily available. There could be some other auxiliary info
+needed as well: supported extensions, compute flags, etc.
+
+**N.B.** Here, we can end up in an ambiguous situation when the same compute image is deployed twice, e.g.,
+it was deployed once, then rolled back, and then deployed again, potentially with a different manifest. Yet,
+we could've started some computes with the first deployment and some with the second. Thus, when we need to
+look up the manifest for the compute by its image tag, we will see two records in the table with the same tag,
+but different revision numbers. We can assume that this could happen only in case of rollbacks, so we
+can just take the latest revision for the given tag.
+
+### Control plane: management API
+
+The control plane will implement new API methods to manage releases:
+
+1. `POST /management/api/v2/compute_releases` to create a new release. With payload
+
+   ```json
+   {
+       "version": 9057,
+       "tag": "release-9057",
+       "manifest": {}
+   }
+   ```
+
+   and response
+
+   ```json
+   {
+       "revision": 53,
+       "version": 9057,
+       "tag": "release-9057",
+       "status": "deployed",
+       "manifest": {},
+       "created_at": "2024-08-15T15:52:01.0000Z",
+       "deployed_at": "2024-08-15T15:52:01.0000Z"
+   }
+   ```
+
+   Here, we can actually mix in custom (remote) extensions metadata into the `manifest`, so that the control plane
+   will get information about all available extensions not bundled into the compute image. The corresponding
+   workflow in `neondatabase/build-custom-extensions` should produce it as an artifact and make
+   it accessible to the workflow in `neondatabase/infra`. See the complete release flow below. Doing that,
+   we put a constraint that a new custom extension requires a new compute release, which is good for safety,
+   but is not exactly what we want operationally (we want to be able to deploy new extensions without new
+   images). Yet, it can be solved incrementally: v0 -- do not do anything with extensions at all;
+   v1 -- put them into the same manifest; v2 -- make them separate entities with their own lifecycle.
+
+   **N.B.** This method is intended to be used in CI workflows, and CI/network can be flaky. It's reasonable
+   to assume that we could retry the request several times, even if it has already succeeded. Although it's
+   not a big deal to create several identical releases one-by-one, it's better to avoid it, so the control plane
+   should check if the latest release is identical and just return `304 Not Modified` in this case.
+
+2. `POST /management/api/v2/compute_releases/rollback` to roll back to any previously deployed release.
With payload
+   including the revision of the release to roll back to:
+
+   ```json
+   {
+       "revision": 52
+   }
+   ```
+
+   Rollback marks the current release as `superseded` and creates a new release with all the same data as the
+   requested revision, but with a new revision number.
+
+   This rollback API is not strictly needed, as we can just use the `infra` repo workflow to deploy any
+   available tag. It's still nice to have for on-call and any urgent matters, for example, if we need
+   to roll back and GitHub is down. It's much easier to specify only the revision number vs. crafting
+   all the necessary data for the new release payload.
+
+### Compute->storage compatibility tests
+
+In order to safely release new compute versions independently from storage, we need to ensure that the currently
+deployed storage is compatible with the new compute version. Currently, we maintain backward compatibility
+in storage, but newer computes may require a newer storage version.
+
+Remote end-to-end (e2e) tests [already accept](https://github.com/neondatabase/cloud/blob/e3468d433e0d73d02b7d7e738d027f509b522408/.github/workflows/testing.yml#L43-L48)
+`storage_image_tag` and `compute_image_tag` as separate inputs. That means that we could reuse e2e tests to ensure
+compatibility between storage and compute:
+
+1. Pick the latest storage release tag and use it as `storage_image_tag`.
+2. Pick a new compute tag built in the current compute release PR and use it as `compute_image_tag`.
+   Here, we should use a temporary ECR image tag, because the final tag will be known only after the release PR is merged.
+3. Trigger e2e tests as usual.
+
+### Release flow
+
+```mermaid
+  sequenceDiagram
+
+  actor oncall as Compute on-call person
+  participant neon as neondatabase/neon
+
+  box private
+  participant cloud as neondatabase/cloud
+  participant exts as neondatabase/build-custom-extensions
+  participant infra as neondatabase/infra
+  end
+
+  box cloud
+  participant preprod as Pre-prod control plane
+  participant prod as Production control plane
+  participant k8s as Compute k8s
+  end
+
+  oncall ->> neon: Open release PR into release-compute
+
+  activate neon
+  neon ->> cloud: CI: trigger e2e compatibility tests
+  activate cloud
+  cloud -->> neon: CI: e2e tests pass
+  deactivate cloud
+  neon ->> neon: CI: pass PR checks, get approvals
+  deactivate neon
+
+  oncall ->> neon: Merge release PR into release-compute
+
+  activate neon
+  neon ->> neon: CI: pass checks, build and push images
+  neon ->> exts: CI: trigger extensions build
+  activate exts
+  exts -->> neon: CI: extensions are ready
+  deactivate exts
+  neon ->> neon: CI: create release tag
+  neon ->> infra: Trigger release workflow using the produced tag
+  deactivate neon
+
+  activate infra
+  infra ->> infra: CI: pass checks
+  infra ->> preprod: Release new compute image to pre-prod automatically
POST /management/api/v2/compute_releases + activate preprod + preprod -->> infra: 200 OK + deactivate preprod + + infra ->> infra: CI: wait for per-region production deploy approvals + oncall ->> infra: CI: approve deploys region by region + infra ->> k8s: Prewarm new compute image + infra ->> prod: POST /management/api/v2/compute_releases + activate prod + prod -->> infra: 200 OK + deactivate prod + deactivate infra +``` + +## Further work + +As briefly mentioned in other sections, eventually, we would like to use more complex deployment strategies. +For example, we can pass a fraction of the total compute starts that should use the new release. Then we can +mark the release as `partial` or `canary` and monitor its performance. If everything is fine, we can promote it +to `deployed` status. If not, we can roll back to the previous one. + +## Alternatives + +In theory, we can try using Helm as-is: + +1. Write a compute Helm chart. That will actually have only some config map, which the control plane can access and read. + N.B. We could reuse the control plane chart as well, but then it's not a fully independent release again and even more fuzzy. +2. The control plane will read it and start using the new compute version for new starts. + +Drawbacks: + +1. Helm releases work best if the workload is controlled by the Helm chart itself. Then you can have different + deployment strategies like rolling update or canary or blue/green deployments. At Neon, the compute starts are controlled + by control plane, so it makes it much more tricky. +2. Releases visibility will suffer, i.e. instead of a nice table in the control plane and Admin UI, we would need to use + `helm` cli and/or K8s UIs like K8sLens. +3. We do not restart all computes shortly after the new version release. This means that for some features and compatibility + purpose (see above) control plane may need some auxiliary info from the previous releases. From c4f5736d5a1077c4e6b4b26478006e874a789c57 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Wed, 25 Sep 2024 16:50:05 +0200 Subject: [PATCH 67/77] Build images for PG17 using Debian 12 "Bookworm" (#9132) This increases the support window of the OS used for PG17 by 2 years compared to the previous usage of Debian 11 "Bullseye". --- .github/workflows/build_and_test.yml | 41 +++++++++++++++++------- .github/workflows/trigger-e2e-tests.yml | 2 +- compute/Dockerfile.compute-node | 42 ++++++++++++++++++------- 3 files changed, 61 insertions(+), 24 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index a634edb96b87..9dcc9709eb4c 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -602,7 +602,20 @@ jobs: strategy: fail-fast: false matrix: - version: [ v14, v15, v16, v17 ] + version: + # Much data was already generated on old PG versions with bullseye's + # libraries, the locales of which can cause data incompatibilities. + # However, new PG versions should check if they can be built on newer + # images, as that reduces the support burden of old and ancient + # distros. + - pg: v14 + debian: bullseye-slim + - pg: v15 + debian: bullseye-slim + - pg: v16 + debian: bullseye-slim + - pg: v17 + debian: bookworm-slim arch: [ x64, arm64 ] runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} @@ -645,41 +658,46 @@ jobs: context: . 
build-args: | GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} - PG_VERSION=${{ matrix.version }} + PG_VERSION=${{ matrix.version.pg }} BUILD_TAG=${{ needs.tag.outputs.build-tag }} TAG=${{ needs.build-build-tools-image.outputs.image-tag }} + DEBIAN_FLAVOR=${{ matrix.version.debian }} provenance: false push: true pull: true file: compute/Dockerfile.compute-node - cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }} - cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-node-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }} + cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.arch }} + cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-node-{0}:cache-{1},mode=max', matrix.version.pg, matrix.arch) || '' }} tags: | - neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} + neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} - name: Build neon extensions test image - if: matrix.version == 'v16' + if: matrix.version.pg == 'v16' uses: docker/build-push-action@v6 with: context: . build-args: | GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} - PG_VERSION=${{ matrix.version }} + PG_VERSION=${{ matrix.version.pg }} BUILD_TAG=${{ needs.tag.outputs.build-tag }} TAG=${{ needs.build-build-tools-image.outputs.image-tag }} + DEBIAN_FLAVOR=${{ matrix.version.debian }} provenance: false push: true pull: true file: compute/Dockerfile.compute-node target: neon-pg-ext-test - cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }} - cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon-test-extensions-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }} + cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version.pg }}:cache-${{ matrix.arch }} + cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon-test-extensions-{0}:cache-{1},mode=max', matrix.version.pg, matrix.arch) || '' }} tags: | - neondatabase/neon-test-extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}-${{ matrix.arch }} + neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.tag.outputs.build-tag}}-${{ matrix.arch }} - name: Build compute-tools image # compute-tools are Postgres independent, so build it only once - if: matrix.version == 'v17' + # We pick 16, because that builds on debian 11 with older glibc (and is + # thus compatible with newer glibc), rather than 17 on Debian 12, as + # that isn't guaranteed to be compatible with Debian 11 + if: matrix.version.pg == 'v16' uses: docker/build-push-action@v6 with: target: compute-tools-image @@ -688,6 +706,7 @@ jobs: GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} BUILD_TAG=${{ needs.tag.outputs.build-tag }} TAG=${{ needs.build-build-tools-image.outputs.image-tag }} + DEBIAN_FLAVOR=${{ matrix.version.debian }} provenance: false push: true pull: true diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml index f25c1051cd98..cad97645327b 100644 --- a/.github/workflows/trigger-e2e-tests.yml +++ b/.github/workflows/trigger-e2e-tests.yml @@ -102,7 +102,7 @@ jobs: # Default set of platforms to run e2e tests on 
platforms='["docker", "k8s"]' - # If the PR changes vendor/, pgxn/ or libs/vm_monitor/ directories, or Dockerfile.compute-node, add k8s-neonvm to the list of platforms. + # If the PR changes vendor/, pgxn/ or libs/vm_monitor/ directories, or compute/Dockerfile.compute-node, add k8s-neonvm to the list of platforms. # If the workflow run is not a pull request, add k8s-neonvm to the list. if [ "$GITHUB_EVENT_NAME" == "pull_request" ]; then for f in $(gh api "/repos/${GITHUB_REPOSITORY}/pulls/${PR_NUMBER}/files" --paginate --jq '.[].filename'); do diff --git a/compute/Dockerfile.compute-node b/compute/Dockerfile.compute-node index 18c68c116a94..2c647a669c28 100644 --- a/compute/Dockerfile.compute-node +++ b/compute/Dockerfile.compute-node @@ -3,13 +3,15 @@ ARG REPOSITORY=neondatabase ARG IMAGE=build-tools ARG TAG=pinned ARG BUILD_TAG +ARG DEBIAN_FLAVOR=bullseye-slim ######################################################################################### # # Layer "build-deps" # ######################################################################################### -FROM debian:bullseye-slim AS build-deps +FROM debian:$DEBIAN_FLAVOR AS build-deps +ARG DEBIAN_FLAVOR RUN apt update && \ apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \ zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev \ @@ -1027,7 +1029,8 @@ RUN cd compute_tools && mold -run cargo build --locked --profile release-line-de # ######################################################################################### -FROM debian:bullseye-slim AS compute-tools-image +FROM debian:$DEBIAN_FLAVOR AS compute-tools-image +ARG DEBIAN_FLAVOR COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl @@ -1037,7 +1040,8 @@ COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compu # ######################################################################################### -FROM debian:bullseye-slim AS pgbouncer +FROM debian:$DEBIAN_FLAVOR AS pgbouncer +ARG DEBIAN_FLAVOR RUN set -e \ && apt-get update \ && apt-get install -y \ @@ -1179,7 +1183,9 @@ ENV PGDATABASE=postgres # Put it all together into the final image # ######################################################################################### -FROM debian:bullseye-slim +FROM debian:$DEBIAN_FLAVOR +ARG DEBIAN_FLAVOR +ENV DEBIAN_FLAVOR=$DEBIAN_FLAVOR # Add user postgres RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ echo "postgres:test_console_pass" | chpasswd && \ @@ -1211,21 +1217,34 @@ COPY --chmod=0644 compute/etc/neon_collector_autoscaling.yml /etc/neon_collector # Create remote extension download directory RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/local/download_extensions - # Install: # libreadline8 for psql -# libicu67, locales for collations (including ICU and plpgsql_check) # liblz4-1 for lz4 # libossp-uuid16 for extension ossp-uuid -# libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS +# libgeos, libsfcgal1, and libprotobuf-c1 for PostGIS # libxml2, libxslt1.1 for xml2 # libzstd1 for zstd # libboost* for rdkit # ca-certificates for communicating with s3 by compute_ctl -RUN apt update && \ + + +RUN apt update && \ + case $DEBIAN_FLAVOR in \ + # Version-specific installs for Bullseye (PG14-PG16): + # libicu67, locales for collations (including ICU and plpgsql_check) + # libgdal28, libproj19 for PostGIS + bullseye*) \ + VERSION_INSTALLS="libicu67 
libgdal28 libproj19"; \ + ;; \ + # Version-specific installs for Bookworm (PG17): + # libicu72, locales for collations (including ICU and plpgsql_check) + # libgdal32, libproj25 for PostGIS + bookworm*) \ + VERSION_INSTALLS="libicu72 libgdal32 libproj25"; \ + ;; \ + esac && \ apt install --no-install-recommends -y \ gdb \ - libicu67 \ liblz4-1 \ libreadline8 \ libboost-iostreams1.74.0 \ @@ -1234,8 +1253,6 @@ RUN apt update && \ libboost-system1.74.0 \ libossp-uuid16 \ libgeos-c1v5 \ - libgdal28 \ - libproj19 \ libprotobuf-c1 \ libsfcgal1 \ libxml2 \ @@ -1244,7 +1261,8 @@ RUN apt update && \ libcurl4-openssl-dev \ locales \ procps \ - ca-certificates && \ + ca-certificates \ + $VERSION_INSTALLS && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8 From c5972389aafe51ea77b958f1d192d103d9ff7e6b Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 25 Sep 2024 15:54:41 +0100 Subject: [PATCH 68/77] storcon: include timeline ID in LSN waiting logs (#9141) ## Problem Hard to tell which timeline is holding the migration. ## Summary of Changes Add timeline id to log. --- storage_controller/src/reconciler.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 93b1c80566bd..2c42da404355 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -463,7 +463,7 @@ impl Reconciler { for (timeline_id, baseline_lsn) in &baseline { match latest.get(timeline_id) { Some(latest_lsn) => { - tracing::info!("🕑 LSN origin {baseline_lsn} vs destination {latest_lsn}"); + tracing::info!(timeline_id = %timeline_id, "🕑 LSN origin {baseline_lsn} vs destination {latest_lsn}"); if latest_lsn < baseline_lsn { any_behind = true; } From d447f49bc33c7207bd1659eccfc5d892300dd57a Mon Sep 17 00:00:00 2001 From: Yuchen Liang <70461588+yliang412@users.noreply.github.com> Date: Wed, 25 Sep 2024 10:57:38 -0400 Subject: [PATCH 69/77] fix(pageserver): handle lsn lease requests for unnormalized lsns (#9137) Fixes https://github.com/neondatabase/neon/issues/9098. ## Problem See https://github.com/neondatabase/neon/issues/9098#issuecomment-2372484969. ### Related A similar problem happened with branch creation, which was discussed [here](https://github.com/neondatabase/neon/pull/2143#issuecomment-1199969052) and fixed by https://github.com/neondatabase/neon/pull/2529. ## Summary of changes - Normalize the lsn on pageserver side upon lsn lease request, stores the normalized LSN. Signed-off-by: Yuchen Liang --- pageserver/src/tenant/timeline.rs | 6 +++++- test_runner/regress/test_readonly_node.py | 15 +++++++++++---- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index c98efd5f7184..d301ba23eafa 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -112,7 +112,7 @@ use pageserver_api::reltag::RelTag; use pageserver_api::shard::ShardIndex; use postgres_connection::PgConnectionConfig; -use postgres_ffi::to_pg_timestamp; +use postgres_ffi::{to_pg_timestamp, v14::xlog_utils, WAL_SEGMENT_SIZE}; use utils::{ completion, generation::Generation, @@ -1337,6 +1337,10 @@ impl Timeline { _ctx: &RequestContext, ) -> anyhow::Result { let lease = { + // Normalize the requested LSN to be aligned, and move to the first record + // if it points to the beginning of the page (header). 
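+        // (Illustrative note, not part of the original patch: for example, a
+        // lease request for an LSN sitting exactly on a page or segment
+        // boundary gets nudged forward past the page header to the first
+        // record, mirroring what branch creation does since the fix in #2529.)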
+ let lsn = xlog_utils::normalize_lsn(lsn, WAL_SEGMENT_SIZE); + let mut gc_info = self.gc_info.write().unwrap(); let valid_until = SystemTime::now() + length; diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py index 347fc3a04ddb..5e8b8d38f7e2 100644 --- a/test_runner/regress/test_readonly_node.py +++ b/test_runner/regress/test_readonly_node.py @@ -122,6 +122,7 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): Test static endpoint is protected from GC by acquiring and renewing lsn leases. """ + LSN_LEASE_LENGTH = 8 neon_env_builder.num_pageservers = 2 # GC is manual triggered. env = neon_env_builder.init_start( @@ -139,7 +140,7 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): "image_creation_threshold": "1", "image_layer_creation_check_threshold": "0", # Short lease length to fit test. - "lsn_lease_length": "3s", + "lsn_lease_length": f"{LSN_LEASE_LENGTH}s", }, initial_tenant_shard_count=2, ) @@ -170,10 +171,14 @@ def generate_updates_on_main( with env.endpoints.create_start("main") as ep_main: with ep_main.cursor() as cur: cur.execute("CREATE TABLE t0(v0 int primary key, v1 text)") - lsn = None + lsn = Lsn(0) for i in range(2): lsn = generate_updates_on_main(env, ep_main, i) + # Round down to the closest LSN on page boundary (unnormalized). + XLOG_BLCKSZ = 8192 + lsn = Lsn((int(lsn) // XLOG_BLCKSZ) * XLOG_BLCKSZ) + with env.endpoints.create_start( branch_name="main", endpoint_id="static", @@ -183,7 +188,8 @@ def generate_updates_on_main( cur.execute("SELECT count(*) FROM t0") assert cur.fetchone() == (ROW_COUNT,) - time.sleep(3) + # Wait for static compute to renew lease at least once. + time.sleep(LSN_LEASE_LENGTH / 2) generate_updates_on_main(env, ep_main, i, end=100) @@ -204,8 +210,9 @@ def generate_updates_on_main( # Do some update so we can increment latest_gc_cutoff generate_updates_on_main(env, ep_main, i, end=100) + # Wait for the existing lease to expire. + time.sleep(LSN_LEASE_LENGTH) # Now trigger GC again, layers should be removed. - time.sleep(4) for shard, ps in tenant_get_shards(env, env.initial_tenant): client = ps.http_client() gc_result = client.timeline_gc(shard, env.initial_timeline, 0) From 6f2333f52bfa1bce0039f9d013a885d5a76cf5ff Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 25 Sep 2024 19:07:20 +0300 Subject: [PATCH 70/77] CI: Leave out unnecessary build files from binary artifact (#9135) The pg_install/build directory contains .o files and such intermediate results from the build, which are not needed in the final tarball. Except for src/test/regress/regress.so and a few other .so files in that directory; keep those. This reduces the size of the neon-Linux-X64-release-artifact.tar.zst artifact from about 1.5 GB to 700 MB. (I attempted this a long time ago already, by moving the build/ directory out of pg_install altogether, see PR #2127. But I never got around to finish that work.) 
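The mechanism (visible in the diff below) is the classic tar pipe: archive only
the wanted paths to stdout and unpack them under the target directory, which
preserves the relative paths. A minimal sketch of the idea, with illustrative
paths rather than the exact list from this patch:

```bash
# Copy selected files into /tmp/neon, keeping their directory structure.
mkdir -p /tmp/neon
tar c pg_install/v* pg_install/build/*/src/test/regress/*.so | tar x -C /tmp/neon
```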
Co-authored-by: Alexander Bayandin
---
 .github/workflows/_build-and-test-locally.yml | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml
index 67152b6991a0..5ea911eb952a 100644
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -257,7 +257,15 @@ jobs:
           ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_azure)'

       - name: Install postgres binaries
-        run: cp -a pg_install /tmp/neon/pg_install
+        run: |
+          # Use tar to copy files matching the pattern, preserving the paths in the destination
+          tar c \
+            pg_install/v* \
+            pg_install/build/*/src/test/regress/*.so \
+            pg_install/build/*/src/test/regress/pg_regress \
+            pg_install/build/*/src/test/isolation/isolationtester \
+            pg_install/build/*/src/test/isolation/pg_isolation_regress \
+          | tar x -C /tmp/neon

       - name: Upload Neon artifact
         uses: ./.github/actions/upload

From 04f32b9526c83cc5e38ba5505bb310f5430c7402 Mon Sep 17 00:00:00 2001
From: Vlad Lazar
Date: Wed, 25 Sep 2024 17:22:32 +0100
Subject: [PATCH 71/77] tests: remove patching up of az id column (#8968)

This was required since the compat tests used a snapshot generated
from a version of neon local which didn't contain the
availability_zone_id column.
---
 test_runner/fixtures/neon_fixtures.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 201eb1087ded..70fe632f4947 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -642,9 +642,6 @@ def ignore_postgres_log(path: str, _names):
             patch_script = ""
             for ps in self.env.pageservers:
                 patch_script += f"UPDATE nodes SET listen_http_port={ps.service_port.http}, listen_pg_port={ps.service_port.pg} WHERE node_id = '{ps.id}';"
-                # This is a temporary to get the backward compat test happy
-                # since the compat snapshot was generated with an older version of neon local
-                patch_script += f"UPDATE nodes SET availability_zone_id='{ps.az_id}' WHERE node_id = '{ps.id}' AND availability_zone_id IS NULL;"
             patch_script_path.write_text(patch_script)

             # Update the config with info about tenants and timelines

From c6e89445e269d82296340cca55635fd01c3971eb Mon Sep 17 00:00:00 2001
From: Alexander Bayandin
Date: Wed, 25 Sep 2024 18:22:39 +0100
Subject: [PATCH 72/77] CI(promote-images): fix prod ECR auth (#9146)

A cherry-pick from the previous release (#9131)

## Problem

Login to prod ECR doesn't work anymore:
```
Retrieving registries data through *** SDK...
*** ECR detected with eu-central-1 region
Error: The security token included in the request is invalid.
```

## Summary of changes
- Fix login to prod ECR by using `aws-actions/configure-aws-credentials`
---
 .github/workflows/build_and_test.yml | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 9dcc9709eb4c..81a9fd99ae09 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -862,6 +862,9 @@ jobs:
     needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
     runs-on: ubuntu-22.04
 
+    permissions:
+      id-token: write # for `aws-actions/configure-aws-credentials`
+
     env:
       VERSIONS: v14 v15 v16 v17
 
@@ -906,13 +909,19 @@ jobs:
           docker buildx imagetools create -t neondatabase/neon-test-extensions-v16:latest \
                                              neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }}
 
+      - name: Configure AWS-prod credentials
+        if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          aws-region: eu-central-1
+          mask-aws-account-id: true
+          role-to-assume: ${{ secrets.PROD_GHA_OIDC_ROLE }}
+
       - name: Login to prod ECR
         uses: docker/login-action@v3
         if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'
         with:
           registry: 093970136003.dkr.ecr.eu-central-1.amazonaws.com
-          username: ${{ secrets.PROD_GHA_RUNNER_LIMITED_AWS_ACCESS_KEY_ID }}
-          password: ${{ secrets.PROD_GHA_RUNNER_LIMITED_AWS_SECRET_ACCESS_KEY }}
 
       - name: Copy all images to prod ECR
         if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'

From 6a4f49b08bf8e96ad62fd9ac1550acf3e58d4179 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Wed, 25 Sep 2024 16:35:33 -0400
Subject: [PATCH 73/77] fix(pageserver): passthrough partition cancel error
 (#9154)

close https://github.com/neondatabase/neon/issues/9142

## Summary of changes

passthrough CollectKeyspaceError::Cancelled to CompactionError::ShuttingDown

Signed-off-by: Alex Chi Z
---
 pageserver/src/tenant/timeline.rs            | 18 ++++++++++++++----
 pageserver/src/tenant/timeline/compaction.rs |  2 +-
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index d301ba23eafa..157c6ab91ec5 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -3601,7 +3601,7 @@ impl Timeline {
                 ctx,
             )
             .await
-            .map_err(|e| FlushLayerError::from_anyhow(self, e))?;
+            .map_err(|e| FlushLayerError::from_anyhow(self, e.into()))?;
 
         if self.cancel.is_cancelled() {
             return Err(FlushLayerError::Cancelled);
@@ -3840,16 +3840,20 @@ impl Timeline {
         partition_size: u64,
         flags: EnumSet<CompactFlags>,
         ctx: &RequestContext,
-    ) -> anyhow::Result<((KeyPartitioning, SparseKeyPartitioning), Lsn)> {
+    ) -> Result<((KeyPartitioning, SparseKeyPartitioning), Lsn), CompactionError> {
         let Ok(mut partitioning_guard) = self.partitioning.try_lock() else {
             // NB: there are two callers, one is the compaction task, of which there is only one per struct Tenant and hence Timeline.
             // The other is the initdb optimization in flush_frozen_layer, used by `boostrap_timeline`, which runs before `.activate()`
             // and hence before the compaction task starts.
-            anyhow::bail!("repartition() called concurrently, this should not happen");
+            return Err(CompactionError::Other(anyhow!(
+                "repartition() called concurrently, this should not happen"
+            )));
         };
         let ((dense_partition, sparse_partition), partition_lsn) = &*partitioning_guard;
         if lsn < *partition_lsn {
-            anyhow::bail!("repartition() called with LSN going backwards, this should not happen");
+            return Err(CompactionError::Other(anyhow!(
+                "repartition() called with LSN going backwards, this should not happen"
+            )));
         }
 
         let distance = lsn.0 - partition_lsn.0;
@@ -4451,6 +4455,12 @@ pub(crate) enum CompactionError {
     Other(anyhow::Error),
 }
 
+impl CompactionError {
+    pub fn is_cancelled(&self) -> bool {
+        matches!(self, CompactionError::ShuttingDown)
+    }
+}
+
 impl From<CollectKeySpaceError> for CompactionError {
     fn from(err: CollectKeySpaceError) -> Self {
         match err {
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index 6edc28a11b2e..3de386a2d58f 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -390,7 +390,7 @@ impl Timeline {
                 // error but continue.
                 //
                 // Suppress error when it's due to cancellation
-                if !self.cancel.is_cancelled() {
+                if !self.cancel.is_cancelled() && !err.is_cancelled() {
                     tracing::error!("could not compact, repartitioning keyspace failed: {err:?}");
                 }
                 (1, false)

From 8ace9ea25ff59c2313d88934655f5c0e9edffa42 Mon Sep 17 00:00:00 2001
From: Tristan Partin
Date: Wed, 25 Sep 2024 12:45:05 -0500
Subject: [PATCH 74/77] Format long single DATA line in pgxn/Makefile

This should be a little more readable.

Signed-off-by: Tristan Partin
---
 pgxn/neon/Makefile | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile
index ddc8155ff33a..f1229b2d7351 100644
--- a/pgxn/neon/Makefile
+++ b/pgxn/neon/Makefile
@@ -25,7 +25,18 @@ SHLIB_LINK_INTERNAL = $(libpq)
 SHLIB_LINK = -lcurl
 
 EXTENSION = neon
-DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql neon--1.3--1.4.sql neon--1.4--1.3.sql neon--1.4--1.5.sql neon--1.5--1.4.sql
+DATA = \
+	neon--1.0.sql \
+	neon--1.0--1.1.sql \
+	neon--1.1--1.2.sql \
+	neon--1.2--1.3.sql \
+	neon--1.3--1.4.sql \
+	neon--1.4--1.5.sql \
+	neon--1.5--1.4.sql \
+	neon--1.4--1.3.sql \
+	neon--1.3--1.2.sql \
+	neon--1.2--1.1.sql \
+	neon--1.1--1.0.sql
 PGFILEDESC = "neon - cloud storage for PostgreSQL"
 
 EXTRA_CLEAN = \

From 684e924211a6a0cefe38b38ad1a5509ea01235a8 Mon Sep 17 00:00:00 2001
From: Tristan Partin
Date: Wed, 25 Sep 2024 13:18:34 -0500
Subject: [PATCH 75/77] Fix compute_logical_snapshot_files for v14

The function, pg_ls_logicalsnapdir(), was added in version 15.

Signed-off-by: Tristan Partin
---
 compute/etc/neon_collector.yml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/compute/etc/neon_collector.yml b/compute/etc/neon_collector.yml
index 29be0958dd20..acb17d3cc0f9 100644
--- a/compute/etc/neon_collector.yml
+++ b/compute/etc/neon_collector.yml
@@ -195,7 +195,7 @@ metrics:
       -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. These
       -- temporary snapshot files are renamed to the actual snapshot files after they are
       -- completely built. We only WAL-log the completely built snapshot files.
-      (SELECT COUNT(*) FROM pg_ls_logicalsnapdir() WHERE name LIKE '%.snap') AS num_logical_snapshot_files;
+      (SELECT COUNT(*) FROM pg_ls_dir('pg_logical/snapshots') AS name WHERE name LIKE '%.snap') AS num_logical_snapshot_files;
 
 # In all the below metrics, we cast LSNs to floats because Prometheus only supports floats.
 # It's probably fine because float64 can store integers from -2^53 to +2^53 exactly.
@@ -244,4 +244,3 @@ metrics:
       SELECT slot_name,
         CASE WHEN wal_status = 'lost' THEN 1 ELSE 0 END AS wal_is_lost
       FROM pg_replication_slots;
-

From 7e560dd00e7c63679aad50322f0b60d8ad32cfb0 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas
Date: Thu, 26 Sep 2024 00:29:16 +0300
Subject: [PATCH 76/77] chore: Silence clippy warning with nightly (#9157)

The warning:

    warning: first doc comment paragraph is too long
      --> pageserver/src/tenant/checks.rs:7:1
       |
    7  | / /// Checks whether a layer map is valid (i.e., is a valid result of the current compaction algorithm if no...
    8  | | /// The function checks if we can split the LSN range of a delta layer only at the LSNs of the delta layer...
    9  | | ///
    10 | | /// ```plain
       | |_
       |
       = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#too_long_first_doc_paragraph
       = note: `#[warn(clippy::too_long_first_doc_paragraph)]` on by default
    help: add an empty line
       |
    7  ~ /// Checks whether a layer map is valid (i.e., is a valid result of the current compaction algorithm if nothing goes wrong).
    8  + ///
       |

Fix by applying the suggestion.
---
 pageserver/src/tenant/checks.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pageserver/src/tenant/checks.rs b/pageserver/src/tenant/checks.rs
index 8eaa8a001c48..1e8fa8d1d64e 100644
--- a/pageserver/src/tenant/checks.rs
+++ b/pageserver/src/tenant/checks.rs
@@ -5,6 +5,7 @@ use itertools::Itertools;
 use super::storage_layer::LayerName;
 
 /// Checks whether a layer map is valid (i.e., is a valid result of the current compaction algorithm if nothing goes wrong).
+///
 /// The function checks if we can split the LSN range of a delta layer only at the LSNs of the delta layers. For example,
 ///
 /// ```plain

From 7bae78186bac794bf46372ef8eeb2928d54ce6df Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?=
Date: Thu, 26 Sep 2024 02:05:25 +0200
Subject: [PATCH 77/77] Forbid creation of child timelines of archived
 timeline (#9122)

We don't want to allow any new child timelines of archived timelines. If
you want any new child timelines, you should first un-archive the timeline.
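To illustrate the new behavior with a hypothetical exchange (the endpoint
path, port, and IDs below are assumptions; the 406 status and error text
come from the change in this patch), branching from an archived ancestor
is now rejected:

```
# Hypothetical sketch: ask the pageserver to branch from an archived timeline.
curl -i -X POST "http://pageserver:9898/v1/tenant/$TENANT_ID/timeline" \
  -H "Content-Type: application/json" \
  -d "{\"new_timeline_id\": \"$NEW_ID\", \"ancestor_timeline_id\": \"$ARCHIVED_ID\"}"

# Expected response, per timeline_create_handler below:
# HTTP/1.1 406 Not Acceptable
# {"msg": "ancestor timeline is archived"}
```

Un-archiving the ancestor first makes the same request succeed.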
Part of #8088
---
 pageserver/src/http/routes.rs | 4 ++++
 pageserver/src/tenant.rs      | 7 +++++++
 2 files changed, 11 insertions(+)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 6a10d4fb1c3e..ba38120bf1e6 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -589,6 +589,10 @@ async fn timeline_create_handler(
             StatusCode::SERVICE_UNAVAILABLE,
             HttpErrorBody::from_msg(e.to_string()),
         ),
+        Err(e @ tenant::CreateTimelineError::AncestorArchived) => json_response(
+            StatusCode::NOT_ACCEPTABLE,
+            HttpErrorBody::from_msg(e.to_string()),
+        ),
         Err(tenant::CreateTimelineError::ShuttingDown) => json_response(
             StatusCode::SERVICE_UNAVAILABLE,
             HttpErrorBody::from_msg("tenant shutting down".to_string()),
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 53cbaea621eb..2aebf4f99932 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -563,6 +563,8 @@ pub enum CreateTimelineError {
     AncestorLsn(anyhow::Error),
     #[error("ancestor timeline is not active")]
     AncestorNotActive,
+    #[error("ancestor timeline is archived")]
+    AncestorArchived,
     #[error("tenant shutting down")]
     ShuttingDown,
     #[error(transparent)]
@@ -1698,6 +1700,11 @@ impl Tenant {
             return Err(CreateTimelineError::AncestorNotActive);
         }
 
+        if ancestor_timeline.is_archived() == Some(true) {
+            info!("tried to branch archived timeline");
+            return Err(CreateTimelineError::AncestorArchived);
+        }
+
         if let Some(lsn) = ancestor_start_lsn.as_mut() {
             *lsn = lsn.align();