From db48f7e40de355268969b53df08730799cf906f5 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 10 Aug 2023 14:05:27 +0300 Subject: [PATCH 01/22] test: mark test_download_extensions.py skipped for now (#4948) The test mutates a shared directory which does not work with multiple concurrent tests. It is being fixed, so this should be a very temporary band-aid. Cc: #4949. --- test_runner/regress/test_download_extensions.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py index c4bee1a1ea9db..b208616345e90 100644 --- a/test_runner/regress/test_download_extensions.py +++ b/test_runner/regress/test_download_extensions.py @@ -82,6 +82,7 @@ def upload_files(env): # Test downloading remote extension. @pytest.mark.parametrize("remote_storage_kind", available_s3_storages()) +@pytest.mark.skip(reason="https://github.com/neondatabase/neon/issues/4949") def test_remote_extensions( neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind, @@ -148,6 +149,7 @@ def test_remote_extensions( # Test downloading remote library. @pytest.mark.parametrize("remote_storage_kind", available_s3_storages()) +@pytest.mark.skip(reason="https://github.com/neondatabase/neon/issues/4949") def test_remote_library( neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind, @@ -205,10 +207,11 @@ def test_remote_library( # Here we test a complex extension # which has multiple extensions in one archive # using postgis as an example -@pytest.mark.skipif( - RemoteStorageKind.REAL_S3 not in available_s3_storages(), - reason="skipping test because real s3 not enabled", -) +# @pytest.mark.skipif( +# RemoteStorageKind.REAL_S3 not in available_s3_storages(), +# reason="skipping test because real s3 not enabled", +# ) +@pytest.mark.skip(reason="https://github.com/neondatabase/neon/issues/4949") def test_multiple_extensions_one_archive( neon_env_builder: NeonEnvBuilder, pg_version: PgVersion, @@ -253,6 +256,7 @@ def test_multiple_extensions_one_archive( # Run the test with mutliple simultaneous connections to an endpoint. # to ensure that the extension is downloaded only once. # +@pytest.mark.skip(reason="https://github.com/neondatabase/neon/issues/4949") def test_extension_download_after_restart( neon_env_builder: NeonEnvBuilder, pg_version: PgVersion, From fa1f87b2685b478af1d3f76efa6eb8538dec2df7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 10 Aug 2023 13:43:13 +0200 Subject: [PATCH 02/22] Make the implementation of DiskBtreeReader::visit non-recursive (#4884) ## Problem The `DiskBtreeReader::visit` function calls `read_blk` internally, and while #4863 converted the API of `visit` to async, the internal function is still recursive. So, analogously to #4838, we turn the recursive function into an iterative one. ## Summary of changes First, we prepare the change by moving the for loop outside of the case switch, so that we only have one loop that calls recursion. Then, we switch from using recursion to an approach where we store the search path inside the tree on a stack on the heap. The caller of the `visit` function can control when the search over the B-Tree ends, by returning `false` from the closure. 
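To make the mechanism concrete, here is a minimal, self-contained sketch of the same idea on a toy in-memory tree. It is only an illustration, not the real code: it leaves out the on-disk node parsing, the key prefixes, the binary search for the starting position and the backwards direction, but it shows the stack of `(node, resumable child iterator)` pairs that replaces the recursion:

```rust
use std::ops::Range;

/// Toy node: inner nodes carry child ids, leaves carry values.
struct Node {
    children: Vec<usize>,
    values: Vec<u64>,
}

/// Depth-first visit without recursion: each stack entry stores a node id plus
/// an iterator over the child positions still left to visit, so popping an
/// entry resumes the search exactly where it was interrupted.
fn visit<V: FnMut(u64) -> bool>(nodes: &[Node], root: usize, mut visitor: V) -> bool {
    let mut stack: Vec<(usize, Option<Range<usize>>)> = vec![(root, None)];
    while let Some((node_id, opt_iter)) = stack.pop() {
        let node = &nodes[node_id];
        // `None` means this node is being entered for the first time.
        let mut iter = opt_iter.unwrap_or(0..node.children.len().max(node.values.len()));
        while let Some(idx) = iter.next() {
            if node.children.is_empty() {
                // Leaf entry: the visitor decides whether to continue.
                if !visitor(node.values[idx]) {
                    return false;
                }
            } else {
                // Inner node: spill the remaining iterator to the stack, then descend.
                stack.push((node_id, Some(iter)));
                stack.push((node.children[idx], None));
                break;
            }
        }
    }
    true
}

fn main() {
    // Root (id 0) with two leaf children; prints 10, 20, 30 in order.
    let nodes = vec![
        Node { children: vec![1, 2], values: vec![] },
        Node { children: vec![], values: vec![10, 20] },
        Node { children: vec![], values: vec![30] },
    ];
    visit(&nodes, 0, |v| {
        println!("{v}");
        true
    });
}
```

As in the real `visit`, returning `false` from the visitor aborts the whole search.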
This is often used to either only find one specific entry (by always returning `false`), but it is also used to iterate over all entries of the B-tree (by always returning `true`), or to look for ranges (mostly in tests, but `get_value_reconstruct_data` also has such a use). Each stack entry contains two things: the block number (aka the block's offset), and a children iterator. The children iterator is constructed depending on the search direction, and with the results of a binary search over node's children list. It is the only thing that survives a spilling/push to the stack, everything else is reconstructed. In other words, each stack spill, will, if the search is still ongoing, cause an entire re-parsing of the node. Theoretically, this would be a linear overhead in the number of leaves the search visits. However, one needs to note: * the workloads to look for a specific entry are just visiting one leaf, ever, so this is mostly about workloads that visit larger ranges, including ones that visit the entire B-tree. * the requests first hit the page cache, so often the cost is just in terms of node deserialization * for nodes that only have leaf nodes as children, no spilling to the stack-on-heap happens (outside of the initial request where the iterator is `None`). In other words, for balanced trees, the spilling overhead is $\Theta\left(\frac{n}{b^2}\right)$, where `b` is the branching factor and `n` is the number of nodes in the tree. The B-Trees in the current implementation have a branching factor of roughly `PAGE_SZ/L` where `PAGE_SZ` is 8192, and `L` is `DELTA_KEY_SIZE = 26` or `KEY_SIZE = 18` in production code, so this gives us an estimate that we'd be re-loading an inner node for every 99000 leaves in the B-tree in the worst case. Due to these points above, I'd say that not fully caching the inner nodes with inner children is reasonable, especially as we also want to be fast for the "find one specific entry" workloads, where the stack content is never accessed: any action to make the spilling computationally more complex would contribute to wasted cycles here, even if these workloads "only" spill one node for each depth level of the b-tree (which is practically always a low single-digit number, Kleppmann points out on page 81 that for branching factor 500, a four level B-tree with 4 KB pages can store 250 TB of data). But disclaimer, this is all stuff I thought about in my head, I have not confirmed it with any benchmarks or data. Builds on top of #4863, part of #4743 --- pageserver/src/tenant/disk_btree.rs | 167 ++++++++++++---------------- 1 file changed, 69 insertions(+), 98 deletions(-) diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs index b163d27b421e5..d0ebd3eb4e69b 100644 --- a/pageserver/src/tenant/disk_btree.rs +++ b/pageserver/src/tenant/disk_btree.rs @@ -20,6 +20,7 @@ //! use byteorder::{ReadBytesExt, BE}; use bytes::{BufMut, Bytes, BytesMut}; +use either::Either; use hex; use std::{cmp::Ordering, io, result}; use thiserror::Error; @@ -256,103 +257,77 @@ where where V: FnMut(&[u8], u64) -> bool, { - self.search_recurse(self.root_blk, search_key, dir, &mut visitor) - } - - fn search_recurse( - &self, - node_blknum: u32, - search_key: &[u8; L], - dir: VisitDirection, - visitor: &mut V, - ) -> Result - where - V: FnMut(&[u8], u64) -> bool, - { - // Locate the node. 
- let node_buf = self.reader.read_blk(self.start_blk + node_blknum)?; - - let node = OnDiskNode::deparse(node_buf.as_ref())?; - let prefix_len = node.prefix_len as usize; - let suffix_len = node.suffix_len as usize; - - assert!(node.num_children > 0); - - let mut keybuf = Vec::new(); - keybuf.extend(node.prefix); - keybuf.resize(prefix_len + suffix_len, 0); - - if dir == VisitDirection::Forwards { - // Locate the first match - let mut idx = match node.binary_search(search_key, keybuf.as_mut_slice()) { - Ok(idx) => idx, - Err(idx) => { - if node.level == 0 { - // Imagine that the node contains the following keys: - // - // 1 - // 3 <-- idx - // 5 - // - // If the search key is '2' and there is exact match, - // the binary search would return the index of key - // '3'. That's cool, '3' is the first key to return. - idx - } else { - // This is an internal page, so each key represents a lower - // bound for what's in the child page. If there is no exact - // match, we have to return the *previous* entry. - // - // 1 <-- return this - // 3 <-- idx - // 5 - idx.saturating_sub(1) + let mut stack = Vec::new(); + stack.push((self.root_blk, None)); + while let Some((node_blknum, opt_iter)) = stack.pop() { + // Locate the node. + let node_buf = self.reader.read_blk(self.start_blk + node_blknum)?; + + let node = OnDiskNode::deparse(node_buf.as_ref())?; + let prefix_len = node.prefix_len as usize; + let suffix_len = node.suffix_len as usize; + + assert!(node.num_children > 0); + + let mut keybuf = Vec::new(); + keybuf.extend(node.prefix); + keybuf.resize(prefix_len + suffix_len, 0); + + let mut iter = if let Some(iter) = opt_iter { + iter + } else if dir == VisitDirection::Forwards { + // Locate the first match + let idx = match node.binary_search(search_key, keybuf.as_mut_slice()) { + Ok(idx) => idx, + Err(idx) => { + if node.level == 0 { + // Imagine that the node contains the following keys: + // + // 1 + // 3 <-- idx + // 5 + // + // If the search key is '2' and there is exact match, + // the binary search would return the index of key + // '3'. That's cool, '3' is the first key to return. + idx + } else { + // This is an internal page, so each key represents a lower + // bound for what's in the child page. If there is no exact + // match, we have to return the *previous* entry. + // + // 1 <-- return this + // 3 <-- idx + // 5 + idx.saturating_sub(1) + } } - } - }; - // idx points to the first match now. Keep going from there - let mut key_off = idx * suffix_len; - while idx < node.num_children as usize { - let suffix = &node.keys[key_off..key_off + suffix_len]; - keybuf[prefix_len..].copy_from_slice(suffix); - let value = node.value(idx); - #[allow(clippy::collapsible_if)] - if node.level == 0 { - // leaf - if !visitor(&keybuf, value.to_u64()) { - return Ok(false); + }; + Either::Left(idx..node.num_children.into()) + } else { + let idx = match node.binary_search(search_key, keybuf.as_mut_slice()) { + Ok(idx) => { + // Exact match. That's the first entry to return, and walk + // backwards from there. + idx } - } else { - #[allow(clippy::collapsible_if)] - if !self.search_recurse(value.to_blknum(), search_key, dir, visitor)? { - return Ok(false); + Err(idx) => { + // No exact match. The binary search returned the index of the + // first key that's > search_key. Back off by one, and walk + // backwards from there. 
+ if let Some(idx) = idx.checked_sub(1) { + idx + } else { + return Ok(false); + } } - } - idx += 1; - key_off += suffix_len; - } - } else { - let mut idx = match node.binary_search(search_key, keybuf.as_mut_slice()) { - Ok(idx) => { - // Exact match. That's the first entry to return, and walk - // backwards from there. (The loop below starts from 'idx - - // 1', so add one here to compensate.) - idx + 1 - } - Err(idx) => { - // No exact match. The binary search returned the index of the - // first key that's > search_key. Back off by one, and walk - // backwards from there. (The loop below starts from idx - 1, - // so we don't need to subtract one here) - idx - } + }; + Either::Right((0..=idx).rev()) }; - // idx points to the first match + 1 now. Keep going from there. - let mut key_off = idx * suffix_len; - while idx > 0 { - idx -= 1; - key_off -= suffix_len; + // idx points to the first match now. Keep going from there + while let Some(idx) = iter.next() { + let key_off = idx * suffix_len; let suffix = &node.keys[key_off..key_off + suffix_len]; keybuf[prefix_len..].copy_from_slice(suffix); let value = node.value(idx); @@ -363,12 +338,8 @@ where return Ok(false); } } else { - #[allow(clippy::collapsible_if)] - if !self.search_recurse(value.to_blknum(), search_key, dir, visitor)? { - return Ok(false); - } - } - if idx == 0 { + stack.push((node_blknum, Some(iter))); + stack.push((value.to_blknum(), None)); break; } } From 119b86480f9186545f37047801b9a2a97066865e Mon Sep 17 00:00:00 2001 From: Alek Westover Date: Thu, 10 Aug 2023 08:24:43 -0400 Subject: [PATCH 03/22] test: make pg_regress less flaky, hopefully (#4903) `pg_regress` is flaky: https://github.com/neondatabase/neon/issues/559 Consolidated `CHECKPOINT` to `check_restored_datadir_content`, add a wait for `wait_for_last_flush_lsn`. Some recently introduced flakyness was fixed with #4948. --------- Co-authored-by: Joonas Koivunen --- test_runner/fixtures/neon_fixtures.py | 11 +++++++++-- test_runner/regress/test_pg_regress.py | 9 --------- test_runner/regress/test_subxacts.py | 4 ---- 3 files changed, 9 insertions(+), 15 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 7ba979745b922..901049188e94a 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2817,8 +2817,15 @@ def check_restored_datadir_content( endpoint: Endpoint, ): # Get the timeline ID. 
We need it for the 'basebackup' command - timeline = TimelineId(endpoint.safe_psql("SHOW neon.timeline_id")[0][0]) + timeline_id = TimelineId(endpoint.safe_psql("SHOW neon.timeline_id")[0][0]) + # many tests already checkpoint, but do it just in case + with closing(endpoint.connect()) as conn: + with conn.cursor() as cur: + cur.execute("CHECKPOINT") + + # wait for pageserver to catch up + wait_for_last_flush_lsn(env, endpoint, endpoint.tenant_id, timeline_id) # stop postgres to ensure that files won't change endpoint.stop() @@ -2833,7 +2840,7 @@ def check_restored_datadir_content( {psql_path} \ --no-psqlrc \ postgres://localhost:{env.pageserver.service_port.pg} \ - -c 'basebackup {endpoint.tenant_id} {timeline}' \ + -c 'basebackup {endpoint.tenant_id} {timeline_id}' \ | tar -x -C {restored_dir_path} """ diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index ee600db2c60f7..f26d04e2f3021 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -56,10 +56,6 @@ def test_pg_regress( with capsys.disabled(): pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath) - # checkpoint one more time to ensure that the lsn we get is the latest one - endpoint.safe_psql("CHECKPOINT") - - # Check that we restore the content of the datadir correctly check_restored_datadir_content(test_output_dir, env, endpoint) @@ -166,9 +162,4 @@ def test_sql_regress( with capsys.disabled(): pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath) - # checkpoint one more time to ensure that the lsn we get is the latest one - endpoint.safe_psql("CHECKPOINT") - endpoint.safe_psql("select pg_current_wal_insert_lsn()")[0][0] - - # Check that we restore the content of the datadir correctly check_restored_datadir_content(test_output_dir, env, endpoint) diff --git a/test_runner/regress/test_subxacts.py b/test_runner/regress/test_subxacts.py index 494820ef8ef90..eb96a8faa47d4 100644 --- a/test_runner/regress/test_subxacts.py +++ b/test_runner/regress/test_subxacts.py @@ -33,8 +33,4 @@ def test_subxacts(neon_simple_env: NeonEnv, test_output_dir): cur.execute(f"insert into t1 values ({i}, {j})") cur.execute("commit") - # force wal flush - cur.execute("checkpoint") - - # Check that we can restore the content of the datadir correctly check_restored_datadir_content(test_output_dir, env, endpoint) From 71f9d9e5a32a4afc42071849a9fe0380ce0e560f Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 10 Aug 2023 15:55:41 +0300 Subject: [PATCH 04/22] test: allow slow shutdown warning (#4953) Introduced in #4886, did not consider that tests with real_s3 could sometimes go over the limit. Do not fail tests because of that. 
--- pageserver/src/lib.rs | 1 + test_runner/fixtures/neon_fixtures.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index a3a9497d57f33..7acf7ed5d81cb 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -226,6 +226,7 @@ async fn timed( let ret = fut.await; + // this has a global allowed_errors tracing::warn!( task = name, elapsed_ms = started.elapsed().as_millis(), diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 901049188e94a..a15c3724be56d 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1518,6 +1518,8 @@ def __init__(self, env: NeonEnv, port: PageserverPort, config_override: Optional # Pageserver timeline deletion should be polled until it gets 404, so ignore it globally ".*Error processing HTTP request: NotFound: Timeline .* was not found", ".*took more than expected to complete.*", + # these can happen during shutdown, but it should not be a reason to fail a test + ".*completed, took longer than expected.*", ] def start( From 17aea78aa77662f8c4158b341b3d23226301982a Mon Sep 17 00:00:00 2001 From: Alek Westover Date: Thu, 10 Aug 2023 09:51:16 -0400 Subject: [PATCH 05/22] delete already present files from library index (#4955) --- compute_tools/src/extension_server.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs index da5d519ab51fd..1b5e4cf974f88 100644 --- a/compute_tools/src/extension_server.rs +++ b/compute_tools/src/extension_server.rs @@ -156,7 +156,7 @@ pub async fn get_available_extensions( let ext_index_full = serde_json::from_slice::(&ext_idx_buffer)?; let mut enabled_extensions = ext_index_full.public_extensions; enabled_extensions.extend_from_slice(custom_extensions); - let library_index = ext_index_full.library_index; + let mut library_index = ext_index_full.library_index; let all_extension_data = ext_index_full.extension_data; info!("library_index: {:?}", library_index); @@ -179,6 +179,8 @@ pub async fn get_available_extensions( file_create_tasks.push(tokio::fs::write(control_path, control_contents)); } else { warn!("control file {:?} exists both locally and remotely. ignoring the remote version.", control_file); + // also delete this from library index + library_index.retain(|_, value| value != extension_name); } } } From c58b22bacb1f2889955cf03f8f02666fd2a6c473 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Thu, 10 Aug 2023 18:53:16 +0300 Subject: [PATCH 06/22] Delete tenant's data from s3 (#4855) ## Summary of changes For context see https://github.com/neondatabase/neon/blob/main/docs/rfcs/022-pageserver-delete-from-s3.md Create Flow to delete tenant's data from pageserver. 
The approach heavily mimics previously implemented timeline deletion implemented mostly in https://github.com/neondatabase/neon/pull/4384 and followed up in https://github.com/neondatabase/neon/pull/4552 For remaining deletion related issues consult with deletion project here: https://github.com/orgs/neondatabase/projects/33 resolves #4250 resolves https://github.com/neondatabase/neon/issues/3889 --------- Co-authored-by: Joonas Koivunen --- libs/utils/src/fs_ext.rs | 37 +- pageserver/src/bin/pageserver.rs | 2 +- pageserver/src/config.rs | 9 +- pageserver/src/http/openapi_spec.yml | 42 ++ pageserver/src/http/routes.rs | 35 +- pageserver/src/lib.rs | 2 +- pageserver/src/tenant.rs | 180 +++++- pageserver/src/tenant/delete.rs | 546 ++++++++++++++++++ pageserver/src/tenant/mgr.rs | 33 +- pageserver/src/tenant/timeline.rs | 2 +- pageserver/src/tenant/timeline/delete.rs | 59 +- test_runner/fixtures/neon_fixtures.py | 12 +- test_runner/fixtures/pageserver/http.py | 4 + test_runner/fixtures/pageserver/utils.py | 83 ++- test_runner/fixtures/utils.py | 17 +- test_runner/regress/test_remote_storage.py | 5 + test_runner/regress/test_tenant_delete.py | 250 ++++++++ test_runner/regress/test_tenant_detach.py | 4 + .../test_tenants_with_remote_storage.py | 5 + test_runner/regress/test_timeline_delete.py | 38 +- 20 files changed, 1252 insertions(+), 113 deletions(-) create mode 100644 pageserver/src/tenant/delete.rs create mode 100644 test_runner/regress/test_tenant_delete.py diff --git a/libs/utils/src/fs_ext.rs b/libs/utils/src/fs_ext.rs index 090912d2765da..dfb7d5abbfa48 100644 --- a/libs/utils/src/fs_ext.rs +++ b/libs/utils/src/fs_ext.rs @@ -24,6 +24,20 @@ pub async fn is_directory_empty(path: impl AsRef) -> anyhow::Result Ok(dir.next_entry().await?.is_none()) } +pub async fn list_dir(path: impl AsRef) -> anyhow::Result> { + let mut dir = tokio::fs::read_dir(&path) + .await + .context(format!("read_dir({})", path.as_ref().display()))?; + + let mut content = vec![]; + while let Some(next) = dir.next_entry().await? 
{ + let file_name = next.file_name(); + content.push(file_name.to_string_lossy().to_string()); + } + + Ok(content) +} + pub fn ignore_not_found(e: io::Error) -> io::Result<()> { if e.kind() == io::ErrorKind::NotFound { Ok(()) @@ -43,7 +57,7 @@ where mod test { use std::path::PathBuf; - use crate::fs_ext::is_directory_empty; + use crate::fs_ext::{is_directory_empty, list_dir}; use super::ignore_absent_files; @@ -109,4 +123,25 @@ mod test { assert!(!file_path.exists()); } + + #[tokio::test] + async fn list_dir_works() { + let dir = tempfile::tempdir().unwrap(); + let dir_path = dir.path(); + + assert!(list_dir(dir_path).await.unwrap().is_empty()); + + let file_path: PathBuf = dir_path.join("testfile"); + let _ = std::fs::File::create(&file_path).unwrap(); + + assert_eq!(&list_dir(dir_path).await.unwrap(), &["testfile"]); + + let another_dir_path: PathBuf = dir_path.join("testdir"); + std::fs::create_dir(another_dir_path).unwrap(); + + let expected = &["testdir", "testfile"]; + let mut actual = list_dir(dir_path).await.unwrap(); + actual.sort(); + assert_eq!(actual, expected); + } } diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index a86993d7f3907..e0c969279ee7d 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -373,7 +373,7 @@ fn start_pageserver( let order = pageserver::InitializationOrder { initial_tenant_load: Some(init_done_tx), initial_logical_size_can_start: init_done_rx.clone(), - initial_logical_size_attempt: init_logical_size_done_tx, + initial_logical_size_attempt: Some(init_logical_size_done_tx), background_jobs_can_start: background_jobs_barrier.clone(), }; diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index be806c77ec05e..f2aa2f365eb4c 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -31,7 +31,9 @@ use utils::{ use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig; use crate::tenant::config::TenantConf; use crate::tenant::config::TenantConfOpt; -use crate::tenant::{TENANT_ATTACHING_MARKER_FILENAME, TIMELINES_SEGMENT_NAME}; +use crate::tenant::{ + TENANT_ATTACHING_MARKER_FILENAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME, +}; use crate::{ IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX, @@ -613,6 +615,11 @@ impl PageServerConf { ) } + pub fn tenant_deleted_mark_file_path(&self, tenant_id: &TenantId) -> PathBuf { + self.tenant_path(tenant_id) + .join(TENANT_DELETED_MARKER_FILE_NAME) + } + pub fn traces_path(&self) -> PathBuf { self.workdir.join("traces") } diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 290492203a2b1..38e07f172d3d7 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -93,6 +93,47 @@ paths: application/json: schema: $ref: "#/components/schemas/Error" + delete: + description: | + Attempts to delete specified tenant. 500 and 409 errors should be retried until 404 is retrieved. 
+ 404 means that deletion successfully finished" + responses: + "400": + description: Error when no tenant id found in path + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + "401": + description: Unauthorized Error + content: + application/json: + schema: + $ref: "#/components/schemas/UnauthorizedError" + "403": + description: Forbidden Error + content: + application/json: + schema: + $ref: "#/components/schemas/ForbiddenError" + "404": + description: Tenant not found + content: + application/json: + schema: + $ref: "#/components/schemas/NotFoundError" + "409": + description: Deletion is already in progress, continue polling + content: + application/json: + schema: + $ref: "#/components/schemas/ConflictError" + "500": + description: Generic operation error + content: + application/json: + schema: + $ref: "#/components/schemas/Error" /v1/tenant/{tenant_id}/timeline: parameters: @@ -820,6 +861,7 @@ paths: application/json: schema: $ref: "#/components/schemas/Error" + /v1/tenant/config: put: description: | diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index c10223beedefa..1e8dada85eb10 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -187,7 +187,7 @@ impl From for ApiError { format!("Cannot delete timeline which has child timelines: {children:?}") .into_boxed_str(), ), - a @ AlreadyInProgress => ApiError::Conflict(a.to_string()), + a @ AlreadyInProgress(_) => ApiError::Conflict(a.to_string()), Other(e) => ApiError::InternalServerError(e), } } @@ -208,6 +208,19 @@ impl From for ApiError { } } +impl From for ApiError { + fn from(value: crate::tenant::delete::DeleteTenantError) -> Self { + use crate::tenant::delete::DeleteTenantError::*; + match value { + Get(g) => ApiError::from(g), + e @ AlreadyInProgress => ApiError::Conflict(e.to_string()), + Timeline(t) => ApiError::from(t), + Other(o) => ApiError::InternalServerError(o), + e @ InvalidState(_) => ApiError::PreconditionFailed(e.to_string().into_boxed_str()), + } + } +} + // Helper function to construct a TimelineInfo struct for a timeline async fn build_timeline_info( timeline: &Arc, @@ -617,6 +630,23 @@ async fn tenant_status( json_response(StatusCode::OK, tenant_info) } +async fn tenant_delete_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + // TODO openapi spec + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + check_permission(&request, Some(tenant_id))?; + + let state = get_state(&request); + + mgr::delete_tenant(state.conf, state.remote_storage.clone(), tenant_id) + .instrument(info_span!("tenant_delete_handler", %tenant_id)) + .await?; + + json_response(StatusCode::ACCEPTED, ()) +} + /// HTTP endpoint to query the current tenant_size of a tenant. 
/// /// This is not used by consumption metrics under [`crate::consumption_metrics`], but can be used @@ -1345,6 +1375,9 @@ pub fn make_router( .get("/v1/tenant", |r| api_handler(r, tenant_list_handler)) .post("/v1/tenant", |r| api_handler(r, tenant_create_handler)) .get("/v1/tenant/:tenant_id", |r| api_handler(r, tenant_status)) + .delete("/v1/tenant/:tenant_id", |r| { + api_handler(r, tenant_delete_handler) + }) .get("/v1/tenant/:tenant_id/synthetic_size", |r| { api_handler(r, tenant_size_handler) }) diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 7acf7ed5d81cb..b918dff193d0d 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -190,7 +190,7 @@ pub struct InitializationOrder { /// Each timeline owns a clone of this to be consumed on the initial logical size calculation /// attempt. It is important to drop this once the attempt has completed. - pub initial_logical_size_attempt: utils::completion::Completion, + pub initial_logical_size_attempt: Option, /// Barrier for when we can start any background jobs. /// diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index a9885a5f700c1..524e3a60a1c07 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -28,6 +28,7 @@ use std::cmp::min; use std::collections::hash_map::Entry; use std::collections::BTreeSet; use std::collections::HashMap; +use std::fmt::Debug; use std::fs; use std::fs::File; use std::fs::OpenOptions; @@ -46,8 +47,10 @@ use std::sync::{Mutex, RwLock}; use std::time::{Duration, Instant}; use self::config::TenantConf; +use self::delete::DeleteTenantFlow; use self::metadata::LoadMetadataError; use self::metadata::TimelineMetadata; +use self::mgr::TenantsMap; use self::remote_timeline_client::RemoteTimelineClient; use self::timeline::uninit::TimelineUninitMark; use self::timeline::uninit::UninitializedTimeline; @@ -106,6 +109,7 @@ macro_rules! pausable_failpoint { pub mod blob_io; pub mod block_io; + pub mod disk_btree; pub(crate) mod ephemeral_file; pub mod layer_map; @@ -118,6 +122,7 @@ mod remote_timeline_client; pub mod storage_layer; pub mod config; +pub mod delete; pub mod mgr; pub mod tasks; pub mod upload_queue; @@ -145,6 +150,8 @@ pub const TIMELINES_SEGMENT_NAME: &str = "timelines"; pub const TENANT_ATTACHING_MARKER_FILENAME: &str = "attaching"; +pub const TENANT_DELETED_MARKER_FILE_NAME: &str = "deleted"; + /// /// Tenant consists of multiple timelines. Keep them in a hash table. /// @@ -183,6 +190,8 @@ pub struct Tenant { cached_synthetic_tenant_size: Arc, eviction_task_tenant_state: tokio::sync::Mutex, + + pub(crate) delete_progress: Arc>, } // We should not blindly overwrite local metadata with remote one. 
@@ -274,7 +283,7 @@ pub enum LoadLocalTimelineError { ResumeDeletion(#[source] anyhow::Error), } -#[derive(Debug, thiserror::Error)] +#[derive(thiserror::Error)] pub enum DeleteTimelineError { #[error("NotFound")] NotFound, @@ -283,17 +292,37 @@ pub enum DeleteTimelineError { HasChildren(Vec), #[error("Timeline deletion is already in progress")] - AlreadyInProgress, + AlreadyInProgress(Arc>), #[error(transparent)] Other(#[from] anyhow::Error), } +impl Debug for DeleteTimelineError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::NotFound => write!(f, "NotFound"), + Self::HasChildren(c) => f.debug_tuple("HasChildren").field(c).finish(), + Self::AlreadyInProgress(_) => f.debug_tuple("AlreadyInProgress").finish(), + Self::Other(e) => f.debug_tuple("Other").field(e).finish(), + } + } +} + pub enum SetStoppingError { AlreadyStopping(completion::Barrier), Broken, } +impl Debug for SetStoppingError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::AlreadyStopping(_) => f.debug_tuple("AlreadyStopping").finish(), + Self::Broken => write!(f, "Broken"), + } + } +} + struct RemoteStartupData { index_part: IndexPart, remote_metadata: TimelineMetadata, @@ -616,7 +645,7 @@ impl Tenant { // For every timeline, download the metadata file, scan the local directory, // and build a layer map that contains an entry for each remote and local // layer file. - let sorted_timelines = tree_sort_timelines(timeline_ancestors)?; + let sorted_timelines = tree_sort_timelines(timeline_ancestors, |m| m.ancestor_timeline())?; for (timeline_id, remote_metadata) in sorted_timelines { let (index_part, remote_client) = remote_index_and_client .remove(&timeline_id) @@ -740,12 +769,13 @@ impl Tenant { /// If the loading fails for some reason, the Tenant will go into Broken /// state. #[instrument(skip_all, fields(tenant_id=%tenant_id))] - pub fn spawn_load( + pub(crate) fn spawn_load( conf: &'static PageServerConf, tenant_id: TenantId, broker_client: storage_broker::BrokerClientChannel, remote_storage: Option, init_order: Option, + tenants: &'static tokio::sync::RwLock, ctx: &RequestContext, ) -> Arc { span::debug_assert_current_span_has_tenant_id(); @@ -765,7 +795,7 @@ impl Tenant { tenant_conf, wal_redo_manager, tenant_id, - remote_storage, + remote_storage.clone(), ); let tenant = Arc::new(tenant); @@ -781,27 +811,83 @@ impl Tenant { "initial tenant load", false, async move { + let make_broken = |t: &Tenant, err: anyhow::Error| { + error!("load failed, setting tenant state to Broken: {err:?}"); + t.state.send_modify(|state| { + assert!( + matches!(*state, TenantState::Loading | TenantState::Stopping { .. }), + "the loading task owns the tenant state until activation is complete" + ); + *state = TenantState::broken_from_reason(err.to_string()); + }); + }; + let mut init_order = init_order; // take the completion because initial tenant loading will complete when all of // these tasks complete. 
- let _completion = init_order.as_mut().and_then(|x| x.initial_tenant_load.take()); + let _completion = init_order + .as_mut() + .and_then(|x| x.initial_tenant_load.take()); + + // Dont block pageserver startup on figuring out deletion status + let pending_deletion = { + match DeleteTenantFlow::should_resume_deletion( + conf, + remote_storage.as_ref(), + &tenant_clone, + ) + .await + { + Ok(should_resume_deletion) => should_resume_deletion, + Err(err) => { + make_broken(&tenant_clone, anyhow::anyhow!(err)); + return Ok(()); + } + } + }; + + info!("pending deletion {}", pending_deletion.is_some()); + + if let Some(deletion) = pending_deletion { + // as we are no longer loading, signal completion by dropping + // the completion while we resume deletion + drop(_completion); + // do not hold to initial_logical_size_attempt as it will prevent loading from proceeding without timeout + let _ = init_order + .as_mut() + .and_then(|x| x.initial_logical_size_attempt.take()); + + match DeleteTenantFlow::resume( + deletion, + &tenant_clone, + init_order.as_ref(), + tenants, + &ctx, + ) + .await + { + Err(err) => { + make_broken(&tenant_clone, anyhow::anyhow!(err)); + return Ok(()); + } + Ok(()) => return Ok(()), + } + } + + let background_jobs_can_start = + init_order.as_ref().map(|x| &x.background_jobs_can_start); match tenant_clone.load(init_order.as_ref(), &ctx).await { Ok(()) => { - debug!("load finished, activating"); - let background_jobs_can_start = init_order.as_ref().map(|x| &x.background_jobs_can_start); + debug!("load finished",); + tenant_clone.activate(broker_client, background_jobs_can_start, &ctx); } - Err(err) => { - error!("load failed, setting tenant state to Broken: {err:?}"); - tenant_clone.state.send_modify(|state| { - assert_eq!(*state, TenantState::Loading, "the loading task owns the tenant state until activation is complete"); - *state = TenantState::broken_from_reason(err.to_string()); - }); - } + Err(err) => make_broken(&tenant_clone, err), } - Ok(()) + + Ok(()) } .instrument({ let span = tracing::info_span!(parent: None, "load", tenant_id=%tenant_id); @@ -877,6 +963,8 @@ impl Tenant { ) })?; + info!("Found deletion mark for timeline {}", timeline_id); + match load_metadata(self.conf, &self.tenant_id, &timeline_id) { Ok(metadata) => { timelines_to_resume_deletion.push((timeline_id, Some(metadata))) @@ -966,9 +1054,11 @@ impl Tenant { // Sort the array of timeline IDs into tree-order, so that parent comes before // all its children. - tree_sort_timelines(timelines_to_load).map(|sorted_timelines| TenantDirectoryScan { - sorted_timelines_to_load: sorted_timelines, - timelines_to_resume_deletion, + tree_sort_timelines(timelines_to_load, |m| m.ancestor_timeline()).map(|sorted_timelines| { + TenantDirectoryScan { + sorted_timelines_to_load: sorted_timelines, + timelines_to_resume_deletion, + } }) } @@ -1682,7 +1772,7 @@ impl Tenant { // It's mesed up. // we just ignore the failure to stop - match self.set_stopping(shutdown_progress).await { + match self.set_stopping(shutdown_progress, false).await { Ok(()) => {} Err(SetStoppingError::Broken) => { // assume that this is acceptable @@ -1722,18 +1812,25 @@ impl Tenant { /// This function waits for the tenant to become active if it isn't already, before transitioning it into Stopping state. /// /// This function is not cancel-safe! - async fn set_stopping(&self, progress: completion::Barrier) -> Result<(), SetStoppingError> { + /// + /// `allow_transition_from_loading` is needed for the special case of loading task deleting the tenant. 
+ async fn set_stopping( + &self, + progress: completion::Barrier, + allow_transition_from_loading: bool, + ) -> Result<(), SetStoppingError> { let mut rx = self.state.subscribe(); // cannot stop before we're done activating, so wait out until we're done activating rx.wait_for(|state| match state { - TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => { + TenantState::Activating(_) | TenantState::Attaching => { info!( "waiting for {} to turn Active|Broken|Stopping", <&'static str>::from(state) ); false } + TenantState::Loading => allow_transition_from_loading, TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping { .. } => true, }) .await @@ -1742,9 +1839,16 @@ impl Tenant { // we now know we're done activating, let's see whether this task is the winner to transition into Stopping let mut err = None; let stopping = self.state.send_if_modified(|current_state| match current_state { - TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => { + TenantState::Activating(_) | TenantState::Attaching => { unreachable!("we ensured above that we're done with activation, and, there is no re-activation") } + TenantState::Loading => { + if !allow_transition_from_loading { + unreachable!("we ensured above that we're done with activation, and, there is no re-activation") + }; + *current_state = TenantState::Stopping { progress }; + true + } TenantState::Active => { // FIXME: due to time-of-check vs time-of-use issues, it can happen that new timelines // are created after the transition to Stopping. That's harmless, as the Timelines @@ -1813,6 +1917,10 @@ impl Tenant { .expect("cannot drop self.state while on a &self method"); // we now know we're done activating, let's see whether this task is the winner to transition into Broken + self.set_broken_no_wait(reason) + } + + pub(crate) fn set_broken_no_wait(&self, reason: String) { self.state.send_modify(|current_state| { match *current_state { TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => { @@ -1878,22 +1986,28 @@ impl Tenant { /// Given a Vec of timelines and their ancestors (timeline_id, ancestor_id), /// perform a topological sort, so that the parent of each timeline comes /// before the children. -fn tree_sort_timelines( - timelines: HashMap, -) -> anyhow::Result> { +/// E extracts the ancestor from T +/// This allows for T to be different. It can be TimelineMetadata, can be Timeline itself, etc. 
+fn tree_sort_timelines( + timelines: HashMap, + extractor: E, +) -> anyhow::Result> +where + E: Fn(&T) -> Option, +{ let mut result = Vec::with_capacity(timelines.len()); let mut now = Vec::with_capacity(timelines.len()); // (ancestor, children) - let mut later: HashMap> = + let mut later: HashMap> = HashMap::with_capacity(timelines.len()); - for (timeline_id, metadata) in timelines { - if let Some(ancestor_id) = metadata.ancestor_timeline() { + for (timeline_id, value) in timelines { + if let Some(ancestor_id) = extractor(&value) { let children = later.entry(ancestor_id).or_default(); - children.push((timeline_id, metadata)); + children.push((timeline_id, value)); } else { - now.push((timeline_id, metadata)); + now.push((timeline_id, value)); } } @@ -2062,7 +2176,7 @@ impl Tenant { remote_client, pg_version, initial_logical_size_can_start.cloned(), - initial_logical_size_attempt.cloned(), + initial_logical_size_attempt.cloned().flatten(), state, ); @@ -2146,6 +2260,7 @@ impl Tenant { cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()), cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)), eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()), + delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTenantFlow::default())), } } @@ -2162,6 +2277,7 @@ impl Tenant { // FIXME If the config file is not found, assume that we're attaching // a detached tenant and config is passed via attach command. // https://github.com/neondatabase/neon/issues/1555 + // OR: we're loading after incomplete deletion that managed to remove config. if !target_config_path.exists() { info!("tenant config not found in {target_config_display}"); return Ok(TenantConfOpt::default()); diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs new file mode 100644 index 0000000000000..bdeb117c7953d --- /dev/null +++ b/pageserver/src/tenant/delete.rs @@ -0,0 +1,546 @@ +use std::{ + path::{Path, PathBuf}, + sync::Arc, +}; + +use anyhow::Context; +use pageserver_api::models::TenantState; +use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath}; +use tokio::sync::OwnedMutexGuard; +use tracing::{error, info, instrument, warn, Instrument, Span}; + +use utils::{ + completion, crashsafe, fs_ext, + id::{TenantId, TimelineId}, +}; + +use crate::{ + config::PageServerConf, + context::RequestContext, + task_mgr::{self, TaskKind}, + InitializationOrder, +}; + +use super::{ + mgr::{GetTenantError, TenantsMap}, + span, + timeline::delete::DeleteTimelineFlow, + tree_sort_timelines, DeleteTimelineError, Tenant, +}; + +const SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS: u8 = 3; + +#[derive(Debug, thiserror::Error)] +pub enum DeleteTenantError { + #[error("GetTenant {0}")] + Get(#[from] GetTenantError), + + #[error("Invalid state {0}. 
Expected Active or Broken")] + InvalidState(TenantState), + + #[error("Tenant deletion is already in progress")] + AlreadyInProgress, + + #[error("Timeline {0}")] + Timeline(#[from] DeleteTimelineError), + + #[error(transparent)] + Other(#[from] anyhow::Error), +} + +type DeletionGuard = tokio::sync::OwnedMutexGuard; + +fn remote_tenant_delete_mark_path( + conf: &PageServerConf, + tenant_id: &TenantId, +) -> anyhow::Result { + let tenant_remote_path = conf + .tenant_path(tenant_id) + .strip_prefix(&conf.workdir) + .context("Failed to strip workdir prefix") + .and_then(RemotePath::new) + .context("tenant path")?; + Ok(tenant_remote_path.join(Path::new("deleted"))) +} + +async fn create_remote_delete_mark( + conf: &PageServerConf, + remote_storage: &GenericRemoteStorage, + tenant_id: &TenantId, +) -> Result<(), DeleteTenantError> { + let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_id)?; + + let data: &[u8] = &[]; + remote_storage + .upload(data, 0, &remote_mark_path, None) + .await + .context("mark upload")?; + + Ok(()) +} + +async fn create_local_delete_mark( + conf: &PageServerConf, + tenant_id: &TenantId, +) -> Result<(), DeleteTenantError> { + let marker_path = conf.tenant_deleted_mark_file_path(tenant_id); + + // Note: we're ok to replace existing file. + let _ = std::fs::OpenOptions::new() + .write(true) + .create(true) + .open(&marker_path) + .with_context(|| format!("could not create delete marker file {marker_path:?}"))?; + + crashsafe::fsync_file_and_parent(&marker_path).context("sync_mark")?; + + Ok(()) +} + +async fn schedule_ordered_timeline_deletions( + tenant: &Arc, +) -> Result>, TimelineId)>, DeleteTenantError> { + // Tenant is stopping at this point. We know it will be deleted. + // No new timelines should be created. + // Tree sort timelines to delete from leafs to the root. + // NOTE: by calling clone we release the mutex which creates a possibility for a race: pending deletion + // can complete and remove timeline from the map in between our call to clone + // and `DeleteTimelineFlow::run`, so `run` wont find timeline in `timelines` map. + // timelines.lock is currently synchronous so we cant hold it across await point. + // So just ignore NotFound error if we get it from `run`. + // Beware: in case it becomes async and we try to hold it here, `run` also locks it, which can create a deadlock. + let timelines = tenant.timelines.lock().unwrap().clone(); + let sorted = + tree_sort_timelines(timelines, |t| t.get_ancestor_timeline_id()).context("tree sort")?; + + let mut already_running_deletions = vec![]; + + for (timeline_id, _) in sorted.into_iter().rev() { + if let Err(e) = DeleteTimelineFlow::run(tenant, timeline_id, true).await { + match e { + DeleteTimelineError::NotFound => { + // Timeline deletion finished after call to clone above but before call + // to `DeleteTimelineFlow::run` and removed timeline from the map. + continue; + } + DeleteTimelineError::AlreadyInProgress(guard) => { + already_running_deletions.push((guard, timeline_id)); + continue; + } + e => return Err(DeleteTenantError::Timeline(e)), + } + } + } + + Ok(already_running_deletions) +} + +async fn ensure_timelines_dir_empty(timelines_path: &Path) -> Result<(), DeleteTenantError> { + // Assert timelines dir is empty. + if !fs_ext::is_directory_empty(timelines_path).await? 
{ + // Display first 10 items in directory + let list = &fs_ext::list_dir(timelines_path).await.context("list_dir")?[..10]; + return Err(DeleteTenantError::Other(anyhow::anyhow!( + "Timelines directory is not empty after all timelines deletion: {list:?}" + ))); + } + + Ok(()) +} + +async fn remove_tenant_remote_delete_mark( + conf: &PageServerConf, + remote_storage: Option<&GenericRemoteStorage>, + tenant_id: &TenantId, +) -> Result<(), DeleteTenantError> { + if let Some(remote_storage) = remote_storage { + remote_storage + .delete(&remote_tenant_delete_mark_path(conf, tenant_id)?) + .await?; + } + Ok(()) +} + +// Cleanup fs traces: tenant config, timelines dir local delete mark, tenant dir +async fn cleanup_remaining_fs_traces( + conf: &PageServerConf, + tenant_id: &TenantId, +) -> Result<(), DeleteTenantError> { + let rm = |p: PathBuf, is_dir: bool| async move { + if is_dir { + tokio::fs::remove_dir(&p).await + } else { + tokio::fs::remove_file(&p).await + } + .or_else(fs_ext::ignore_not_found) + .with_context(|| { + let to_display = p.display(); + format!("failed to delete {to_display}") + }) + }; + + rm(conf.tenant_config_path(tenant_id), false).await?; + + fail::fail_point!("tenant-delete-before-remove-timelines-dir", |_| { + Err(anyhow::anyhow!( + "failpoint: tenant-delete-before-remove-timelines-dir" + ))? + }); + + rm(conf.timelines_path(tenant_id), true).await?; + + fail::fail_point!("tenant-delete-before-remove-deleted-mark", |_| { + Err(anyhow::anyhow!( + "failpoint: tenant-delete-before-remove-deleted-mark" + ))? + }); + + rm(conf.tenant_deleted_mark_file_path(tenant_id), false).await?; + + fail::fail_point!("tenant-delete-before-remove-tenant-dir", |_| { + Err(anyhow::anyhow!( + "failpoint: tenant-delete-before-remove-tenant-dir" + ))? + }); + + rm(conf.tenant_path(tenant_id), true).await?; + + Ok(()) +} + +/// Orchestrates tenant shut down of all tasks, removes its in-memory structures, +/// and deletes its data from both disk and s3. +/// The sequence of steps: +/// 1. Upload remote deletion mark. +/// 2. Create local mark file. +/// 3. Shutdown tasks +/// 4. Run ordered timeline deletions +/// 5. Wait for timeline deletion operations that were scheduled before tenant deletion was requested +/// 6. Remove remote mark +/// 7. Cleanup remaining fs traces, tenant dir, config, timelines dir, local delete mark +/// It is resumable from any step in case a crash/restart occurs. +/// There are three entrypoints to the process: +/// 1. [`DeleteTenantFlow::run`] this is the main one called by a management api handler. +/// 2. [`DeleteTenantFlow::resume`] is called during restarts when local or remote deletion marks are still there. +/// Note the only other place that messes around timeline delete mark is the `Tenant::spawn_load` function. +#[derive(Default)] +pub enum DeleteTenantFlow { + #[default] + NotStarted, + InProgress, + Finished, +} + +impl DeleteTenantFlow { + // These steps are run in the context of management api request handler. + // Long running steps are continued to run in the background. + // NB: If this fails half-way through, and is retried, the retry will go through + // all the same steps again. Make sure the code here is idempotent, and don't + // error out if some of the shutdown tasks have already been completed! + // NOTE: static needed for background part. + // We assume that calling code sets up the span with tenant_id. 
+ #[instrument(skip_all)] + pub(crate) async fn run( + conf: &'static PageServerConf, + remote_storage: Option, + tenants: &'static tokio::sync::RwLock, + tenant_id: TenantId, + ) -> Result<(), DeleteTenantError> { + span::debug_assert_current_span_has_tenant_id(); + + let (tenant, mut guard) = Self::prepare(tenants, tenant_id).await?; + + if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await { + tenant.set_broken(format!("{e:#}")).await; + return Err(e); + } + + Self::schedule_background(guard, conf, remote_storage, tenants, tenant); + + Ok(()) + } + + // Helper function needed to be able to match once on returned error and transition tenant into broken state. + // This is needed because tenant.shutwodn is not idempotent. If tenant state is set to stopping another call to tenant.shutdown + // will result in an error, but here we need to be able to retry shutdown when tenant deletion is retried. + // So the solution is to set tenant state to broken. + async fn run_inner( + guard: &mut OwnedMutexGuard, + conf: &'static PageServerConf, + remote_storage: Option<&GenericRemoteStorage>, + tenant: &Tenant, + ) -> Result<(), DeleteTenantError> { + guard.mark_in_progress()?; + + fail::fail_point!("tenant-delete-before-create-remote-mark", |_| { + Err(anyhow::anyhow!( + "failpoint: tenant-delete-before-create-remote-mark" + ))? + }); + + // IDEA: implement detach as delete without remote storage. Then they would use the same lock (deletion_progress) so wont contend. + // Though sounds scary, different mark name? + // Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state. + if let Some(remote_storage) = &remote_storage { + create_remote_delete_mark(conf, remote_storage, &tenant.tenant_id) + .await + .context("remote_mark")? + } + + fail::fail_point!("tenant-delete-before-create-local-mark", |_| { + Err(anyhow::anyhow!( + "failpoint: tenant-delete-before-create-local-mark" + ))? + }); + + create_local_delete_mark(conf, &tenant.tenant_id) + .await + .context("local delete mark")?; + + fail::fail_point!("tenant-delete-before-background", |_| { + Err(anyhow::anyhow!( + "failpoint: tenant-delete-before-background" + ))? + }); + + Ok(()) + } + + fn mark_in_progress(&mut self) -> anyhow::Result<()> { + match self { + Self::Finished => anyhow::bail!("Bug. Is in finished state"), + Self::InProgress { .. } => { /* We're in a retry */ } + Self::NotStarted => { /* Fresh start */ } + } + + *self = Self::InProgress; + + Ok(()) + } + + pub async fn should_resume_deletion( + conf: &'static PageServerConf, + remote_storage: Option<&GenericRemoteStorage>, + tenant: &Tenant, + ) -> Result, DeleteTenantError> { + let acquire = |t: &Tenant| { + Some( + Arc::clone(&t.delete_progress) + .try_lock_owned() + .expect("we're the only owner during init"), + ) + }; + + let tenant_id = tenant.tenant_id; + // Check local mark first, if its there there is no need to go to s3 to check whether remote one exists. 
+ if conf.tenant_deleted_mark_file_path(&tenant_id).exists() { + return Ok(acquire(tenant)); + } + + // If remote storage is there we rely on it + if let Some(remote_storage) = remote_storage { + let remote_mark_path = remote_tenant_delete_mark_path(conf, &tenant_id)?; + + let attempt = 1; + loop { + match remote_storage.download(&remote_mark_path).await { + Ok(_) => return Ok(acquire(tenant)), + Err(e) => { + if matches!(e, DownloadError::NotFound) { + return Ok(None); + } + if attempt > SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS { + return Err(anyhow::anyhow!(e))?; + } + + warn!( + "failed to fetch tenant deletion mark at {} attempt {}", + &remote_mark_path, attempt + ) + } + } + } + } + + Ok(None) + } + + pub(crate) async fn resume( + guard: DeletionGuard, + tenant: &Arc, + init_order: Option<&InitializationOrder>, + tenants: &'static tokio::sync::RwLock, + ctx: &RequestContext, + ) -> Result<(), DeleteTenantError> { + let (_, progress) = completion::channel(); + + tenant + .set_stopping(progress, true) + .await + .expect("cant be stopping or broken"); + + // Do not consume valuable resources during the load phase, continue deletion once init phase is complete. + let background_jobs_can_start = init_order.as_ref().map(|x| &x.background_jobs_can_start); + if let Some(background) = background_jobs_can_start { + info!("waiting for backgound jobs barrier"); + background.clone().wait().await; + info!("ready for backgound jobs barrier"); + } + + // Tenant may not be loadable if we fail late in cleanup_remaining_fs_traces (e g remove timelines dir) + let timelines_path = tenant.conf.timelines_path(&tenant.tenant_id); + if timelines_path.exists() { + tenant.load(init_order, ctx).await.context("load")?; + } + + Self::background( + guard, + tenant.conf, + tenant.remote_storage.clone(), + tenants, + tenant, + ) + .await + } + + async fn prepare( + tenants: &tokio::sync::RwLock, + tenant_id: TenantId, + ) -> Result<(Arc, tokio::sync::OwnedMutexGuard), DeleteTenantError> { + let m = tenants.read().await; + + let tenant = m + .get(&tenant_id) + .ok_or(GetTenantError::NotFound(tenant_id))?; + + // FIXME: unsure about active only. Our init jobs may not be cancellable properly, + // so at least for now allow deletions only for active tenants. TODO recheck + // Broken and Stopping is needed for retries. + if !matches!( + tenant.current_state(), + TenantState::Active | TenantState::Broken { .. } + ) { + return Err(DeleteTenantError::InvalidState(tenant.current_state())); + } + + let guard = Arc::clone(&tenant.delete_progress) + .try_lock_owned() + .map_err(|_| DeleteTenantError::AlreadyInProgress)?; + + fail::fail_point!("tenant-delete-before-shutdown", |_| { + Err(anyhow::anyhow!("failpoint: tenant-delete-before-shutdown"))? + }); + + // make pageserver shutdown not to wait for our completion + let (_, progress) = completion::channel(); + + // It would be good to only set stopping here and continue shutdown in the background, but shutdown is not idempotent. + // i e it is an error to do: + // tenant.set_stopping + // tenant.shutdown + // Its also bad that we're holding tenants.read here. + // TODO relax set_stopping to be idempotent? 
+ if tenant.shutdown(progress, false).await.is_err() { + return Err(DeleteTenantError::Other(anyhow::anyhow!( + "tenant shutdown is already in progress" + ))); + } + + Ok((Arc::clone(tenant), guard)) + } + + fn schedule_background( + guard: OwnedMutexGuard, + conf: &'static PageServerConf, + remote_storage: Option, + tenants: &'static tokio::sync::RwLock, + tenant: Arc, + ) { + let tenant_id = tenant.tenant_id; + + task_mgr::spawn( + task_mgr::BACKGROUND_RUNTIME.handle(), + TaskKind::TimelineDeletionWorker, + Some(tenant_id), + None, + "tenant_delete", + false, + async move { + if let Err(err) = + Self::background(guard, conf, remote_storage, tenants, &tenant).await + { + error!("Error: {err:#}"); + tenant.set_broken(format!("{err:#}")).await; + }; + Ok(()) + } + .instrument({ + let span = tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_id); + span.follows_from(Span::current()); + span + }), + ); + } + + async fn background( + mut guard: OwnedMutexGuard, + conf: &PageServerConf, + remote_storage: Option, + tenants: &'static tokio::sync::RwLock, + tenant: &Arc, + ) -> Result<(), DeleteTenantError> { + // Tree sort timelines, schedule delete for them. Mention retries from the console side. + // Note that if deletion fails we dont mark timelines as broken, + // the whole tenant will become broken as by `Self::schedule_background` logic + let already_running_timeline_deletions = schedule_ordered_timeline_deletions(tenant) + .await + .context("schedule_ordered_timeline_deletions")?; + + fail::fail_point!("tenant-delete-before-polling-ongoing-deletions", |_| { + Err(anyhow::anyhow!( + "failpoint: tenant-delete-before-polling-ongoing-deletions" + ))? + }); + + // Wait for deletions that were already running at the moment when tenant deletion was requested. + // When we can lock deletion guard it means that corresponding timeline deletion finished. + for (guard, timeline_id) in already_running_timeline_deletions { + let flow = guard.lock().await; + if !flow.is_finished() { + return Err(DeleteTenantError::Other(anyhow::anyhow!( + "already running timeline deletion failed: {timeline_id}" + ))); + } + } + + let timelines_path = conf.timelines_path(&tenant.tenant_id); + // May not exist if we fail in cleanup_remaining_fs_traces after removing it + if timelines_path.exists() { + // sanity check to guard against layout changes + ensure_timelines_dir_empty(&timelines_path) + .await + .context("timelines dir not empty")?; + } + + remove_tenant_remote_delete_mark(conf, remote_storage.as_ref(), &tenant.tenant_id).await?; + + fail::fail_point!("tenant-delete-before-cleanup-remaining-fs-traces", |_| { + Err(anyhow::anyhow!( + "failpoint: tenant-delete-before-cleanup-remaining-fs-traces" + ))? 
+ }); + + cleanup_remaining_fs_traces(conf, &tenant.tenant_id) + .await + .context("cleanup_remaining_fs_traces")?; + + let mut locked = tenants.write().await; + if locked.remove(&tenant.tenant_id).is_none() { + warn!("Tenant got removed from tenants map during deletion"); + }; + + *guard = Self::Finished; + + Ok(()) + } +} diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 2635953e6dabb..ae6d237066c42 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -20,17 +20,19 @@ use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::{self, TaskKind}; use crate::tenant::config::TenantConfOpt; +use crate::tenant::delete::DeleteTenantFlow; use crate::tenant::{create_tenant_files, CreateTenantFilesMode, Tenant, TenantState}; use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME}; use utils::fs_ext::PathExt; use utils::id::{TenantId, TimelineId}; +use super::delete::DeleteTenantError; use super::timeline::delete::DeleteTimelineFlow; /// The tenants known to the pageserver. /// The enum variants are used to distinguish the different states that the pageserver can be in. -enum TenantsMap { +pub(crate) enum TenantsMap { /// [`init_tenant_mgr`] is not done yet. Initializing, /// [`init_tenant_mgr`] is done, all on-disk tenants have been loaded. @@ -42,13 +44,13 @@ enum TenantsMap { } impl TenantsMap { - fn get(&self, tenant_id: &TenantId) -> Option<&Arc> { + pub(crate) fn get(&self, tenant_id: &TenantId) -> Option<&Arc> { match self { TenantsMap::Initializing => None, TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m.get(tenant_id), } } - fn remove(&mut self, tenant_id: &TenantId) -> Option> { + pub(crate) fn remove(&mut self, tenant_id: &TenantId) -> Option> { match self { TenantsMap::Initializing => None, TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m.remove(tenant_id), @@ -97,7 +99,9 @@ pub async fn init_tenant_mgr( ); } } else { - // This case happens if we crash during attach before creating the attach marker file + // This case happens if we: + // * crash during attach before creating the attach marker file + // * crash during tenant delete before removing tenant directory let is_empty = tenant_dir_path.is_empty_dir().with_context(|| { format!("Failed to check whether {tenant_dir_path:?} is an empty dir") })?; @@ -124,6 +128,7 @@ pub async fn init_tenant_mgr( broker_client.clone(), remote_storage.clone(), Some(init_order.clone()), + &TENANTS, &ctx, ) { Ok(tenant) => { @@ -154,12 +159,13 @@ pub async fn init_tenant_mgr( Ok(()) } -pub fn schedule_local_tenant_processing( +pub(crate) fn schedule_local_tenant_processing( conf: &'static PageServerConf, tenant_path: &Path, broker_client: storage_broker::BrokerClientChannel, remote_storage: Option, init_order: Option, + tenants: &'static tokio::sync::RwLock, ctx: &RequestContext, ) -> anyhow::Result> { anyhow::ensure!( @@ -219,6 +225,7 @@ pub fn schedule_local_tenant_processing( broker_client, remote_storage, init_order, + tenants, ctx, ) }; @@ -356,7 +363,7 @@ pub async fn create_tenant( // See https://github.com/neondatabase/neon/issues/4233 let created_tenant = - schedule_local_tenant_processing(conf, &tenant_directory, broker_client, remote_storage, None, ctx)?; + schedule_local_tenant_processing(conf, &tenant_directory, broker_client, remote_storage, None, &TENANTS, ctx)?; // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here. 
// See https://github.com/neondatabase/neon/issues/4233 @@ -417,6 +424,14 @@ pub async fn get_tenant( } } +pub async fn delete_tenant( + conf: &'static PageServerConf, + remote_storage: Option, + tenant_id: TenantId, +) -> Result<(), DeleteTenantError> { + DeleteTenantFlow::run(conf, remote_storage, &TENANTS, tenant_id).await +} + #[derive(Debug, thiserror::Error)] pub enum DeleteTimelineError { #[error("Tenant {0}")] @@ -432,7 +447,7 @@ pub async fn delete_timeline( _ctx: &RequestContext, ) -> Result<(), DeleteTimelineError> { let tenant = get_tenant(tenant_id, true).await?; - DeleteTimelineFlow::run(&tenant, timeline_id).await?; + DeleteTimelineFlow::run(&tenant, timeline_id, false).await?; Ok(()) } @@ -507,7 +522,7 @@ pub async fn load_tenant( .with_context(|| format!("Failed to remove tenant ignore mark {tenant_ignore_mark:?} during tenant loading"))?; } - let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, broker_client, remote_storage, None, ctx) + let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, broker_client, remote_storage, None, &TENANTS, ctx) .with_context(|| { format!("Failed to schedule tenant processing in path {tenant_path:?}") })?; @@ -588,7 +603,7 @@ pub async fn attach_tenant( .context("check for attach marker file existence")?; anyhow::ensure!(marker_file_exists, "create_tenant_files should have created the attach marker file"); - let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, broker_client, Some(remote_storage), None, ctx)?; + let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, broker_client, Some(remote_storage), None, &TENANTS, ctx)?; // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here. // See https://github.com/neondatabase/neon/issues/4233 diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index b2d29a1e6f98c..2a5db0516a968 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -919,7 +919,7 @@ impl Timeline { pub fn set_state(&self, new_state: TimelineState) { match (self.current_state(), new_state) { (equal_state_1, equal_state_2) if equal_state_1 == equal_state_2 => { - warn!("Ignoring new state, equal to the existing one: {equal_state_2:?}"); + info!("Ignoring new state, equal to the existing one: {equal_state_2:?}"); } (st, TimelineState::Loading) => { error!("ignoring transition from {st:?} into Loading state"); diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index d99ca2d2927c2..dba6475c27587 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -219,27 +219,13 @@ async fn delete_local_layer_files( } }; - let r = if metadata.is_dir() { - // There shouldnt be any directories inside timeline dir as of current layout. + if metadata.is_dir() { + warn!(path=%entry.path().display(), "unexpected directory under timeline dir"); tokio::fs::remove_dir(entry.path()).await } else { tokio::fs::remove_file(entry.path()).await - }; - - if let Err(e) = r { - if e.kind() == std::io::ErrorKind::NotFound { - warn!( - timeline_dir=?local_timeline_directory, - path=?entry.path().display(), - "got not found err while removing timeline dir, proceeding anyway" - ); - continue; - } - anyhow::bail!(anyhow::anyhow!( - "Failed to remove: {}. 
Error: {e}", - entry.path().display() - )); } + .with_context(|| format!("Failed to remove: {}", entry.path().display()))?; } info!("finished deleting layer files, releasing layer_removal_cs.lock()"); @@ -359,10 +345,11 @@ impl DeleteTimelineFlow { // NB: If this fails half-way through, and is retried, the retry will go through // all the same steps again. Make sure the code here is idempotent, and don't // error out if some of the shutdown tasks have already been completed! - #[instrument(skip_all, fields(tenant_id=%tenant.tenant_id, %timeline_id))] + #[instrument(skip(tenant), fields(tenant_id=%tenant.tenant_id))] pub async fn run( tenant: &Arc, timeline_id: TimelineId, + inplace: bool, ) -> Result<(), DeleteTimelineError> { let (timeline, mut guard) = Self::prepare(tenant, timeline_id)?; @@ -380,7 +367,11 @@ impl DeleteTimelineFlow { ))? }); - Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline); + if inplace { + Self::background(guard, tenant.conf, tenant, &timeline).await? + } else { + Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline); + } Ok(()) } @@ -398,6 +389,8 @@ impl DeleteTimelineFlow { } /// Shortcut to create Timeline in stopping state and spawn deletion task. + /// See corresponding parts of [`crate::tenant::delete::DeleteTenantFlow`] + #[instrument(skip_all, fields(%timeline_id))] pub async fn resume_deletion( tenant: Arc, timeline_id: TimelineId, @@ -444,11 +437,15 @@ impl DeleteTimelineFlow { Ok(()) } + #[instrument(skip_all, fields(%timeline_id))] pub async fn cleanup_remaining_timeline_fs_traces( tenant: &Tenant, timeline_id: TimelineId, ) -> anyhow::Result<()> { - cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_id, timeline_id).await + let r = + cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_id, timeline_id).await; + info!("Done"); + r } fn prepare( @@ -494,11 +491,17 @@ impl DeleteTimelineFlow { // At the end of the operation we're holding the guard and need to lock timelines map // to remove the timeline from it. // Always if you have two locks that are taken in different order this can result in a deadlock. - let delete_lock_guard = DeletionGuard( - Arc::clone(&timeline.delete_progress) - .try_lock_owned() - .map_err(|_| DeleteTimelineError::AlreadyInProgress)?, - ); + + let delete_progress = Arc::clone(&timeline.delete_progress); + let delete_lock_guard = match delete_progress.try_lock_owned() { + Ok(guard) => DeletionGuard(guard), + Err(_) => { + // Unfortunately if lock fails arc is consumed. 
+ return Err(DeleteTimelineError::AlreadyInProgress(Arc::clone( + &timeline.delete_progress, + ))); + } + }; timeline.set_state(TimelineState::Stopping); @@ -553,10 +556,14 @@ impl DeleteTimelineFlow { remove_timeline_from_tenant(tenant, timeline.timeline_id, &guard).await?; - *guard.0 = Self::Finished; + *guard = Self::Finished; Ok(()) } + + pub(crate) fn is_finished(&self) -> bool { + matches!(self, Self::Finished) + } } struct DeletionGuard(OwnedMutexGuard); diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index a15c3724be56d..7c7ca941639d2 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -32,6 +32,7 @@ from _pytest.config import Config from _pytest.config.argparsing import Parser from _pytest.fixtures import FixtureRequest +from mypy_boto3_s3 import S3Client # Type-related stuff from psycopg2.extensions import connection as PgConnection @@ -432,7 +433,7 @@ def __init__( self.port_distributor = port_distributor self.remote_storage = remote_storage self.ext_remote_storage: Optional[S3Storage] = None - self.remote_storage_client: Optional[Any] = None + self.remote_storage_client: Optional[S3Client] = None self.remote_storage_users = remote_storage_users self.broker = broker self.run_id = run_id @@ -875,7 +876,14 @@ def get_safekeeper_connstrs(self) -> str: def timeline_dir(self, tenant_id: TenantId, timeline_id: TimelineId) -> Path: """Get a timeline directory's path based on the repo directory of the test environment""" - return self.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id) + return self.tenant_dir(tenant_id) / "timelines" / str(timeline_id) + + def tenant_dir( + self, + tenant_id: TenantId, + ) -> Path: + """Get a tenant directory's path based on the repo directory of the test environment""" + return self.repo_dir / "tenants" / str(tenant_id) def get_pageserver_version(self) -> str: bin_pageserver = str(self.neon_binpath / "pageserver") diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 8c053c8073abc..a179ebdd09aeb 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -210,6 +210,10 @@ def tenant_detach(self, tenant_id: TenantId, detach_ignored=False): res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach", params=params) self.verbose_error(res) + def tenant_delete(self, tenant_id: TenantId): + res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}") + self.verbose_error(res) + def tenant_load(self, tenant_id: TenantId): res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/load") self.verbose_error(res) diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index 119c99bb96846..76777eca0860d 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -1,9 +1,11 @@ import time -from typing import Any, Dict, Optional +from typing import TYPE_CHECKING, Any, Dict, Optional from fixtures.log_helper import log from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient +from fixtures.remote_storage import RemoteStorageKind, S3Storage from fixtures.types import Lsn, TenantId, TimelineId +from fixtures.utils import wait_until def assert_tenant_state( @@ -199,20 +201,19 @@ def wait_timeline_detail_404( timeline_id: TimelineId, iterations: int, ): - last_exc = None - for _ in range(iterations): - time.sleep(0.250) + def 
timeline_is_missing(): + data = {} try: data = pageserver_http.timeline_detail(tenant_id, timeline_id) - log.info(f"detail {data}") + log.info(f"timeline detail {data}") except PageserverApiException as e: log.debug(e) if e.status_code == 404: return - last_exc = e + raise RuntimeError(f"Timeline exists state {data.get('state')}") - raise last_exc or RuntimeError(f"Timeline wasnt deleted in time, state: {data['state']}") + wait_until(iterations, interval=0.250, func=timeline_is_missing) def timeline_delete_wait_completed( @@ -224,3 +225,71 @@ def timeline_delete_wait_completed( ): pageserver_http.timeline_delete(tenant_id=tenant_id, timeline_id=timeline_id, **delete_args) wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, iterations) + + +if TYPE_CHECKING: + # TODO avoid by combining remote storage related stuff in single type + # and just passing in this type instead of whole builder + from fixtures.neon_fixtures import NeonEnvBuilder + + +def assert_prefix_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None): + # For local_fs we need to properly handle empty directories, which we currently dont, so for simplicity stick to s3 api. + assert neon_env_builder.remote_storage_kind in ( + RemoteStorageKind.MOCK_S3, + RemoteStorageKind.REAL_S3, + ) + # For mypy + assert isinstance(neon_env_builder.remote_storage, S3Storage) + assert neon_env_builder.remote_storage_client is not None + + # Note that this doesnt use pagination, so list is not guaranteed to be exhaustive. + response = neon_env_builder.remote_storage_client.list_objects_v2( + Bucket=neon_env_builder.remote_storage.bucket_name, + Prefix=prefix or neon_env_builder.remote_storage.prefix_in_bucket or "", + ) + objects = response.get("Contents") + assert ( + response["KeyCount"] == 0 + ), f"remote dir with prefix {prefix} is not empty after deletion: {objects}" + + +def wait_tenant_status_404( + pageserver_http: PageserverHttpClient, + tenant_id: TenantId, + iterations: int, +): + def tenant_is_missing(): + data = {} + try: + data = pageserver_http.tenant_status(tenant_id) + log.info(f"tenant status {data}") + except PageserverApiException as e: + log.debug(e) + if e.status_code == 404: + return + + raise RuntimeError(f"Timeline exists state {data.get('state')}") + + wait_until(iterations, interval=0.250, func=tenant_is_missing) + + +def tenant_delete_wait_completed( + pageserver_http: PageserverHttpClient, + tenant_id: TenantId, + iterations: int, +): + pageserver_http.tenant_delete(tenant_id=tenant_id) + wait_tenant_status_404(pageserver_http, tenant_id=tenant_id, iterations=iterations) + + +MANY_SMALL_LAYERS_TENANT_CONFIG = { + "gc_period": "0s", + "compaction_period": "0s", + "checkpoint_distance": f"{1024**2}", + "image_creation_threshold": "100", +} + + +def poll_for_remote_storage_iterations(remote_storage_kind: RemoteStorageKind) -> int: + return 20 if remote_storage_kind is RemoteStorageKind.REAL_S3 else 6 diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 2c8b4f4303ae0..81754c95f7a32 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -6,13 +6,16 @@ import tarfile import time from pathlib import Path -from typing import Any, Callable, Dict, List, Tuple, TypeVar +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Tuple, TypeVar from urllib.parse import urlencode import allure from psycopg2.extensions import cursor from fixtures.log_helper import log + +if TYPE_CHECKING: + from fixtures.neon_fixtures import PgBin from 
fixtures.types import TimelineId Fn = TypeVar("Fn", bound=Callable[..., Any]) @@ -314,3 +317,15 @@ def wait_while(number_of_iterations: int, interval: float, func): except Exception: return raise Exception("timed out while waiting for %s" % func) + + +def run_pg_bench_small(pg_bin: "PgBin", connstr: str): + """ + Fast way to populate data. + For more layers consider combining with these tenant settings: + { + "checkpoint_distance": 1024 ** 2, + "image_creation_threshold": 100, + } + """ + pg_bin.run(["pgbench", "-i", "-I dtGvp", "-s1", connstr]) diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 68116985d3f01..bfe9046ecabfc 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -97,6 +97,11 @@ def test_remote_storage_backup_and_restore( tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0]) timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0]) + # Thats because of UnreliableWrapper's injected failures + env.pageserver.allowed_errors.append( + f".*failed to fetch tenant deletion mark at tenants/({tenant_id}|{env.initial_tenant})/deleted attempt 1.*" + ) + checkpoint_numbers = range(1, 3) for checkpoint_number in checkpoint_numbers: diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py new file mode 100644 index 0000000000000..06189053316d8 --- /dev/null +++ b/test_runner/regress/test_tenant_delete.py @@ -0,0 +1,250 @@ +import enum +import os + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + PgBin, + last_flush_lsn_upload, + wait_for_last_flush_lsn, +) +from fixtures.pageserver.http import PageserverApiException +from fixtures.pageserver.utils import ( + MANY_SMALL_LAYERS_TENANT_CONFIG, + assert_prefix_empty, + poll_for_remote_storage_iterations, + tenant_delete_wait_completed, + wait_tenant_status_404, + wait_until_tenant_active, + wait_until_tenant_state, +) +from fixtures.remote_storage import RemoteStorageKind, available_remote_storages +from fixtures.types import TenantId +from fixtures.utils import run_pg_bench_small + + +@pytest.mark.parametrize( + "remote_storage_kind", [RemoteStorageKind.NOOP, *available_remote_storages()] +) +def test_tenant_delete_smoke( + neon_env_builder: NeonEnvBuilder, + remote_storage_kind: RemoteStorageKind, + pg_bin: PgBin, +): + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_tenant_delete_smoke", + ) + + env = neon_env_builder.init_start() + + ps_http = env.pageserver.http_client() + + # first try to delete non existing tenant + tenant_id = TenantId.generate() + env.pageserver.allowed_errors.append(f".*NotFound: tenant {tenant_id}.*") + with pytest.raises(PageserverApiException, match=f"NotFound: tenant {tenant_id}"): + ps_http.tenant_delete(tenant_id=tenant_id) + + env.neon_cli.create_tenant( + tenant_id=tenant_id, + conf=MANY_SMALL_LAYERS_TENANT_CONFIG, + ) + + # create two timelines one being the parent of another + parent = None + for timeline in ["first", "second"]: + timeline_id = env.neon_cli.create_branch( + timeline, tenant_id=tenant_id, ancestor_branch_name=parent + ) + with env.endpoints.create_start(timeline, tenant_id=tenant_id) as endpoint: + run_pg_bench_small(pg_bin, endpoint.connstr()) + wait_for_last_flush_lsn(env, endpoint, tenant=tenant_id, timeline=timeline_id) + + parent = timeline + + iterations = 
poll_for_remote_storage_iterations(remote_storage_kind) + + tenant_delete_wait_completed(ps_http, tenant_id, iterations) + + tenant_path = env.tenant_dir(tenant_id=tenant_id) + assert not tenant_path.exists() + + if remote_storage_kind in [RemoteStorageKind.MOCK_S3, RemoteStorageKind.REAL_S3]: + assert_prefix_empty( + neon_env_builder, + prefix="/".join( + ( + "tenants", + str(tenant_id), + ) + ), + ) + + +class Check(enum.Enum): + RETRY_WITHOUT_RESTART = enum.auto() + RETRY_WITH_RESTART = enum.auto() + + +FAILPOINTS = [ + "tenant-delete-before-shutdown", + "tenant-delete-before-create-remote-mark", + "tenant-delete-before-create-local-mark", + "tenant-delete-before-background", + "tenant-delete-before-polling-ongoing-deletions", + "tenant-delete-before-cleanup-remaining-fs-traces", + "tenant-delete-before-remove-timelines-dir", + "tenant-delete-before-remove-deleted-mark", + "tenant-delete-before-remove-tenant-dir", + # Some failpoints from timeline deletion + "timeline-delete-before-index-deleted-at", + "timeline-delete-before-rm", + "timeline-delete-before-index-delete", + "timeline-delete-after-rm-dir", +] + +FAILPOINTS_BEFORE_BACKGROUND = [ + "timeline-delete-before-schedule", + "tenant-delete-before-shutdown", + "tenant-delete-before-create-remote-mark", + "tenant-delete-before-create-local-mark", + "tenant-delete-before-background", +] + + +def combinations(): + result = [] + + remotes = [RemoteStorageKind.NOOP, RemoteStorageKind.MOCK_S3] + if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE"): + remotes.append(RemoteStorageKind.REAL_S3) + + for remote_storage_kind in remotes: + for delete_failpoint in FAILPOINTS: + if remote_storage_kind == RemoteStorageKind.NOOP and delete_failpoint in ( + "timeline-delete-before-index-delete", + ): + # the above failpoint are not relevant for config without remote storage + continue + + result.append((remote_storage_kind, delete_failpoint)) + return result + + +@pytest.mark.parametrize("remote_storage_kind, failpoint", combinations()) +@pytest.mark.parametrize("check", list(Check)) +def test_delete_tenant_exercise_crash_safety_failpoints( + neon_env_builder: NeonEnvBuilder, + remote_storage_kind: RemoteStorageKind, + failpoint: str, + check: Check, + pg_bin: PgBin, +): + neon_env_builder.enable_remote_storage( + remote_storage_kind, "test_delete_tenant_exercise_crash_safety_failpoints" + ) + + env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG) + + tenant_id = env.initial_tenant + + env.pageserver.allowed_errors.extend( + [ + # From deletion polling + f".*NotFound: tenant {env.initial_tenant}.*", + # allow errors caused by failpoints + f".*failpoint: {failpoint}", + # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped + ".*freeze_and_flush_on_shutdown.*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + # We may leave some upload tasks in the queue. They're likely deletes. + # For uploads we explicitly wait with `last_flush_lsn_upload` below. + # So by ignoring these instead of waiting for empty upload queue + # we execute more distinct code paths. 
+ '.*stopping left-over name="remote upload".*', + ] + ) + + ps_http = env.pageserver.http_client() + + timeline_id = env.neon_cli.create_timeline("delete", tenant_id=tenant_id) + with env.endpoints.create_start("delete", tenant_id=tenant_id) as endpoint: + # generate enough layers + run_pg_bench_small(pg_bin, endpoint.connstr()) + if remote_storage_kind is RemoteStorageKind.NOOP: + wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) + else: + last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id) + + ps_http.configure_failpoints((failpoint, "return")) + + iterations = poll_for_remote_storage_iterations(remote_storage_kind) + + # These failpoints are earlier than background task is spawned. + # so they result in api request failure. + if failpoint in FAILPOINTS_BEFORE_BACKGROUND: + with pytest.raises(PageserverApiException, match=failpoint): + ps_http.tenant_delete(tenant_id) + + else: + ps_http.tenant_delete(tenant_id) + tenant_info = wait_until_tenant_state( + pageserver_http=ps_http, + tenant_id=tenant_id, + expected_state="Broken", + iterations=iterations, + ) + + reason = tenant_info["state"]["data"]["reason"] + log.info(f"tenant broken: {reason}") + + # failpoint may not be the only error in the stack + assert reason.endswith(f"failpoint: {failpoint}"), reason + + if check is Check.RETRY_WITH_RESTART: + env.pageserver.stop() + env.pageserver.start() + + if ( + remote_storage_kind is RemoteStorageKind.NOOP + and failpoint == "tenant-delete-before-create-local-mark" + ): + tenant_delete_wait_completed(ps_http, tenant_id, iterations=iterations) + elif failpoint in ( + "tenant-delete-before-shutdown", + "tenant-delete-before-create-remote-mark", + ): + wait_until_tenant_active( + ps_http, tenant_id=tenant_id, iterations=iterations, period=0.25 + ) + tenant_delete_wait_completed(ps_http, tenant_id, iterations=iterations) + else: + # Pageserver should've resumed deletion after restart. + wait_tenant_status_404(ps_http, tenant_id, iterations=iterations + 10) + elif check is Check.RETRY_WITHOUT_RESTART: + # this should succeed + # this also checks that delete can be retried even when tenant is in Broken state + ps_http.configure_failpoints((failpoint, "off")) + + tenant_delete_wait_completed(ps_http, tenant_id, iterations=iterations) + + # Check remote is impty + if remote_storage_kind is RemoteStorageKind.MOCK_S3: + assert_prefix_empty( + neon_env_builder, + prefix="/".join( + ( + "tenants", + str(tenant_id), + ) + ), + ) + + tenant_dir = env.tenant_dir(tenant_id) + # Check local is empty + assert not tenant_dir.exists() + + +# TODO test concurrent deletions with "hang" failpoint +# TODO test tenant delete continues after attach diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 932d8219543e0..b189510a9e412 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -66,6 +66,10 @@ def test_tenant_reattach( env.pageserver.allowed_errors.append( f".*Tenant {tenant_id} will not become active\\. 
Current state: Stopping.*" ) + # Thats because of UnreliableWrapper's injected failures + env.pageserver.allowed_errors.append( + f".*failed to fetch tenant deletion mark at tenants/({tenant_id}|{env.initial_tenant})/deleted attempt 1.*" + ) with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: with endpoint.cursor() as cur: diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index 79a3b353d4586..fee95e5420c9d 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -146,6 +146,11 @@ def test_tenants_attached_after_download( tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0]) timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0]) + # Thats because of UnreliableWrapper's injected failures + env.pageserver.allowed_errors.append( + f".*failed to fetch tenant deletion mark at tenants/({tenant_id}|{env.initial_tenant})/deleted attempt 1.*" + ) + for checkpoint_number in range(1, 3): with endpoint.cursor() as cur: cur.execute( diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 1e8dd3620618e..26caeb8ffb689 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -4,7 +4,6 @@ import shutil import threading from pathlib import Path -from typing import Optional import pytest import requests @@ -18,6 +17,8 @@ ) from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import ( + assert_prefix_empty, + poll_for_remote_storage_iterations, timeline_delete_wait_completed, wait_for_last_record_lsn, wait_for_upload, @@ -27,7 +28,6 @@ ) from fixtures.remote_storage import ( RemoteStorageKind, - S3Storage, available_remote_storages, ) from fixtures.types import Lsn, TenantId, TimelineId @@ -187,10 +187,9 @@ def test_delete_timeline_exercise_crash_safety_failpoints( 8. Retry or restart without the failpoint and check the result. """ - if remote_storage_kind is not None: - neon_env_builder.enable_remote_storage( - remote_storage_kind, "test_delete_timeline_exercise_crash_safety_failpoints" - ) + neon_env_builder.enable_remote_storage( + remote_storage_kind, "test_delete_timeline_exercise_crash_safety_failpoints" + ) env = neon_env_builder.init_start( initial_tenant_conf={ @@ -231,7 +230,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints( ps_http.configure_failpoints((failpoint, "return")) - iterations = 20 if remote_storage_kind is RemoteStorageKind.REAL_S3 else 4 + iterations = poll_for_remote_storage_iterations(remote_storage_kind) # These failpoints are earlier than background task is spawned. # so they result in api request failure. 
@@ -280,14 +279,14 @@ def test_delete_timeline_exercise_crash_safety_failpoints( "remote_storage_s3_request_seconds_count", filter={"request_type": "get_object", "result": "err"}, ).value - == 1 + == 2 # One is missing tenant deletion mark, second is missing index part ) assert ( m.query_one( "remote_storage_s3_request_seconds_count", filter={"request_type": "get_object", "result": "ok"}, ).value - == 1 + == 1 # index part for initial timeline ) elif check is Check.RETRY_WITHOUT_RESTART: # this should succeed @@ -413,27 +412,6 @@ def test_timeline_resurrection_on_attach( assert all([tl["state"] == "Active" for tl in timelines]) -def assert_prefix_empty(neon_env_builder: NeonEnvBuilder, prefix: Optional[str] = None): - # For local_fs we need to properly handle empty directories, which we currently dont, so for simplicity stick to s3 api. - assert neon_env_builder.remote_storage_kind in ( - RemoteStorageKind.MOCK_S3, - RemoteStorageKind.REAL_S3, - ) - # For mypy - assert isinstance(neon_env_builder.remote_storage, S3Storage) - - # Note that this doesnt use pagination, so list is not guaranteed to be exhaustive. - assert neon_env_builder.remote_storage_client is not None - response = neon_env_builder.remote_storage_client.list_objects_v2( - Bucket=neon_env_builder.remote_storage.bucket_name, - Prefix=prefix or neon_env_builder.remote_storage.prefix_in_bucket or "", - ) - objects = response.get("Contents") - assert ( - response["KeyCount"] == 0 - ), f"remote dir with prefix {prefix} is not empty after deletion: {objects}" - - def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuilder): """ When deleting a timeline, if we succeed in setting the deleted flag remotely From 538373019a837613caffa98dca72422b07c9c595 Mon Sep 17 00:00:00 2001 From: George MacKerron Date: Thu, 10 Aug 2023 17:21:52 +0100 Subject: [PATCH 07/22] Increase max sql-over-http response size from 1MB to 10MB (#4961) ## Problem 1MB response limit is very small. ## Summary of changes This data is not yet tracked, so we shoudn't raise the limit too high yet. But as discussed with @kelvich and @conradludgate, this PR lifts it to 10MB, and adds also details of the limit to the error response. --- proxy/src/http/sql_over_http.rs | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/proxy/src/http/sql_over_http.rs b/proxy/src/http/sql_over_http.rs index a4eb13a10ba07..736413c85c3b1 100644 --- a/proxy/src/http/sql_over_http.rs +++ b/proxy/src/http/sql_over_http.rs @@ -34,7 +34,7 @@ enum Payload { Batch(Vec), } -pub const MAX_RESPONSE_SIZE: usize = 1024 * 1024; // 1 MB +pub const MAX_RESPONSE_SIZE: usize = 10 * 1024 * 1024; // 10 MB const MAX_REQUEST_SIZE: u64 = 1024 * 1024; // 1 MB static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output"); @@ -214,7 +214,7 @@ pub async fn handle( if request_content_length > MAX_REQUEST_SIZE { return Err(anyhow::anyhow!( - "request is too large (max {MAX_REQUEST_SIZE} bytes)" + "request is too large (max is {MAX_REQUEST_SIZE} bytes)" )); } @@ -292,13 +292,15 @@ async fn query_to_json( // big. 
pin_mut!(row_stream); let mut rows: Vec = Vec::new(); - let mut curret_size = 0; + let mut current_size = 0; while let Some(row) = row_stream.next().await { let row = row?; - curret_size += row.body_len(); + current_size += row.body_len(); rows.push(row); - if curret_size > MAX_RESPONSE_SIZE { - return Err(anyhow::anyhow!("response too large")); + if current_size > MAX_RESPONSE_SIZE { + return Err(anyhow::anyhow!( + "response is too large (max is {MAX_RESPONSE_SIZE} bytes)" + )); } } From 25c66dc6351f1dff1c46d158618eab79b63040b5 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 10 Aug 2023 17:49:24 +0100 Subject: [PATCH 08/22] proxy: http logging to 11 (#4950) ## Problem Mysterious network issues ## Summary of changes Log a lot more about HTTP/DNS in hopes of detecting more of the network errors --- proxy/src/console/provider/neon.rs | 5 +++ proxy/src/http.rs | 50 ++++++++++++++++++++++++++++-- 2 files changed, 53 insertions(+), 2 deletions(-) diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 22e766b5f13b9..3f4cee6e3483c 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -8,6 +8,7 @@ use super::{ use crate::{auth::ClientCredentials, compute, http, scram}; use async_trait::async_trait; use futures::TryFutureExt; +use tokio::time::Instant; use tokio_postgres::config::SslMode; use tracing::{error, info, info_span, warn, Instrument}; @@ -47,7 +48,9 @@ impl Api { .build()?; info!(url = request.url().as_str(), "sending http request"); + let start = Instant::now(); let response = self.endpoint.execute(request).await?; + info!(duration = ?start.elapsed(), "received http response"); let body = match parse_body::(response).await { Ok(body) => body, // Error 404 is special: it's ok not to have a secret. @@ -88,7 +91,9 @@ impl Api { .build()?; info!(url = request.url().as_str(), "sending http request"); + let start = Instant::now(); let response = self.endpoint.execute(request).await?; + info!(duration = ?start.elapsed(), "received http response"); let body = parse_body::(response).await?; // Unfortunately, ownership won't let us use `Option::ok_or` here. diff --git a/proxy/src/http.rs b/proxy/src/http.rs index e2812403806d5..755137807bc49 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -7,11 +7,14 @@ pub mod server; pub mod sql_over_http; pub mod websocket; -use std::time::Duration; +use std::{sync::Arc, time::Duration}; +use futures::FutureExt; pub use reqwest::{Request, Response, StatusCode}; pub use reqwest_middleware::{ClientWithMiddleware, Error}; pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; +use tokio::time::Instant; +use tracing::trace; use crate::url::ApiUrl; use reqwest_middleware::RequestBuilder; @@ -20,13 +23,21 @@ use reqwest_middleware::RequestBuilder; /// because it takes care of observability (OpenTelemetry). /// We deliberately don't want to replace this with a public static. 
pub fn new_client() -> ClientWithMiddleware { - reqwest_middleware::ClientBuilder::new(reqwest::Client::new()) + let client = reqwest::ClientBuilder::new() + .dns_resolver(Arc::new(GaiResolver::default())) + .connection_verbose(true) + .build() + .expect("Failed to create http client"); + + reqwest_middleware::ClientBuilder::new(client) .with(reqwest_tracing::TracingMiddleware::default()) .build() } pub fn new_client_with_timeout(default_timout: Duration) -> ClientWithMiddleware { let timeout_client = reqwest::ClientBuilder::new() + .dns_resolver(Arc::new(GaiResolver::default())) + .connection_verbose(true) .timeout(default_timout) .build() .expect("Failed to create http client with timeout"); @@ -39,6 +50,10 @@ pub fn new_client_with_timeout(default_timout: Duration) -> ClientWithMiddleware // As per docs, "This middleware always errors when given requests with streaming bodies". // That's all right because we only use this client to send `serde_json::RawValue`, which // is not a stream. + // + // ex-maintainer note: + // this limitation can be fixed if streaming is necessary. + // retries will still not be performed, but it wont error immediately .with(RetryTransientMiddleware::new_with_policy(retry_policy)) .build() } @@ -81,6 +96,37 @@ impl Endpoint { } } +/// https://docs.rs/reqwest/0.11.18/src/reqwest/dns/gai.rs.html +use hyper::{ + client::connect::dns::{GaiResolver as HyperGaiResolver, Name}, + service::Service, +}; +use reqwest::dns::{Addrs, Resolve, Resolving}; +#[derive(Debug)] +pub struct GaiResolver(HyperGaiResolver); + +impl Default for GaiResolver { + fn default() -> Self { + Self(HyperGaiResolver::new()) + } +} + +impl Resolve for GaiResolver { + fn resolve(&self, name: Name) -> Resolving { + let this = &mut self.0.clone(); + let start = Instant::now(); + Box::pin( + Service::::call(this, name.clone()).map(move |result| { + let resolve_duration = start.elapsed(); + trace!(duration = ?resolve_duration, addr = %name, "resolve host complete"); + result + .map(|addrs| -> Addrs { Box::new(addrs) }) + .map_err(|err| -> Box { Box::new(err) }) + }), + ) + } +} + #[cfg(test)] mod tests { use super::*; From 3a71cf38c1306f93a69ac088e30e28d9b912d446 Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Fri, 11 Aug 2023 04:04:45 +0900 Subject: [PATCH 09/22] Grant BypassRLS to new neon_superuser roles (#4935) --- compute_tools/src/spec.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index eff7c93b461af..74e7796be78d6 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -270,7 +270,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> { } RoleAction::Create => { let mut query: String = format!( - "CREATE ROLE {} CREATEROLE CREATEDB IN ROLE neon_superuser", + "CREATE ROLE {} CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser", name.pg_quote() ); info!("role create query: '{}'", &query); From 73d7a9bc6e734559d37c8f97d9cf2ea61da40dce Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Thu, 10 Aug 2023 22:38:22 +0200 Subject: [PATCH 10/22] proxy: propagate ws span (#4966) Found this log on staging: ``` 2023-08-10T17:42:58.573790Z INFO handling interactive connection from client protocol="ws" ``` We seem to be losing websocket span in spawn, this patch fixes it. 
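For context, a minimal standalone sketch of the behaviour being fixed (this is not the proxy code; the `conn_id` field is made up for illustration): a future handed to `tokio::spawn` does not inherit the caller's tracing span, so its log records lose the span fields unless the future is explicitly instrumented with `Instrument::in_current_span()`.

```rust
use std::time::Duration;
use tracing::{info, info_span, Instrument};

#[tokio::main]
async fn main() {
    tracing_subscriber::fmt().init();

    {
        // `conn_id` is an arbitrary example field, not a real proxy span field.
        let span = info_span!("ws_handler", conn_id = 42);
        let _guard = span.enter();

        // A plain spawn runs the future outside the caller's span:
        // this log record carries no `ws_handler{conn_id=42}` context.
        tokio::spawn(async {
            info!("handling connection");
        });

        // `in_current_span()` captures the span that is current at spawn time,
        // so this log record keeps the `ws_handler{conn_id=42}` context.
        tokio::spawn(
            async {
                info!("handling connection");
            }
            .in_current_span(),
        );
    } // drop the span guard before awaiting

    // Give the spawned tasks a moment to run before the runtime shuts down.
    tokio::time::sleep(Duration::from_millis(50)).await;
}
```

The change below applies the same `in_current_span()` wrapping to the spawned websocket task, so its error log keeps the handler span.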
--- proxy/src/http/websocket.rs | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/proxy/src/http/websocket.rs b/proxy/src/http/websocket.rs index fec76c74f4943..071382cfb9f5a 100644 --- a/proxy/src/http/websocket.rs +++ b/proxy/src/http/websocket.rs @@ -187,12 +187,16 @@ async fn ws_handler( let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None) .map_err(|e| ApiError::BadRequest(e.into()))?; - tokio::spawn(async move { - if let Err(e) = serve_websocket(websocket, config, &cancel_map, session_id, host).await - { - error!(session_id = ?session_id, "error in websocket connection: {e:?}"); + tokio::spawn( + async move { + if let Err(e) = + serve_websocket(websocket, config, &cancel_map, session_id, host).await + { + error!(session_id = ?session_id, "error in websocket connection: {e:?}"); + } } - }); + .in_current_span(), + ); // Return the response so the spawned future can continue. Ok(response) From d39fd667737e66ec635b58f71fa4de3409f6d1ea Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Fri, 11 Aug 2023 10:18:13 +0300 Subject: [PATCH 11/22] tests: remove redundant wait_while (#4952) Remove redundant `wait_while` in tests. It had only one usage. Use `wait_tenant_status404`. Related: https://github.com/neondatabase/neon/pull/4855#discussion_r1289610641 --- test_runner/fixtures/pageserver/utils.py | 12 ++---------- test_runner/fixtures/utils.py | 16 ---------------- test_runner/regress/test_tenant_relocation.py | 19 +++++++++---------- 3 files changed, 11 insertions(+), 36 deletions(-) diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index 76777eca0860d..f5859550accae 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -19,15 +19,6 @@ def assert_tenant_state( assert tenant_status["state"]["slug"] == expected_state, message or tenant_status -def tenant_exists(pageserver_http: PageserverHttpClient, tenant_id: TenantId): - tenants = pageserver_http.tenant_list() - matching = [t for t in tenants if TenantId(t["id"]) == tenant_id] - assert len(matching) < 2 - if len(matching) == 0: - return None - return matching[0] - - def remote_consistent_lsn( pageserver_http: PageserverHttpClient, tenant: TenantId, timeline: TimelineId ) -> Lsn: @@ -258,6 +249,7 @@ def wait_tenant_status_404( pageserver_http: PageserverHttpClient, tenant_id: TenantId, iterations: int, + interval: float = 0.250, ): def tenant_is_missing(): data = {} @@ -271,7 +263,7 @@ def tenant_is_missing(): raise RuntimeError(f"Timeline exists state {data.get('state')}") - wait_until(iterations, interval=0.250, func=tenant_is_missing) + wait_until(iterations, interval=interval, func=tenant_is_missing) def tenant_delete_wait_completed( diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 81754c95f7a32..d03d2e7595f4c 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -303,22 +303,6 @@ def wait_until(number_of_iterations: int, interval: float, func: Fn): raise Exception("timed out while waiting for %s" % func) from last_exception -def wait_while(number_of_iterations: int, interval: float, func): - """ - Wait until 'func' returns false, or throws an exception. 
- """ - for i in range(number_of_iterations): - try: - if not func(): - return - log.info("waiting for %s iteration %s failed", func, i + 1) - time.sleep(interval) - continue - except Exception: - return - raise Exception("timed out while waiting for %s" % func) - - def run_pg_bench_small(pg_bin: "PgBin", connstr: str): """ Fast way to populate data. diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index eb020c101f4cd..32ad5381b45be 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -17,9 +17,9 @@ from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import ( assert_tenant_state, - tenant_exists, wait_for_last_record_lsn, wait_for_upload, + wait_tenant_status_404, ) from fixtures.port_distributor import PortDistributor from fixtures.remote_storage import RemoteStorageKind, available_remote_storages @@ -29,7 +29,6 @@ start_in_background, subprocess_capture, wait_until, - wait_while, ) @@ -269,11 +268,16 @@ def test_tenant_relocation( env = neon_env_builder.init_start() + tenant_id = TenantId("74ee8b079a0e437eb0afea7d26a07209") + # FIXME: Is this expected? env.pageserver.allowed_errors.append( ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*" ) + # Needed for detach polling. + env.pageserver.allowed_errors.append(f".*NotFound: tenant {tenant_id}.*") + # create folder for remote storage mock remote_storage_mock_path = env.repo_dir / "local_fs_remote_storage" @@ -283,9 +287,7 @@ def test_tenant_relocation( pageserver_http = env.pageserver.http_client() - tenant_id, initial_timeline_id = env.neon_cli.create_tenant( - TenantId("74ee8b079a0e437eb0afea7d26a07209") - ) + _, initial_timeline_id = env.neon_cli.create_tenant(tenant_id) log.info("tenant to relocate %s initial_timeline_id %s", tenant_id, initial_timeline_id) env.neon_cli.create_branch("test_tenant_relocation_main", tenant_id=tenant_id) @@ -469,11 +471,8 @@ def test_tenant_relocation( pageserver_http.tenant_detach(tenant_id) # Wait a little, so that the detach operation has time to finish. - wait_while( - number_of_iterations=100, - interval=1, - func=lambda: tenant_exists(pageserver_http, tenant_id), - ) + wait_tenant_status_404(pageserver_http, tenant_id, iterations=100, interval=1) + post_migration_check(ep_main, 500500, old_local_path_main) post_migration_check(ep_second, 1001000, old_local_path_second) From 3a6b99f03c66d2ea9bb5dd6ff20bec1ab3e2e6a4 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Fri, 11 Aug 2023 17:18:07 +0200 Subject: [PATCH 12/22] proxy: improve http logs (#4976) Fix multiline logs on websocket errors and always print sql-over-http errors sent to the user. 
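Switching the error formatting from `{:?}` to `{:#}` (see the diff below) is what fixes the multiline logs: `anyhow` renders an error chain very differently under the two formats. A small standalone sketch, not the proxy code:

```rust
use anyhow::anyhow;

fn main() {
    let err = anyhow!("connection reset by peer").context("error in websocket connection");

    // Debug formatting spreads the cause chain over several lines
    // ("error in websocket connection\n\nCaused by:\n    connection reset by peer"),
    // splitting one error into multiple log lines.
    println!("{err:?}");

    // Alternate Display formatting keeps the whole chain on one line:
    // "error in websocket connection: connection reset by peer".
    println!("{err:#}");
}
```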
--- proxy/src/http/websocket.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/proxy/src/http/websocket.rs b/proxy/src/http/websocket.rs index 071382cfb9f5a..794da1792940f 100644 --- a/proxy/src/http/websocket.rs +++ b/proxy/src/http/websocket.rs @@ -192,7 +192,7 @@ async fn ws_handler( if let Err(e) = serve_websocket(websocket, config, &cancel_map, session_id, host).await { - error!(session_id = ?session_id, "error in websocket connection: {e:?}"); + error!(session_id = ?session_id, "error in websocket connection: {e:#}"); } } .in_current_span(), @@ -221,6 +221,10 @@ async fn ws_handler( }, None => Value::Null, }; + error!( + ?code, + "sql-over-http per-client task finished with an error: {e:#}" + ); ( json!({ "message": message, "code": code }), HashMap::default(), From 9ffccb55f1443fd34145b066c04f75bb42229837 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 11 Aug 2023 18:01:02 +0200 Subject: [PATCH 13/22] InMemoryLayer: move end_lsn out of the lock (#4963) ## Problem In some places, the lock on `InMemoryLayerInner` is only created to obtain `end_lsn`. This is not needed however, if we move `end_lsn` to `InMemoryLayer` instead. ## Summary of changes Make `end_lsn` a member of `InMemoryLayer`, and do less locking of `InMemoryLayerInner`. `end_lsn` is changed from `Option` into an `OnceLock`. Thanks to this change, we don't need to lock any more in three functions. Part of #4743 . Suggested in https://github.com/neondatabase/neon/pull/4905#issuecomment-1666458428 . --- .../tenant/storage_layer/inmemory_layer.rs | 75 ++++++++----------- 1 file changed, 31 insertions(+), 44 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 3d222fcb1ea32..5b825eee6cfac 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -16,6 +16,7 @@ use anyhow::{ensure, Result}; use pageserver_api::models::InMemoryLayerInfo; use std::cell::RefCell; use std::collections::HashMap; +use std::sync::OnceLock; use tracing::*; use utils::{ bin_ser::BeSer, @@ -42,14 +43,16 @@ pub struct InMemoryLayer { tenant_id: TenantId, timeline_id: TimelineId, - /// /// This layer contains all the changes from 'start_lsn'. The /// start is inclusive. - /// start_lsn: Lsn, - /// The above fields never change. The parts that do change are in 'inner', - /// and protected by mutex. + /// Frozen layers have an exclusive end LSN. + /// Writes are only allowed when this is `None`. + end_lsn: OnceLock, + + /// The above fields never change, except for `end_lsn`, which is only set once. + /// All other changing parts are in `inner`, and protected by a mutex. inner: RwLock, } @@ -57,21 +60,16 @@ impl std::fmt::Debug for InMemoryLayer { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("InMemoryLayer") .field("start_lsn", &self.start_lsn) + .field("end_lsn", &self.end_lsn) .field("inner", &self.inner) .finish() } } pub struct InMemoryLayerInner { - /// Frozen layers have an exclusive end LSN. - /// Writes are only allowed when this is None - end_lsn: Option, - - /// /// All versions of all pages in the layer are kept here. Indexed /// by block number and LSN. The value is an offset into the /// ephemeral file where the page version is stored. - /// index: HashMap>, /// The values are stored in a serialized format in this file. 
@@ -82,15 +80,7 @@ pub struct InMemoryLayerInner { impl std::fmt::Debug for InMemoryLayerInner { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("InMemoryLayerInner") - .field("end_lsn", &self.end_lsn) - .finish() - } -} - -impl InMemoryLayerInner { - fn assert_writeable(&self) { - assert!(self.end_lsn.is_none()); + f.debug_struct("InMemoryLayerInner").finish() } } @@ -101,13 +91,21 @@ impl InMemoryLayer { pub fn info(&self) -> InMemoryLayerInfo { let lsn_start = self.start_lsn; - let lsn_end = self.inner.read().unwrap().end_lsn; - match lsn_end { - Some(lsn_end) => InMemoryLayerInfo::Frozen { lsn_start, lsn_end }, - None => InMemoryLayerInfo::Open { lsn_start }, + if let Some(&lsn_end) = self.end_lsn.get() { + InMemoryLayerInfo::Frozen { lsn_start, lsn_end } + } else { + InMemoryLayerInfo::Open { lsn_start } } } + + fn assert_writable(&self) { + assert!(self.end_lsn.get().is_none()); + } + + fn end_lsn_or_max(&self) -> Lsn { + self.end_lsn.get().copied().unwrap_or(Lsn::MAX) + } } #[async_trait::async_trait] @@ -117,14 +115,7 @@ impl Layer for InMemoryLayer { } fn get_lsn_range(&self) -> Range { - let inner = self.inner.read().unwrap(); - - let end_lsn = if let Some(end_lsn) = inner.end_lsn { - end_lsn - } else { - Lsn(u64::MAX) - }; - self.start_lsn..end_lsn + self.start_lsn..self.end_lsn_or_max() } fn is_incremental(&self) -> bool { @@ -136,11 +127,7 @@ impl Layer for InMemoryLayer { async fn dump(&self, verbose: bool, _ctx: &RequestContext) -> Result<()> { let inner = self.inner.read().unwrap(); - let end_str = inner - .end_lsn - .as_ref() - .map(Lsn::to_string) - .unwrap_or_default(); + let end_str = self.end_lsn_or_max(); println!( "----- in-memory layer for tli {} LSNs {}-{} ----", @@ -236,9 +223,7 @@ impl Layer for InMemoryLayer { impl std::fmt::Display for InMemoryLayer { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let inner = self.inner.read().unwrap(); - - let end_lsn = inner.end_lsn.unwrap_or(Lsn(u64::MAX)); + let end_lsn = self.end_lsn_or_max(); write!(f, "inmem-{:016X}-{:016X}", self.start_lsn.0, end_lsn.0) } } @@ -270,8 +255,8 @@ impl InMemoryLayer { timeline_id, tenant_id, start_lsn, + end_lsn: OnceLock::new(), inner: RwLock::new(InMemoryLayerInner { - end_lsn: None, index: HashMap::new(), file, }), @@ -285,7 +270,7 @@ impl InMemoryLayer { pub fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> { trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn); let mut inner = self.inner.write().unwrap(); - inner.assert_writeable(); + self.assert_writable(); let off = { SER_BUFFER.with(|x| -> Result<_> { @@ -317,10 +302,10 @@ impl InMemoryLayer { /// Records the end_lsn for non-dropped layers. /// `end_lsn` is exclusive pub fn freeze(&self, end_lsn: Lsn) { - let mut inner = self.inner.write().unwrap(); + let inner = self.inner.write().unwrap(); assert!(self.start_lsn < end_lsn); - inner.end_lsn = Some(end_lsn); + self.end_lsn.set(end_lsn).expect("end_lsn set only once"); for vec_map in inner.index.values() { for (lsn, _pos) in vec_map.as_slice() { @@ -344,12 +329,14 @@ impl InMemoryLayer { // rare though, so we just accept the potential latency hit for now. 
let inner = self.inner.read().unwrap(); + let end_lsn = *self.end_lsn.get().unwrap(); + let mut delta_layer_writer = DeltaLayerWriter::new( self.conf, self.timeline_id, self.tenant_id, Key::MIN, - self.start_lsn..inner.end_lsn.unwrap(), + self.start_lsn..end_lsn, )?; let mut buf = Vec::new(); From cc2d00fea4952d30865a193648fdad4e0574f1ef Mon Sep 17 00:00:00 2001 From: Felix Prasanna <91577249+fprasx@users.noreply.github.com> Date: Fri, 11 Aug 2023 14:26:53 -0400 Subject: [PATCH 14/22] bump vm-builder version to v0.15.4 (#4980) Patches a bug in vm-builder where it did not include enough parameters in the query string. These parameters are `host=localhost port=5432`. These parameters were not necessary for the monitor because the `pq` go postgres driver included them by default. --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 69b5e2262008e..853f5f291910e 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -814,7 +814,7 @@ jobs: run: shell: sh -eu {0} env: - VM_BUILDER_VERSION: v0.15.0-alpha1 + VM_BUILDER_VERSION: v0.15.4 steps: - name: Checkout From 81738135846e6226e743a9764193867e4222c722 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 8 Aug 2023 11:54:40 +0300 Subject: [PATCH 15/22] Add term=n option to safekeeper START_REPLICATION command. It allows term leader to ensure he pulls data from the correct term. Absense of it wasn't very problematic due to CRC checks, but let's be strict. walproposer still doesn't use it as we're going to remove recovery completely from it. --- safekeeper/src/handler.rs | 29 ++++++++++------ safekeeper/src/send_wal.rs | 27 +++++++++++++-- safekeeper/src/timeline.rs | 13 +++++++ test_runner/regress/test_wal_acceptor.py | 43 ++++++++++++++++++++++++ 4 files changed, 98 insertions(+), 14 deletions(-) diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index f11075a736cab..136d62b321026 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -2,8 +2,8 @@ //! protocol commands. use anyhow::Context; -use std::str; use std::str::FromStr; +use std::str::{self}; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, info_span, Instrument}; @@ -11,6 +11,7 @@ use crate::auth::check_permission; use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage}; use crate::metrics::{TrafficMetrics, PG_QUERIES_FINISHED, PG_QUERIES_RECEIVED}; +use crate::safekeeper::Term; use crate::timeline::TimelineError; use crate::wal_service::ConnectionId; use crate::{GlobalTimelines, SafeKeeperConf}; @@ -44,7 +45,7 @@ pub struct SafekeeperPostgresHandler { /// Parsed Postgres command. enum SafekeeperPostgresCommand { StartWalPush, - StartReplication { start_lsn: Lsn }, + StartReplication { start_lsn: Lsn, term: Option }, IdentifySystem, TimelineStatus, JSONCtrl { cmd: AppendLogicalMessage }, @@ -55,15 +56,21 @@ fn parse_cmd(cmd: &str) -> anyhow::Result { Ok(SafekeeperPostgresCommand::StartWalPush) } else if cmd.starts_with("START_REPLICATION") { let re = Regex::new( - r"START_REPLICATION(?: SLOT [^ ]+)?(?: PHYSICAL)? ([[:xdigit:]]+/[[:xdigit:]]+)", + // We follow postgres START_REPLICATION LOGICAL options to pass term. + r"START_REPLICATION(?: SLOT [^ ]+)?(?: PHYSICAL)? 
([[:xdigit:]]+/[[:xdigit:]]+)(?: \(term='(\d+)'\))?", ) .unwrap(); - let mut caps = re.captures_iter(cmd); - let start_lsn = caps - .next() - .map(|cap| Lsn::from_str(&cap[1])) - .context("parse start LSN from START_REPLICATION command")??; - Ok(SafekeeperPostgresCommand::StartReplication { start_lsn }) + let caps = re + .captures(cmd) + .context(format!("failed to parse START_REPLICATION command {}", cmd))?; + let start_lsn = + Lsn::from_str(&caps[1]).context("parse start LSN from START_REPLICATION command")?; + let term = if let Some(m) = caps.get(2) { + Some(m.as_str().parse::().context("invalid term")?) + } else { + None + }; + Ok(SafekeeperPostgresCommand::StartReplication { start_lsn, term }) } else if cmd.starts_with("IDENTIFY_SYSTEM") { Ok(SafekeeperPostgresCommand::IdentifySystem) } else if cmd.starts_with("TIMELINE_STATUS") { @@ -218,8 +225,8 @@ impl postgres_backend::Handler .instrument(info_span!("WAL receiver", ttid = %span_ttid)) .await } - SafekeeperPostgresCommand::StartReplication { start_lsn } => { - self.handle_start_replication(pgb, start_lsn) + SafekeeperPostgresCommand::StartReplication { start_lsn, term } => { + self.handle_start_replication(pgb, start_lsn, term) .instrument(info_span!("WAL sender", ttid = %span_ttid)) .await } diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index a0d4bb35d709a..808177d2c439a 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -2,6 +2,7 @@ //! with the "START_REPLICATION" message, and registry of walsenders. use crate::handler::SafekeeperPostgresHandler; +use crate::safekeeper::Term; use crate::timeline::Timeline; use crate::wal_service::ConnectionId; use crate::wal_storage::WalReader; @@ -359,8 +360,12 @@ impl SafekeeperPostgresHandler { &mut self, pgb: &mut PostgresBackend, start_pos: Lsn, + term: Option, ) -> Result<(), QueryError> { - if let Err(end) = self.handle_start_replication_guts(pgb, start_pos).await { + if let Err(end) = self + .handle_start_replication_guts(pgb, start_pos, term) + .await + { // Log the result and probably send it to the client, closing the stream. pgb.handle_copy_stream_end(end).await; } @@ -371,6 +376,7 @@ impl SafekeeperPostgresHandler { &mut self, pgb: &mut PostgresBackend, start_pos: Lsn, + term: Option, ) -> Result<(), CopyStreamHandlerEnd> { let appname = self.appname.clone(); let tli = @@ -440,6 +446,7 @@ impl SafekeeperPostgresHandler { start_pos, end_pos, stop_pos, + term, commit_lsn_watch_rx, ws_guard: ws_guard.clone(), wal_reader, @@ -476,6 +483,10 @@ struct WalSender<'a, IO> { // If present, terminate after reaching this position; used by walproposer // in recovery. stop_pos: Option, + /// When streaming uncommitted part, the term the client acts as the leader + /// in. Streaming is stopped if local term changes to a different (higher) + /// value. + term: Option, commit_lsn_watch_rx: Receiver, ws_guard: Arc, wal_reader: WalReader, @@ -518,8 +529,18 @@ impl WalSender<'_, IO> { .0 as usize; send_size = min(send_size, self.send_buf.len()); let send_buf = &mut self.send_buf[..send_size]; - // read wal into buffer - send_size = self.wal_reader.read(send_buf).await?; + let send_size: usize; + { + // If uncommitted part is being pulled, check that the term is + // still the expected one. + let _term_guard = if let Some(t) = self.term { + Some(self.tli.acquire_term(t).await?) + } else { + None + }; + // read wal into buffer + send_size = self.wal_reader.read(send_buf).await? 
+ }; let send_buf = &send_buf[..send_size]; // and send it diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 9ec89b86d4375..df51c45affbf5 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -499,6 +499,19 @@ impl Timeline { false } + /// Ensure taht current term is t, erroring otherwise, and lock the state. + pub async fn acquire_term(&self, t: Term) -> Result> { + let ss = self.write_shared_state().await; + if ss.sk.state.acceptor_state.term != t { + bail!( + "failed to acquire term {}, current term {}", + t, + ss.sk.state.acceptor_state.term + ); + } + Ok(ss) + } + /// Returns whether s3 offloading is required and sets current status as /// matching it. pub async fn wal_backup_attend(&self) -> bool { diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 87f076d236927..9132efe79f276 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -869,6 +869,49 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): assert debug_dump_1["config"]["id"] == env.safekeepers[0].id +class DummyConsumer(object): + def __call__(self, msg): + pass + + +def test_start_replication_term(neon_env_builder: NeonEnvBuilder): + """ + Test START_REPLICATION of uncommitted part specifying leader term. It must + error if safekeeper switched to different term. + """ + + env = neon_env_builder.init_start() + + env.neon_cli.create_branch("test_start_replication_term") + endpoint = env.endpoints.create_start("test_start_replication_term") + + endpoint.safe_psql("CREATE TABLE t(key int primary key, value text)") + + # learn neon timeline from compute + tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0]) + + sk = env.safekeepers[0] + sk_http_cli = sk.http_client() + tli_status = sk_http_cli.timeline_status(tenant_id, timeline_id) + timeline_start_lsn = tli_status.timeline_start_lsn + + conn_opts = { + "host": "127.0.0.1", + "options": f"-c timeline_id={timeline_id} tenant_id={tenant_id}", + "port": sk.port.pg, + "connection_factory": psycopg2.extras.PhysicalReplicationConnection, + } + sk_pg_conn = psycopg2.connect(**conn_opts) # type: ignore + with sk_pg_conn.cursor() as cur: + # should fail, as first start has term 2 + cur.start_replication_expert(f"START_REPLICATION {timeline_start_lsn} (term='3')") + dummy_consumer = DummyConsumer() + with pytest.raises(psycopg2.errors.InternalError_) as excinfo: + cur.consume_stream(dummy_consumer) + assert "failed to acquire term 3" in str(excinfo.value) + + # Test auth on WAL service (postgres protocol) ports. def test_sk_auth(neon_env_builder: NeonEnvBuilder): neon_env_builder.auth_enabled = True From 763f5c0641b3013b4aaf7bebec250a34cb486546 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Mon, 14 Aug 2023 10:49:51 +0200 Subject: [PATCH 16/22] Remove dead code from walproposer_utils.c (#4525) This code was mostly copied from walsender.c and the idea was to keep it similar to walsender.c, so that we can easily copy-paste future upstream changes to walsender.c to waproposer_utils.c, too. But right now I see that deleting it doesn't break anything, so it's better to remove unused parts. 
--- pgxn/neon/walproposer_utils.c | 657 ++++------------------------------ 1 file changed, 61 insertions(+), 596 deletions(-) diff --git a/pgxn/neon/walproposer_utils.c b/pgxn/neon/walproposer_utils.c index e1dcaa081d04f..9e1fc11756a20 100644 --- a/pgxn/neon/walproposer_utils.c +++ b/pgxn/neon/walproposer_utils.c @@ -37,68 +37,14 @@ static XLogSegNo walpropSegNo = 0; /* START cloned file-local variables and functions from walsender.c */ -/* - * xlogreader used for replication. Note that a WAL sender doing physical - * replication does not need xlogreader to read WAL, but it needs one to - * keep a state of its work. - */ -static XLogReaderState *xlogreader = NULL; - -/* - * These variables keep track of the state of the timeline we're currently - * sending. sendTimeLine identifies the timeline. If sendTimeLineIsHistoric, - * the timeline is not the latest timeline on this server, and the server's - * history forked off from that timeline at sendTimeLineValidUpto. - */ -static TimeLineID sendTimeLine = 0; -static TimeLineID sendTimeLineNextTLI = 0; -static bool sendTimeLineIsHistoric = false; -static XLogRecPtr sendTimeLineValidUpto = InvalidXLogRecPtr; - -/* - * Timestamp of last ProcessRepliesIfAny() that saw a reply from the - * standby. Set to 0 if wal_sender_timeout doesn't need to be active. - */ -static TimestampTz last_reply_timestamp = 0; - -/* Have we sent a heartbeat message asking for reply, since last reply? */ -static bool waiting_for_ping_response = false; - -static bool streamingDoneSending; -static bool streamingDoneReceiving; - -/* Are we there yet? */ -static bool WalSndCaughtUp = false; - -/* Flags set by signal handlers for later service in main loop */ -static volatile sig_atomic_t got_STOPPING = false; - /* * How far have we sent WAL already? This is also advertised in * MyWalSnd->sentPtr. (Actually, this is the next WAL location to send.) */ static XLogRecPtr sentPtr = InvalidXLogRecPtr; -/* - * This is set while we are streaming. When not set - * PROCSIG_WALSND_INIT_STOPPING signal will be handled like SIGTERM. When set, - * the main loop is responsible for checking got_STOPPING and terminating when - * it's set (after streaming any remaining WAL). - */ -static volatile sig_atomic_t replication_active = false; - -typedef void (*WalSndSendDataCallback) (void); -static void WalSndLoop(WalSndSendDataCallback send_data); -static void XLogSendPhysical(void); -#if PG_VERSION_NUM >= 150000 -static XLogRecPtr GetStandbyFlushRecPtr(TimeLineID *tli); -#else -static XLogRecPtr GetStandbyFlushRecPtr(void); -#endif - -static void WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo, - TimeLineID *tli_p); - +static void WalSndLoop(void); +static void XLogBroadcastWalProposer(void); /* END cloned file-level variables and functions from walsender.c */ int @@ -506,7 +452,7 @@ XLogWalPropClose(XLogRecPtr recptr) /* START of cloned functions from walsender.c */ /* - * Handle START_REPLICATION command. + * Subscribe for new WAL and stream it in the loop to safekeepers. * * At the moment, this never returns, but an ereport(ERROR) will take us back * to the main loop. 
@@ -524,18 +470,6 @@ StartProposerReplication(StartReplicationCmd *cmd) errmsg("IDENTIFY_SYSTEM has not been run before START_REPLICATION"))); #endif - /* create xlogreader for physical replication */ - xlogreader = - XLogReaderAllocate(wal_segment_size, NULL, - XL_ROUTINE(.segment_open = WalSndSegmentOpen, - .segment_close = wal_segment_close), - NULL); - - if (!xlogreader) - ereport(ERROR, - (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of memory"))); - /* * We assume here that we're logging enough information in the WAL for * log-shipping, since this is checked in PostmasterMain(). @@ -569,341 +503,61 @@ StartProposerReplication(StartReplicationCmd *cmd) * we keep this code around to lighten the load for when we need it. */ #if PG_VERSION_NUM >= 150000 - if (am_cascading_walsender) - { - /* this also updates ThisTimeLineID */ - FlushPtr = GetStandbyFlushRecPtr(&currTLI); - } - else - FlushPtr = GetFlushRecPtr(&currTLI); + FlushPtr = GetFlushRecPtr(&currTLI); #else - if (am_cascading_walsender) - { - /* this also updates ThisTimeLineID */ - FlushPtr = GetStandbyFlushRecPtr(); - } - else - FlushPtr = GetFlushRecPtr(); - + FlushPtr = GetFlushRecPtr(); currTLI = ThisTimeLineID; #endif + /* + * When we first start replication the standby will be behind the + * primary. For some applications, for example synchronous + * replication, it is important to have a clear state for this initial + * catchup mode, so we can trigger actions when we change streaming + * state later. We may stay in this state for a long time, which is + * exactly why we want to be able to monitor whether or not we are + * still here. + */ + WalSndSetState(WALSNDSTATE_CATCHUP); - if (cmd->timeline != 0) - { - XLogRecPtr switchpoint; - - sendTimeLine = cmd->timeline; - if (sendTimeLine == currTLI) - { - sendTimeLineIsHistoric = false; - sendTimeLineValidUpto = InvalidXLogRecPtr; - } - else - { - List *timeLineHistory; - - sendTimeLineIsHistoric = true; - - /* - * Check that the timeline the client requested exists, and the - * requested start location is on that timeline. - */ - timeLineHistory = readTimeLineHistory(currTLI); - switchpoint = tliSwitchPoint(cmd->timeline, timeLineHistory, - &sendTimeLineNextTLI); - list_free_deep(timeLineHistory); - - /* - * Found the requested timeline in the history. Check that - * requested startpoint is on that timeline in our history. - * - * This is quite loose on purpose. We only check that we didn't - * fork off the requested timeline before the switchpoint. We - * don't check that we switched *to* it before the requested - * starting point. This is because the client can legitimately - * request to start replication from the beginning of the WAL - * segment that contains switchpoint, but on the new timeline, so - * that it doesn't end up with a partial segment. If you ask for - * too old a starting point, you'll get an error later when we - * fail to find the requested WAL segment in pg_wal. - * - * XXX: we could be more strict here and only allow a startpoint - * that's older than the switchpoint, if it's still in the same - * WAL segment. 
- */ - if (!XLogRecPtrIsInvalid(switchpoint) && - switchpoint < cmd->startpoint) - { - ereport(ERROR, - (errmsg("requested starting point %X/%X on timeline %u is not in this server's history", - LSN_FORMAT_ARGS(cmd->startpoint), - cmd->timeline), - errdetail("This server's history forked from timeline %u at %X/%X.", - cmd->timeline, - LSN_FORMAT_ARGS(switchpoint)))); - } - sendTimeLineValidUpto = switchpoint; - } - } - else + /* + * Don't allow a request to stream from a future point in WAL that + * hasn't been flushed to disk in this server yet. + */ + if (FlushPtr < cmd->startpoint) { - sendTimeLine = currTLI; - sendTimeLineValidUpto = InvalidXLogRecPtr; - sendTimeLineIsHistoric = false; + ereport(ERROR, + (errmsg("requested starting point %X/%X is ahead of the WAL flush position of this server %X/%X", + LSN_FORMAT_ARGS(cmd->startpoint), + LSN_FORMAT_ARGS(FlushPtr)))); } - streamingDoneSending = streamingDoneReceiving = false; + /* Start streaming from the requested point */ + sentPtr = cmd->startpoint; - /* If there is nothing to stream, don't even enter COPY mode */ - if (!sendTimeLineIsHistoric || cmd->startpoint < sendTimeLineValidUpto) - { - /* - * When we first start replication the standby will be behind the - * primary. For some applications, for example synchronous - * replication, it is important to have a clear state for this initial - * catchup mode, so we can trigger actions when we change streaming - * state later. We may stay in this state for a long time, which is - * exactly why we want to be able to monitor whether or not we are - * still here. - */ - WalSndSetState(WALSNDSTATE_CATCHUP); - - /* - * Don't allow a request to stream from a future point in WAL that - * hasn't been flushed to disk in this server yet. - */ - if (FlushPtr < cmd->startpoint) - { - ereport(ERROR, - (errmsg("requested starting point %X/%X is ahead of the WAL flush position of this server %X/%X", - LSN_FORMAT_ARGS(cmd->startpoint), - LSN_FORMAT_ARGS(FlushPtr)))); - } + /* Initialize shared memory status, too */ + SpinLockAcquire(&MyWalSnd->mutex); + MyWalSnd->sentPtr = sentPtr; + SpinLockRelease(&MyWalSnd->mutex); - /* Start streaming from the requested point */ - sentPtr = cmd->startpoint; + SyncRepInitConfig(); - /* Initialize shared memory status, too */ - SpinLockAcquire(&MyWalSnd->mutex); - MyWalSnd->sentPtr = sentPtr; - SpinLockRelease(&MyWalSnd->mutex); + /* Infinite send loop, never returns */ + WalSndLoop(); - SyncRepInitConfig(); - - /* Main loop of walsender */ - replication_active = true; - - WalSndLoop(XLogSendPhysical); - - replication_active = false; - if (got_STOPPING) - proc_exit(0); - WalSndSetState(WALSNDSTATE_STARTUP); - - Assert(streamingDoneSending && streamingDoneReceiving); - } + WalSndSetState(WALSNDSTATE_STARTUP); if (cmd->slotname) ReplicationSlotRelease(); - - /* - * Copy is finished now. Send a single-row result set indicating the next - * timeline. - */ - if (sendTimeLineIsHistoric) - { - char startpos_str[8 + 1 + 8 + 1]; - DestReceiver *dest; - TupOutputState *tstate; - TupleDesc tupdesc; - Datum values[2]; - bool nulls[2]; - - snprintf(startpos_str, sizeof(startpos_str), "%X/%X", - LSN_FORMAT_ARGS(sendTimeLineValidUpto)); - - dest = CreateDestReceiver(DestRemoteSimple); - MemSet(nulls, false, sizeof(nulls)); - - /* - * Need a tuple descriptor representing two columns. int8 may seem - * like a surprising data type for this, but in theory int4 would not - * be wide enough for this, as TimeLineID is unsigned. 
- */ - tupdesc = CreateTemplateTupleDesc(2); - TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 1, "next_tli", - INT8OID, -1, 0); - TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 2, "next_tli_startpos", - TEXTOID, -1, 0); - - /* prepare for projection of tuple */ - tstate = begin_tup_output_tupdesc(dest, tupdesc, &TTSOpsVirtual); - - values[0] = Int64GetDatum((int64) sendTimeLineNextTLI); - values[1] = CStringGetTextDatum(startpos_str); - - /* send it to dest */ - do_tup_output(tstate, values, nulls); - - end_tup_output(tstate); - } - - /* Send CommandComplete message */ - EndReplicationCommand("START_STREAMING"); } -#if PG_VERSION_NUM >= 150000 -static XLogRecPtr -GetStandbyFlushRecPtr(TimeLineID *tli) -{ - XLogRecPtr replayPtr; - TimeLineID replayTLI; - XLogRecPtr receivePtr; - TimeLineID receiveTLI; - XLogRecPtr result; - - /* - * We can safely send what's already been replayed. Also, if walreceiver - * is streaming WAL from the same timeline, we can send anything that it - * has streamed, but hasn't been replayed yet. - */ - - receivePtr = GetWalRcvFlushRecPtr(NULL, &receiveTLI); - replayPtr = GetXLogReplayRecPtr(&replayTLI); - - *tli = replayTLI; - - result = replayPtr; - if (receiveTLI == replayTLI && receivePtr > replayPtr) - result = receivePtr; - - return result; -} -#else /* - * Returns the latest point in WAL that has been safely flushed to disk, and - * can be sent to the standby. This should only be called when in recovery, - * ie. we're streaming to a cascaded standby. - * - * As a side-effect, ThisTimeLineID is updated to the TLI of the last - * replayed WAL record. + * Main loop that waits for LSN updates and calls the walproposer. + * Synchronous replication sets latch in WalSndWakeup at walsender.c */ -static XLogRecPtr -GetStandbyFlushRecPtr(void) -{ - XLogRecPtr replayPtr; - TimeLineID replayTLI; - XLogRecPtr receivePtr; - TimeLineID receiveTLI; - XLogRecPtr result; - - /* - * We can safely send what's already been replayed. Also, if walreceiver - * is streaming WAL from the same timeline, we can send anything that it - * has streamed, but hasn't been replayed yet. - */ - - receivePtr = GetWalRcvFlushRecPtr(NULL, &receiveTLI); - replayPtr = GetXLogReplayRecPtr(&replayTLI); - - ThisTimeLineID = replayTLI; - - result = replayPtr; - if (receiveTLI == ThisTimeLineID && receivePtr > replayPtr) - result = receivePtr; - - return result; -} -#endif - - - -/* XLogReaderRoutine->segment_open callback */ -static void -WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo, - TimeLineID *tli_p) -{ - char path[MAXPGPATH]; - - /*------- - * When reading from a historic timeline, and there is a timeline switch - * within this segment, read from the WAL segment belonging to the new - * timeline. - * - * For example, imagine that this server is currently on timeline 5, and - * we're streaming timeline 4. The switch from timeline 4 to 5 happened at - * 0/13002088. In pg_wal, we have these files: - * - * ... - * 000000040000000000000012 - * 000000040000000000000013 - * 000000050000000000000013 - * 000000050000000000000014 - * ... - * - * In this situation, when requested to send the WAL from segment 0x13, on - * timeline 4, we read the WAL from file 000000050000000000000013. Archive - * recovery prefers files from newer timelines, so if the segment was - * restored from the archive on this server, the file belonging to the old - * timeline, 000000040000000000000013, might not exist. 
Their contents are - * equal up to the switchpoint, because at a timeline switch, the used - * portion of the old segment is copied to the new file. ------- - */ - *tli_p = sendTimeLine; - if (sendTimeLineIsHistoric) - { - XLogSegNo endSegNo; - - XLByteToSeg(sendTimeLineValidUpto, endSegNo, state->segcxt.ws_segsize); - if (nextSegNo == endSegNo) - *tli_p = sendTimeLineNextTLI; - } - - XLogFilePath(path, *tli_p, nextSegNo, state->segcxt.ws_segsize); - state->seg.ws_file = BasicOpenFile(path, O_RDONLY | PG_BINARY); - if (state->seg.ws_file >= 0) - return; - - /* - * If the file is not found, assume it's because the standby asked for a - * too old WAL segment that has already been removed or recycled. - */ - if (errno == ENOENT) - { - char xlogfname[MAXFNAMELEN]; - int save_errno = errno; - - XLogFileName(xlogfname, *tli_p, nextSegNo, wal_segment_size); - errno = save_errno; - ereport(ERROR, - (errcode_for_file_access(), - errmsg("requested WAL segment %s has already been removed", - xlogfname))); - } - else - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not open file \"%s\": %m", - path))); -} - - -/* Main loop of walsender process that streams the WAL over Copy messages. */ static void -WalSndLoop(WalSndSendDataCallback send_data) +WalSndLoop(void) { - /* - * Initialize the last reply timestamp. That enables timeout processing - * from hereon. - */ - last_reply_timestamp = GetCurrentTimestamp(); - waiting_for_ping_response = false; - - /* - * Loop until we reach the end of this timeline or the client requests to - * stop streaming. - */ for (;;) { /* Clear any already-pending wakeups */ @@ -911,153 +565,41 @@ WalSndLoop(WalSndSendDataCallback send_data) CHECK_FOR_INTERRUPTS(); - /* Process any requests or signals received recently */ - if (ConfigReloadPending) - { - ConfigReloadPending = false; - ProcessConfigFile(PGC_SIGHUP); - SyncRepInitConfig(); - } + XLogBroadcastWalProposer(); - /* always true */ - if (am_wal_proposer) - { - send_data(); - if (WalSndCaughtUp) - { - if (MyWalSnd->state == WALSNDSTATE_CATCHUP) - WalSndSetState(WALSNDSTATE_STREAMING); - WalProposerPoll(); - WalSndCaughtUp = false; - } - continue; - } + if (MyWalSnd->state == WALSNDSTATE_CATCHUP) + WalSndSetState(WALSNDSTATE_STREAMING); + WalProposerPoll(); } } /* - * Send out the WAL in its normal physical/stored form. - * - * Read up to MAX_SEND_SIZE bytes of WAL that's been flushed to disk, - * but not yet sent to the client, and buffer it in the libpq output - * buffer. - * - * If there is no unsent WAL remaining, WalSndCaughtUp is set to true, - * otherwise WalSndCaughtUp is set to false. + * Notify walproposer about the new WAL position. */ static void -XLogSendPhysical(void) +XLogBroadcastWalProposer(void) { - XLogRecPtr SendRqstPtr; XLogRecPtr startptr; XLogRecPtr endptr; - Size nbytes PG_USED_FOR_ASSERTS_ONLY; - TimeLineID currTLI; - - /* If requested switch the WAL sender to the stopping state. */ - if (got_STOPPING) - WalSndSetState(WALSNDSTATE_STOPPING); - - if (streamingDoneSending) - { - WalSndCaughtUp = true; - return; - } - /* Figure out how far we can safely send the WAL. */ - if (sendTimeLineIsHistoric) - { - /* - * Streaming an old timeline that's in this server's history, but is - * not the one we're currently inserting or replaying. It can be - * streamed up to the point where we switched off that timeline. - */ - SendRqstPtr = sendTimeLineValidUpto; - } - else if (am_cascading_walsender) - { - /* - * Streaming the latest timeline on a standby. 
- * - * Attempt to send all WAL that has already been replayed, so that we - * know it's valid. If we're receiving WAL through streaming - * replication, it's also OK to send any WAL that has been received - * but not replayed. - * - * The timeline we're recovering from can change, or we can be - * promoted. In either case, the current timeline becomes historic. We - * need to detect that so that we don't try to stream past the point - * where we switched to another timeline. We check for promotion or - * timeline switch after calculating FlushPtr, to avoid a race - * condition: if the timeline becomes historic just after we checked - * that it was still current, it's still be OK to stream it up to the - * FlushPtr that was calculated before it became historic. - */ - bool becameHistoric = false; -#if PG_VERSION_NUM >= 150000 - SendRqstPtr = GetStandbyFlushRecPtr(&currTLI); -#else - SendRqstPtr = GetStandbyFlushRecPtr(); - currTLI = ThisTimeLineID; -#endif - if (!RecoveryInProgress()) - { - /* - * We have been promoted. RecoveryInProgress() updated - * ThisTimeLineID to the new current timeline. - */ - am_cascading_walsender = false; - becameHistoric = true; - } - else - { - /* - * Still a cascading standby. But is the timeline we're sending - * still the one recovery is recovering from? currTLI was updated - * by the GetStandbyFlushRecPtr() call above. - */ - if (sendTimeLine != currTLI) - becameHistoric = true; - } - - if (becameHistoric) - { - /* - * The timeline we were sending has become historic. Read the - * timeline history file of the new timeline to see where exactly - * we forked off from the timeline we were sending. - */ - List *history; - - history = readTimeLineHistory(currTLI); - sendTimeLineValidUpto = tliSwitchPoint(sendTimeLine, history, &sendTimeLineNextTLI); - - Assert(sendTimeLine < sendTimeLineNextTLI); - list_free_deep(history); - - sendTimeLineIsHistoric = true; + /* Start from the last sent position */ + startptr = sentPtr; - SendRqstPtr = sendTimeLineValidUpto; - } - } - else - { - /* - * Streaming the current timeline on a primary. - * - * Attempt to send all data that's already been written out and - * fsync'd to disk. We cannot go further than what's been written out - * given the current implementation of WALRead(). And in any case - * it's unsafe to send WAL that is not securely down to disk on the - * primary: if the primary subsequently crashes and restarts, standbys - * must not have applied any WAL that got lost on the primary. - */ + /* + * Streaming the current timeline on a primary. + * + * Attempt to send all data that's already been written out and + * fsync'd to disk. We cannot go further than what's been written out + * given the current implementation of WALRead(). And in any case + * it's unsafe to send WAL that is not securely down to disk on the + * primary: if the primary subsequently crashes and restarts, standbys + * must not have applied any WAL that got lost on the primary. + */ #if PG_VERSION_NUM >= 150000 - SendRqstPtr = GetFlushRecPtr(NULL); + endptr = GetFlushRecPtr(NULL); #else - SendRqstPtr = GetFlushRecPtr(); + endptr = GetFlushRecPtr(); #endif - } /* * Record the current system time as an approximation of the time at which @@ -1083,91 +625,14 @@ XLogSendPhysical(void) * that arbitrary LSN is eventually reported as written, flushed and * applied, so that it can measure the elapsed time. 
*/ - LagTrackerWrite(SendRqstPtr, GetCurrentTimestamp()); - - /* - * If this is a historic timeline and we've reached the point where we - * forked to the next timeline, stop streaming. - * - * Note: We might already have sent WAL > sendTimeLineValidUpto. The - * startup process will normally replay all WAL that has been received - * from the primary, before promoting, but if the WAL streaming is - * terminated at a WAL page boundary, the valid portion of the timeline - * might end in the middle of a WAL record. We might've already sent the - * first half of that partial WAL record to the cascading standby, so that - * sentPtr > sendTimeLineValidUpto. That's OK; the cascading standby can't - * replay the partial WAL record either, so it can still follow our - * timeline switch. - */ - if (sendTimeLineIsHistoric && sendTimeLineValidUpto <= sentPtr) - { - /* close the current file. */ - if (xlogreader->seg.ws_file >= 0) - wal_segment_close(xlogreader); - - /* Send CopyDone */ - pq_putmessage_noblock('c', NULL, 0); - streamingDoneSending = true; - - WalSndCaughtUp = true; - - elog(DEBUG1, "walsender reached end of timeline at %X/%X (sent up to %X/%X)", - LSN_FORMAT_ARGS(sendTimeLineValidUpto), - LSN_FORMAT_ARGS(sentPtr)); - return; - } + LagTrackerWrite(endptr, GetCurrentTimestamp()); /* Do we have any work to do? */ - Assert(sentPtr <= SendRqstPtr); - if (SendRqstPtr <= sentPtr) - { - WalSndCaughtUp = true; + Assert(startptr <= endptr); + if (endptr <= startptr) return; - } - - /* - * Figure out how much to send in one message. If there's no more than - * MAX_SEND_SIZE bytes to send, send everything. Otherwise send - * MAX_SEND_SIZE bytes, but round back to logfile or page boundary. - * - * The rounding is not only for performance reasons. Walreceiver relies on - * the fact that we never split a WAL record across two messages. Since a - * long WAL record is split at page boundary into continuation records, - * page boundary is always a safe cut-off point. We also assume that - * SendRqstPtr never points to the middle of a WAL record. - */ - startptr = sentPtr; - endptr = startptr; - endptr += MAX_SEND_SIZE; - /* if we went beyond SendRqstPtr, back off */ - if (SendRqstPtr <= endptr) - { - endptr = SendRqstPtr; - if (sendTimeLineIsHistoric) - WalSndCaughtUp = false; - else - WalSndCaughtUp = true; - } - else - { - /* round down to page boundary. */ - endptr -= (endptr % XLOG_BLCKSZ); - WalSndCaughtUp = false; - } - - nbytes = endptr - startptr; - Assert(nbytes <= MAX_SEND_SIZE); - - /* always true */ - if (am_wal_proposer) - { - WalProposerBroadcast(startptr, endptr); - } - else - { - /* code removed for brevity */ - } + WalProposerBroadcast(startptr, endptr); sentPtr = endptr; /* Update shared memory status */ From d3a97fdf881e69636f29f462efb0a3b799b854c8 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 14 Aug 2023 10:18:22 +0100 Subject: [PATCH 17/22] pageserver: avoid incrementing access time when reading layers for compaction (#4971) ## Problem Currently, image generation reads delta layers before writing out subsequent image layers, which updates the access time of the delta layers and effectively puts them at the back of the queue for eviction. This is the opposite of what we want, because after a delta layer is covered by a later image layer, it's likely that subsequent reads of latest data will hit the image rather than the delta layer, so the delta layer should be quite a good candidate for eviction. 
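As a rough sketch of the intended usage (the context types are the ones added in the diff below; the surrounding function name is hypothetical), a maintenance task derives a child context that opts out of access-time updates before reading layers:

```rust
// Illustrative sketch only. `RequestContextBuilder`, `AccessStatsBehavior` and
// the builder methods are introduced by this patch; `compaction_pass` is a
// made-up stand-in for the real compaction entry point.
use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};

async fn compaction_pass(ctx: &RequestContext) {
    // Inherit the task kind and download behavior of the caller's context,
    // but mark layer accesses as not counting toward eviction ordering.
    let image_ctx = RequestContextBuilder::extend(ctx)
        .access_stats_behavior(AccessStatsBehavior::Skip)
        .build();

    // Layer reads performed with `image_ctx` still reach
    // `LayerAccessStats::record_access`, which now returns early when the
    // behavior is `Skip`, leaving the layer's last-access time untouched.
    // ... read delta layers and write image layers using &image_ctx ...
}
```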
## Summary of changes `RequestContext` gets a new `ATimeBehavior` field, and a `RequestContextBuilder` helper so that we can optionally add the new field without growing `RequestContext::new` every time we add something like this. Request context is passed into the `record_access` function, and the access time is not updated if `ATimeBehavior::Skip` is set. The compaction background task constructs its request context with this skip policy. Closes: https://github.com/neondatabase/neon/issues/4969 --- pageserver/src/context.rs | 79 +++++++++++++++++-- pageserver/src/tenant/storage_layer.rs | 12 ++- .../src/tenant/storage_layer/delta_layer.rs | 3 +- .../src/tenant/storage_layer/image_layer.rs | 3 +- pageserver/src/tenant/timeline.rs | 11 ++- 5 files changed, 90 insertions(+), 18 deletions(-) diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs index a1a5c30ae9aeb..2953208d1e9f2 100644 --- a/pageserver/src/context.rs +++ b/pageserver/src/context.rs @@ -85,6 +85,7 @@ //! The solution is that all code paths are infected with precisely one //! [`RequestContext`] argument. Functions in the middle of the call chain //! only need to pass it on. + use crate::task_mgr::TaskKind; // The main structure of this module, see module-level comment. @@ -92,6 +93,7 @@ use crate::task_mgr::TaskKind; pub struct RequestContext { task_kind: TaskKind, download_behavior: DownloadBehavior, + access_stats_behavior: AccessStatsBehavior, } /// Desired behavior if the operation requires an on-demand download @@ -109,6 +111,67 @@ pub enum DownloadBehavior { Error, } +/// Whether this request should update access times used in LRU eviction +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +pub(crate) enum AccessStatsBehavior { + /// Update access times: this request's access to data should be taken + /// as a hint that the accessed layer is likely to be accessed again + Update, + + /// Do not update access times: this request is accessing the layer + /// but does not want to indicate that the layer should be retained in cache, + /// perhaps because the requestor is a compaction routine that will soon cover + /// this layer with another. + Skip, +} + +pub struct RequestContextBuilder { + inner: RequestContext, +} + +impl RequestContextBuilder { + /// A new builder with default settings + pub fn new(task_kind: TaskKind) -> Self { + Self { + inner: RequestContext { + task_kind, + download_behavior: DownloadBehavior::Download, + access_stats_behavior: AccessStatsBehavior::Update, + }, + } + } + + pub fn extend(original: &RequestContext) -> Self { + Self { + // This is like a Copy, but avoid implementing Copy because ordinary users of + // RequestContext should always move or ref it. + inner: RequestContext { + task_kind: original.task_kind, + download_behavior: original.download_behavior, + access_stats_behavior: original.access_stats_behavior, + }, + } + } + + /// Configure the DownloadBehavior of the context: whether to + /// download missing layers, and/or warn on the download. + pub fn download_behavior(mut self, b: DownloadBehavior) -> Self { + self.inner.download_behavior = b; + self + } + + /// Configure the AccessStatsBehavior of the context: whether layer + /// accesses should update the access time of the layer. + pub(crate) fn access_stats_behavior(mut self, b: AccessStatsBehavior) -> Self { + self.inner.access_stats_behavior = b; + self + } + + pub fn build(self) -> RequestContext { + self.inner + } +} + impl RequestContext { /// Create a new RequestContext that has no parent. 
/// @@ -123,10 +186,9 @@ impl RequestContext { /// because someone explicitly canceled it. /// It has no parent, so it cannot inherit cancellation from there. pub fn new(task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self { - RequestContext { - task_kind, - download_behavior, - } + RequestContextBuilder::new(task_kind) + .download_behavior(download_behavior) + .build() } /// Create a detached child context for a task that may outlive `self`. @@ -187,10 +249,7 @@ impl RequestContext { } fn child_impl(&self, task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self { - RequestContext { - task_kind, - download_behavior, - } + Self::new(task_kind, download_behavior) } pub fn task_kind(&self) -> TaskKind { @@ -200,4 +259,8 @@ impl RequestContext { pub fn download_behavior(&self) -> DownloadBehavior { self.download_behavior } + + pub(crate) fn access_stats_behavior(&self) -> AccessStatsBehavior { + self.access_stats_behavior + } } diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 951fa38d8d49c..4107161aa1839 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -8,7 +8,7 @@ mod layer_desc; mod remote_layer; use crate::config::PageServerConf; -use crate::context::RequestContext; +use crate::context::{AccessStatsBehavior, RequestContext}; use crate::repository::Key; use crate::task_mgr::TaskKind; use crate::walrecord::NeonWalRecord; @@ -241,10 +241,14 @@ impl LayerAccessStats { }); } - fn record_access(&self, access_kind: LayerAccessKind, task_kind: TaskKind) { + fn record_access(&self, access_kind: LayerAccessKind, ctx: &RequestContext) { + if ctx.access_stats_behavior() == AccessStatsBehavior::Skip { + return; + } + let this_access = LayerAccessStatFullDetails { when: SystemTime::now(), - task_kind, + task_kind: ctx.task_kind(), access_kind, }; @@ -252,7 +256,7 @@ impl LayerAccessStats { locked.iter_mut().for_each(|inner| { inner.first_access.get_or_insert(this_access); inner.count_by_access_kind[access_kind] += 1; - inner.task_kind_flag |= task_kind; + inner.task_kind_flag |= ctx.task_kind(); inner.last_accesses.write(this_access); }) } diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 5208fdb7b38d1..9a7108c4c8653 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -452,8 +452,7 @@ impl DeltaLayer { access_kind: LayerAccessKind, ctx: &RequestContext, ) -> Result<&Arc> { - self.access_stats - .record_access(access_kind, ctx.task_kind()); + self.access_stats.record_access(access_kind, ctx); // Quick exit if already loaded self.inner .get_or_try_init(|| self.load_inner()) diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 2824abba754b4..2d5ef402ae2dd 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -323,8 +323,7 @@ impl ImageLayer { access_kind: LayerAccessKind, ctx: &RequestContext, ) -> Result<&ImageLayerInner> { - self.access_stats - .record_access(access_kind, ctx.task_kind()); + self.access_stats.record_access(access_kind, ctx); self.inner .get_or_try_init(|| self.load_inner()) .await diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 2a5db0516a968..95e86c6bcc4c4 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ 
-35,7 +35,9 @@ use std::sync::atomic::Ordering as AtomicOrdering; use std::sync::{Arc, Mutex, RwLock, Weak}; use std::time::{Duration, Instant, SystemTime}; -use crate::context::{DownloadBehavior, RequestContext}; +use crate::context::{ + AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder, +}; use crate::tenant::remote_timeline_client::{self, index::LayerFileMetadata}; use crate::tenant::storage_layer::{ DeltaFileName, DeltaLayerWriter, ImageFileName, ImageLayerWriter, InMemoryLayer, @@ -799,10 +801,15 @@ impl Timeline { .await { Ok((partitioning, lsn)) => { + // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them + let image_ctx = RequestContextBuilder::extend(ctx) + .access_stats_behavior(AccessStatsBehavior::Skip) + .build(); + // 2. Create new image layers for partitions that have been modified // "enough". let layer_paths_to_upload = self - .create_image_layers(&partitioning, lsn, false, ctx) + .create_image_layers(&partitioning, lsn, false, &image_ctx) .await .map_err(anyhow::Error::from)?; if let Some(remote_client) = &self.remote_client { From 49c57c0b13b0a2dfa28ec17c82cc7fe6e10b2479 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Sun, 13 Aug 2023 15:23:47 +0300 Subject: [PATCH 18/22] Add neon_local to docker image. People sometimes ask about this. https://community.neon.tech/t/is-the-neon-local-binary-in-any-of-the-official-docker-images/360/2 --- Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Dockerfile b/Dockerfile index 9467e41ae40f1..1c447b2db99a3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -51,6 +51,7 @@ RUN set -e \ --bin safekeeper \ --bin storage_broker \ --bin proxy \ + --bin neon_local \ --locked --release \ && cachepot -s @@ -76,6 +77,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin +COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/ COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/ From 4626d89edaa004a3fb86f241169da07d333e6fad Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Mon, 14 Aug 2023 17:16:49 +0300 Subject: [PATCH 19/22] Harden retries on tenant/timeline deletion path. (#4973) Originated from test failure where we got SlowDown error from s3. The patch generalizes `download_retry` to not be download specific. Resulting `retry` function is moved to utils crate. `download_retries` is now a thin wrapper around this `retry` function. To ensure that all needed retries are in place test code now uses `test_remote_failures=1` setting. 
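For orientation, here is a minimal sketch of the call shape the generalized helper expects (it mirrors the `backoff::retry` signature and the warn-threshold/retry-count pattern used by the call sites in the diff below; the flaky operation and the literal limits are illustrative, not production values):

```rust
// Illustrative sketch only, assuming the new `utils::backoff` module is in scope.
use std::io;
use std::sync::atomic::{AtomicU32, Ordering};

use utils::backoff;

async fn example() -> Result<(), io::Error> {
    let attempts = AtomicU32::new(0);
    backoff::retry(
        // The closure is re-invoked on every attempt and returns a fresh future.
        || async {
            if attempts.fetch_add(1, Ordering::SeqCst) < 2 {
                // Pretend the first two attempts hit a transient failure.
                Err(io::Error::new(io::ErrorKind::Other, "transient failure"))
            } else {
                Ok(())
            }
        },
        // Nothing is treated as permanent here, so every failure is retried
        // (with exponential backoff between attempts) until the budget runs out.
        |_e| false,
        3,  // start logging retries at WARN level after this many attempts
        10, // give up after this many attempts
        "example operation",
    )
    .await
}
```

Call sites that do know about permanent failures express them through the `is_permanent` predicate instead; for example, `download_retry` stops immediately on `DownloadError::BadInput` and `DownloadError::NotFound` rather than burning retries on them.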
Ref https://neondb.slack.com/archives/C059ZC138NR/p1691743624353009 --- libs/utils/src/backoff.rs | 188 ++++++++++++++++++ libs/utils/src/lib.rs | 2 + pageserver/src/lib.rs | 53 ----- pageserver/src/tenant/delete.rs | 79 +++++--- .../src/tenant/remote_timeline_client.rs | 72 +++++-- .../tenant/remote_timeline_client/download.rs | 55 +---- .../walreceiver/connection_manager.rs | 4 +- test_runner/fixtures/pageserver/utils.py | 2 +- test_runner/regress/test_remote_storage.py | 6 +- test_runner/regress/test_tenant_delete.py | 18 +- 10 files changed, 324 insertions(+), 155 deletions(-) create mode 100644 libs/utils/src/backoff.rs diff --git a/libs/utils/src/backoff.rs b/libs/utils/src/backoff.rs new file mode 100644 index 0000000000000..4507efe64701c --- /dev/null +++ b/libs/utils/src/backoff.rs @@ -0,0 +1,188 @@ +use std::fmt::{Debug, Display}; + +use futures::Future; + +pub const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1; +pub const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0; + +pub async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) { + let backoff_duration_seconds = + exponential_backoff_duration_seconds(n, base_increment, max_seconds); + if backoff_duration_seconds > 0.0 { + tracing::info!( + "Backoff: waiting {backoff_duration_seconds} seconds before processing with the task", + ); + tokio::time::sleep(std::time::Duration::from_secs_f64(backoff_duration_seconds)).await; + } +} + +pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 { + if n == 0 { + 0.0 + } else { + (1.0 + base_increment).powf(f64::from(n)).min(max_seconds) + } +} + +/// retries passed operation until one of the following conditions are met: +/// Encountered error is considered as permanent (non-retryable) +/// Retries have been exhausted. +/// `is_permanent` closure should be used to provide distinction between permanent/non-permanent errors +/// When attempts cross `warn_threshold` function starts to emit log warnings. +/// `description` argument is added to log messages. Its value should identify the `op` is doing +pub async fn retry( + mut op: O, + is_permanent: impl Fn(&E) -> bool, + warn_threshold: u32, + max_retries: u32, + description: &str, +) -> Result +where + // Not std::error::Error because anyhow::Error doesnt implement it. + // For context see https://github.com/dtolnay/anyhow/issues/63 + E: Display + Debug, + O: FnMut() -> F, + F: Future>, +{ + let mut attempts = 0; + loop { + let result = op().await; + match result { + Ok(_) => { + if attempts > 0 { + tracing::info!("{description} succeeded after {attempts} retries"); + } + return result; + } + + // These are "permanent" errors that should not be retried. + Err(ref e) if is_permanent(e) => { + return result; + } + // Assume that any other failure might be transient, and the operation might + // succeed if we just keep trying. + Err(err) if attempts < warn_threshold => { + tracing::info!("{description} failed, will retry (attempt {attempts}): {err:#}"); + } + Err(err) if attempts < max_retries => { + tracing::warn!("{description} failed, will retry (attempt {attempts}): {err:#}"); + } + Err(ref err) => { + // Operation failed `max_attempts` times. Time to give up. 
+ tracing::warn!( + "{description} still failed after {attempts} retries, giving up: {err:?}" + ); + return result; + } + } + // sleep and retry + exponential_backoff( + attempts, + DEFAULT_BASE_BACKOFF_SECONDS, + DEFAULT_MAX_BACKOFF_SECONDS, + ) + .await; + attempts += 1; + } +} + +#[cfg(test)] +mod tests { + use std::io; + + use tokio::sync::Mutex; + + use super::*; + + #[test] + fn backoff_defaults_produce_growing_backoff_sequence() { + let mut current_backoff_value = None; + + for i in 0..10_000 { + let new_backoff_value = exponential_backoff_duration_seconds( + i, + DEFAULT_BASE_BACKOFF_SECONDS, + DEFAULT_MAX_BACKOFF_SECONDS, + ); + + if let Some(old_backoff_value) = current_backoff_value.replace(new_backoff_value) { + assert!( + old_backoff_value <= new_backoff_value, + "{i}th backoff value {new_backoff_value} is smaller than the previous one {old_backoff_value}" + ) + } + } + + assert_eq!( + current_backoff_value.expect("Should have produced backoff values to compare"), + DEFAULT_MAX_BACKOFF_SECONDS, + "Given big enough of retries, backoff should reach its allowed max value" + ); + } + + #[tokio::test(start_paused = true)] + async fn retry_always_error() { + let count = Mutex::new(0); + let err_result = retry( + || async { + *count.lock().await += 1; + Result::<(), io::Error>::Err(io::Error::from(io::ErrorKind::Other)) + }, + |_e| false, + 1, + 1, + "work", + ) + .await; + + assert!(err_result.is_err()); + + assert_eq!(*count.lock().await, 2); + } + + #[tokio::test(start_paused = true)] + async fn retry_ok_after_err() { + let count = Mutex::new(0); + retry( + || async { + let mut locked = count.lock().await; + if *locked > 1 { + Ok(()) + } else { + *locked += 1; + Err(io::Error::from(io::ErrorKind::Other)) + } + }, + |_e| false, + 2, + 2, + "work", + ) + .await + .unwrap(); + } + + #[tokio::test(start_paused = true)] + async fn dont_retry_permanent_errors() { + let count = Mutex::new(0); + let _ = retry( + || async { + let mut locked = count.lock().await; + if *locked > 1 { + Ok(()) + } else { + *locked += 1; + Err(io::Error::from(io::ErrorKind::Other)) + } + }, + |_e| true, + 2, + 2, + "work", + ) + .await + .unwrap_err(); + + assert_eq!(*count.lock().await, 1); + } +} diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index b591cc611aa52..5c8e01f41a513 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -1,6 +1,8 @@ //! `utils` is intended to be a place to put code that is shared //! between other crates in this repository. 
+pub mod backoff; + /// `Lsn` type implements common tasks on Log Sequence Numbers pub mod lsn; /// SeqWait allows waiting for a future sequence number to arrive diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index b918dff193d0d..4ff8ffcc4fc8b 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -95,28 +95,6 @@ pub async fn shutdown_pageserver(exit_code: i32) { std::process::exit(exit_code); } -const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1; -const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0; - -async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) { - let backoff_duration_seconds = - exponential_backoff_duration_seconds(n, base_increment, max_seconds); - if backoff_duration_seconds > 0.0 { - info!( - "Backoff: waiting {backoff_duration_seconds} seconds before processing with the task", - ); - tokio::time::sleep(std::time::Duration::from_secs_f64(backoff_duration_seconds)).await; - } -} - -pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 { - if n == 0 { - 0.0 - } else { - (1.0 + base_increment).powf(f64::from(n)).min(max_seconds) - } -} - /// The name of the metadata file pageserver creates per timeline. /// Full path: `tenants//timelines//metadata`. pub const METADATA_FILE_NAME: &str = "metadata"; @@ -238,37 +216,6 @@ async fn timed( } } -#[cfg(test)] -mod backoff_defaults_tests { - use super::*; - - #[test] - fn backoff_defaults_produce_growing_backoff_sequence() { - let mut current_backoff_value = None; - - for i in 0..10_000 { - let new_backoff_value = exponential_backoff_duration_seconds( - i, - DEFAULT_BASE_BACKOFF_SECONDS, - DEFAULT_MAX_BACKOFF_SECONDS, - ); - - if let Some(old_backoff_value) = current_backoff_value.replace(new_backoff_value) { - assert!( - old_backoff_value <= new_backoff_value, - "{i}th backoff value {new_backoff_value} is smaller than the previous one {old_backoff_value}" - ) - } - } - - assert_eq!( - current_backoff_value.expect("Should have produced backoff values to compare"), - DEFAULT_MAX_BACKOFF_SECONDS, - "Given big enough of retries, backoff should reach its allowed max value" - ); - } -} - #[cfg(test)] mod timed_tests { use super::timed; diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index bdeb117c7953d..38fc31f69c1cc 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -10,7 +10,7 @@ use tokio::sync::OwnedMutexGuard; use tracing::{error, info, instrument, warn, Instrument, Span}; use utils::{ - completion, crashsafe, fs_ext, + backoff, completion, crashsafe, fs_ext, id::{TenantId, TimelineId}, }; @@ -23,12 +23,13 @@ use crate::{ use super::{ mgr::{GetTenantError, TenantsMap}, + remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD}, span, timeline::delete::DeleteTimelineFlow, tree_sort_timelines, DeleteTimelineError, Tenant, }; -const SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS: u8 = 3; +const SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS: u32 = 3; #[derive(Debug, thiserror::Error)] pub enum DeleteTenantError { @@ -71,10 +72,19 @@ async fn create_remote_delete_mark( let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_id)?; let data: &[u8] = &[]; - remote_storage - .upload(data, 0, &remote_mark_path, None) - .await - .context("mark upload")?; + backoff::retry( + || async { + remote_storage + .upload(data, 0, &remote_mark_path, None) + .await + }, + |_e| false, + FAILED_UPLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, + "mark_upload", + ) + .await + 
.context("mark_upload")?; Ok(()) } @@ -154,9 +164,16 @@ async fn remove_tenant_remote_delete_mark( tenant_id: &TenantId, ) -> Result<(), DeleteTenantError> { if let Some(remote_storage) = remote_storage { - remote_storage - .delete(&remote_tenant_delete_mark_path(conf, tenant_id)?) - .await?; + let path = remote_tenant_delete_mark_path(conf, tenant_id)?; + backoff::retry( + || async { remote_storage.delete(&path).await }, + |_e| false, + FAILED_UPLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, + "remove_tenant_remote_delete_mark", + ) + .await + .context("remove_tenant_remote_delete_mark")?; } Ok(()) } @@ -337,32 +354,28 @@ impl DeleteTenantFlow { return Ok(acquire(tenant)); } + let remote_storage = match remote_storage { + Some(remote_storage) => remote_storage, + None => return Ok(None), + }; + // If remote storage is there we rely on it - if let Some(remote_storage) = remote_storage { - let remote_mark_path = remote_tenant_delete_mark_path(conf, &tenant_id)?; - - let attempt = 1; - loop { - match remote_storage.download(&remote_mark_path).await { - Ok(_) => return Ok(acquire(tenant)), - Err(e) => { - if matches!(e, DownloadError::NotFound) { - return Ok(None); - } - if attempt > SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS { - return Err(anyhow::anyhow!(e))?; - } - - warn!( - "failed to fetch tenant deletion mark at {} attempt {}", - &remote_mark_path, attempt - ) - } - } - } - } + let remote_mark_path = remote_tenant_delete_mark_path(conf, &tenant_id)?; + + let result = backoff::retry( + || async { remote_storage.download(&remote_mark_path).await }, + |e| matches!(e, DownloadError::NotFound), + SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS, + SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS, + "fetch_tenant_deletion_mark", + ) + .await; - Ok(None) + match result { + Ok(_) => Ok(acquire(tenant)), + Err(DownloadError::NotFound) => Ok(None), + Err(e) => Err(anyhow::anyhow!(e)).context("should_resume_deletion")?, + } } pub(crate) async fn resume( diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 8d002a8570617..1d04d748396b1 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -211,6 +211,9 @@ use chrono::{NaiveDateTime, Utc}; // re-export these pub use download::{is_temp_download_file, list_remote_timelines}; use scopeguard::ScopeGuard; +use utils::backoff::{ + self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, +}; use std::collections::{HashMap, VecDeque}; use std::path::Path; @@ -241,7 +244,6 @@ use crate::{ tenant::upload_queue::{ UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, UploadTask, }, - {exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}, }; use utils::id::{TenantId, TimelineId}; @@ -256,12 +258,12 @@ use super::upload_queue::SetDeletedFlagProgress; // But after FAILED_DOWNLOAD_WARN_THRESHOLD retries, we start to log it at WARN // level instead, as repeated failures can mean a more serious problem. If it // fails more than FAILED_DOWNLOAD_RETRIES times, we give up -const FAILED_DOWNLOAD_WARN_THRESHOLD: u32 = 3; -const FAILED_DOWNLOAD_RETRIES: u32 = 10; +pub(crate) const FAILED_DOWNLOAD_WARN_THRESHOLD: u32 = 3; +pub(crate) const FAILED_REMOTE_OP_RETRIES: u32 = 10; // Similarly log failed uploads and deletions at WARN level, after this many // retries. Uploads and deletions are retried forever, though. 
-const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3; +pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3; pub enum MaybeDeletedIndexPart { IndexPart(IndexPart), @@ -752,12 +754,24 @@ impl RemoteTimelineClient { pausable_failpoint!("persist_deleted_index_part"); - upload::upload_index_part( - self.conf, - &self.storage_impl, - &self.tenant_id, - &self.timeline_id, - &index_part_with_deleted_at, + backoff::retry( + || async { + upload::upload_index_part( + self.conf, + &self.storage_impl, + &self.tenant_id, + &self.timeline_id, + &index_part_with_deleted_at, + ) + .await + }, + |_e| false, + 1, + // have just a couple of attempts + // when executed as part of timeline deletion this happens in context of api call + // when executed as part of tenant deletion this happens in the background + 2, + "persist_index_part_with_deleted_flag", ) .await?; @@ -834,10 +848,19 @@ impl RemoteTimelineClient { let timeline_path = self.conf.timeline_path(&self.tenant_id, &self.timeline_id); let timeline_storage_path = self.conf.remote_path(&timeline_path)?; - let remaining = self - .storage_impl - .list_prefixes(Some(&timeline_storage_path)) - .await?; + let remaining = backoff::retry( + || async { + self.storage_impl + .list_prefixes(Some(&timeline_storage_path)) + .await + }, + |_e| false, + FAILED_DOWNLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, + "list_prefixes", + ) + .await + .context("list prefixes")?; let remaining: Vec = remaining .into_iter() @@ -852,7 +875,15 @@ impl RemoteTimelineClient { .collect(); if !remaining.is_empty() { - self.storage_impl.delete_objects(&remaining).await?; + backoff::retry( + || async { self.storage_impl.delete_objects(&remaining).await }, + |_e| false, + FAILED_UPLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, + "delete_objects", + ) + .await + .context("delete_objects")?; } fail::fail_point!("timeline-delete-before-index-delete", |_| { @@ -864,7 +895,16 @@ impl RemoteTimelineClient { let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME)); debug!("deleting index part"); - self.storage_impl.delete(&index_file_path).await?; + + backoff::retry( + || async { self.storage_impl.delete(&index_file_path).await }, + |_e| false, + FAILED_UPLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, + "delete_index", + ) + .await + .context("delete_index")?; fail::fail_point!("timeline-delete-after-index-delete", |_| { Err(anyhow::anyhow!( diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 64f4a0a113675..0a6fd03887405 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -11,19 +11,17 @@ use std::time::Duration; use anyhow::{anyhow, Context}; use tokio::fs; use tokio::io::AsyncWriteExt; - -use tracing::{info, warn}; +use utils::backoff; use crate::config::PageServerConf; use crate::tenant::storage_layer::LayerFileName; use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id; -use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}; use remote_storage::{DownloadError, GenericRemoteStorage}; use utils::crashsafe::path_with_suffix_extension; use utils::id::{TenantId, TimelineId}; use super::index::{IndexPart, LayerFileMetadata}; -use super::{FAILED_DOWNLOAD_RETRIES, FAILED_DOWNLOAD_WARN_THRESHOLD}; +use super::{FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES}; async fn fsync_path(path: impl AsRef) -> Result<(), std::io::Error> { 
fs::File::open(path).await?.sync_all().await @@ -268,7 +266,6 @@ pub(super) async fn download_index_part( Ok(index_part) } -/// /// Helper function to handle retries for a download operation. /// /// Remote operations can fail due to rate limits (IAM, S3), spurious network @@ -276,47 +273,17 @@ pub(super) async fn download_index_part( /// with backoff. /// /// (See similar logic for uploads in `perform_upload_task`) -async fn download_retry(mut op: O, description: &str) -> Result +async fn download_retry(op: O, description: &str) -> Result where O: FnMut() -> F, F: Future>, { - let mut attempts = 0; - loop { - let result = op().await; - match result { - Ok(_) => { - if attempts > 0 { - info!("{description} succeeded after {attempts} retries"); - } - return result; - } - - // These are "permanent" errors that should not be retried. - Err(DownloadError::BadInput(_)) | Err(DownloadError::NotFound) => { - return result; - } - // Assume that any other failure might be transient, and the operation might - // succeed if we just keep trying. - Err(DownloadError::Other(err)) if attempts < FAILED_DOWNLOAD_WARN_THRESHOLD => { - info!("{description} failed, will retry (attempt {attempts}): {err:#}"); - } - Err(DownloadError::Other(err)) if attempts < FAILED_DOWNLOAD_RETRIES => { - warn!("{description} failed, will retry (attempt {attempts}): {err:#}"); - } - Err(DownloadError::Other(ref err)) => { - // Operation failed FAILED_DOWNLOAD_RETRIES times. Time to give up. - warn!("{description} still failed after {attempts} retries, giving up: {err:?}"); - return result; - } - } - // sleep and retry - exponential_backoff( - attempts, - DEFAULT_BASE_BACKOFF_SECONDS, - DEFAULT_MAX_BACKOFF_SECONDS, - ) - .await; - attempts += 1; - } + backoff::retry( + op, + |e| matches!(e, DownloadError::BadInput(_) | DownloadError::NotFound), + FAILED_DOWNLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, + description, + ) + .await } diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index e96aa41da0b79..d52b29f1756ef 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -31,8 +31,10 @@ use storage_broker::Streaming; use tokio::select; use tracing::*; -use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}; use postgres_connection::{parse_host_port, PgConnectionConfig}; +use utils::backoff::{ + exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, +}; use utils::{ id::{NodeId, TenantTimelineId}, lsn::Lsn, diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index f5859550accae..6032ff5b6810f 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -284,4 +284,4 @@ def tenant_delete_wait_completed( def poll_for_remote_storage_iterations(remote_storage_kind: RemoteStorageKind) -> int: - return 20 if remote_storage_kind is RemoteStorageKind.REAL_S3 else 6 + return 20 if remote_storage_kind is RemoteStorageKind.REAL_S3 else 8 diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index bfe9046ecabfc..d642a6d19093c 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -173,9 +173,7 @@ def test_remote_storage_backup_and_restore( # # The initiated attach operation should survive the restart, and 
continue from where it was. env.pageserver.stop() - layer_download_failed_regex = ( - r"download.*[0-9A-F]+-[0-9A-F]+.*open a download stream for layer.*simulated failure" - ) + layer_download_failed_regex = r"Failed to download a remote file: simulated failure of remote operation Download.*[0-9A-F]+-[0-9A-F]+" assert not env.pageserver.log_contains( layer_download_failed_regex ), "we shouldn't have tried any layer downloads yet since list remote timelines has a failpoint" @@ -208,7 +206,7 @@ def test_remote_storage_backup_and_restore( == f"{data}|{checkpoint_number}" ) - log.info("ensure that we neede to retry downloads due to test_remote_failures=1") + log.info("ensure that we needed to retry downloads due to test_remote_failures=1") assert env.pageserver.log_contains(layer_download_failed_regex) diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index 06189053316d8..f0f6cc743c180 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -32,6 +32,8 @@ def test_tenant_delete_smoke( remote_storage_kind: RemoteStorageKind, pg_bin: PgBin, ): + neon_env_builder.pageserver_config_override = "test_remote_failures=1" + neon_env_builder.enable_remote_storage( remote_storage_kind=remote_storage_kind, test_name="test_tenant_delete_smoke", @@ -123,25 +125,35 @@ def combinations(): for remote_storage_kind in remotes: for delete_failpoint in FAILPOINTS: - if remote_storage_kind == RemoteStorageKind.NOOP and delete_failpoint in ( + if remote_storage_kind is RemoteStorageKind.NOOP and delete_failpoint in ( "timeline-delete-before-index-delete", ): # the above failpoint are not relevant for config without remote storage continue - result.append((remote_storage_kind, delete_failpoint)) + # Simulate failures for only one type of remote storage + # to avoid log pollution and make tests run faster + if remote_storage_kind is RemoteStorageKind.MOCK_S3: + simulate_failures = True + else: + simulate_failures = False + result.append((remote_storage_kind, delete_failpoint, simulate_failures)) return result -@pytest.mark.parametrize("remote_storage_kind, failpoint", combinations()) +@pytest.mark.parametrize("remote_storage_kind, failpoint, simulate_failures", combinations()) @pytest.mark.parametrize("check", list(Check)) def test_delete_tenant_exercise_crash_safety_failpoints( neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind, failpoint: str, + simulate_failures: bool, check: Check, pg_bin: PgBin, ): + if simulate_failures: + neon_env_builder.pageserver_config_override = "test_remote_failures=1" + neon_env_builder.enable_remote_storage( remote_storage_kind, "test_delete_tenant_exercise_crash_safety_failpoints" ) From 1ca08cc52376845e8c908b896e7c9350ddcf1797 Mon Sep 17 00:00:00 2001 From: George MacKerron Date: Mon, 14 Aug 2023 16:07:33 +0100 Subject: [PATCH 20/22] Changed batch query body to from [...] to { queries: [...] } (#4975) ## Problem It's nice if `single query : single response :: batch query : batch response`. But at present, in the single case we send `{ query: '', params: [] }` and get back a single `{ rows: [], ... }` object, while in the batch case we send an array of `{ query: '', params: [] }` objects and get back not an array of `{ rows: [], ... }` objects but a `{ results: [ { rows: [] , ... }, { rows: [] , ... }, ... ] }` object instead. ## Summary of changes With this change, the batch query body becomes `{ queries: [{ query: '', params: [] }, ... 
] }`, which restores a consistent relationship between the request and response bodies. --- proxy/src/http/sql_over_http.rs | 11 ++++++++--- test_runner/regress/test_proxy.py | 4 +++- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/proxy/src/http/sql_over_http.rs b/proxy/src/http/sql_over_http.rs index 736413c85c3b1..aa06a91bbae9d 100644 --- a/proxy/src/http/sql_over_http.rs +++ b/proxy/src/http/sql_over_http.rs @@ -27,11 +27,16 @@ struct QueryData { params: Vec, } +#[derive(serde::Deserialize)] +struct BatchQueryData { + queries: Vec, +} + #[derive(serde::Deserialize)] #[serde(untagged)] enum Payload { Single(QueryData), - Batch(Vec), + Batch(BatchQueryData), } pub const MAX_RESPONSE_SIZE: usize = 10 * 1024 * 1024; // 10 MB @@ -233,7 +238,7 @@ pub async fn handle( Payload::Single(query) => query_to_json(&client, query, raw_output, array_mode) .await .map(|x| (x, HashMap::default())), - Payload::Batch(queries) => { + Payload::Batch(batch_query) => { let mut results = Vec::new(); let mut builder = client.build_transaction(); if let Some(isolation_level) = txn_isolation_level { @@ -243,7 +248,7 @@ pub async fn handle( builder = builder.read_only(true); } let transaction = builder.start().await?; - for query in queries { + for query in batch_query.queries { let result = query_to_json(&transaction, query, raw_output, array_mode).await; match result { Ok(r) => results.push(r), diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index 35334ec7b245b..fabec6b5bc3e0 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -269,7 +269,9 @@ def qq(queries: List[Tuple[str, Optional[List[Any]]]], read_only: bool = False) connstr = f"postgresql://http:http@{static_proxy.domain}:{static_proxy.proxy_port}/postgres" response = requests.post( f"https://{static_proxy.domain}:{static_proxy.external_http_port}/sql", - data=json.dumps(list(map(lambda x: {"query": x[0], "params": x[1] or []}, queries))), + data=json.dumps( + {"queries": list(map(lambda x: {"query": x[0], "params": x[1] or []}, queries))} + ), headers={ "Content-Type": "application/sql", "Neon-Connection-String": connstr, From ef4a76c01e7d0925ea6c510b6cb7e15715303751 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 14 Aug 2023 10:19:45 -0500 Subject: [PATCH 21/22] Update Postgres to v15.4 and v14.9 (#4965) --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/revisions.json | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 28bf5ccfa2fda..5d5cfee12783f 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 28bf5ccfa2fda9677566a25abd450e714d9ed055 +Subproject commit 5d5cfee12783f0989a9c9fe13bb40b5585812568 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 553f2d3618a6d..026d6b093d49e 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 553f2d3618a6d4893bde67f1c065926ee8a3a118 +Subproject commit 026d6b093d49e25cec44dd04598152329ceac027 diff --git a/vendor/revisions.json b/vendor/revisions.json index 80d161938effe..63b72cf5063f0 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,4 +1,4 @@ { - "postgres-v15": "553f2d3618a6d4893bde67f1c065926ee8a3a118", - "postgres-v14": "28bf5ccfa2fda9677566a25abd450e714d9ed055" + "postgres-v15": "026d6b093d49e25cec44dd04598152329ceac027", + "postgres-v14": "5d5cfee12783f0989a9c9fe13bb40b5585812568" } From ce7efbe48a4dbffdd34cb0aeee03dcd367707301 Mon Sep 
17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 14 Aug 2023 17:20:37 +0200 Subject: [PATCH 22/22] Turn BlockCursor::{read_blob,read_blob_into_buf} async fn (#4905) ## Problem The `BlockCursor::read_blob` and `BlockCursor::read_blob_into_buf` functions are calling `read_blk` internally, so if we want to make that function async fn, they need to be async themselves. ## Summary of changes * We first turn `ValueRef::load` into an async fn. * Then, we switch the `RwLock` implementation in `InMemoryLayer` to use the one from `tokio`. * Last, we convert the `read_blob` and `read_blob_into_buf` functions into async fn. In three instances we use `Handle::block_on`: * one use is in compaction code, which currently isn't async. We put the entire loop into an `async` block to prevent the potentially hot loop from doing cross-thread operations. * one use is in dumping code for `DeltaLayer`. The "proper" way to address this would be to enable the visit function to take async closures, but then we'd need to be generic over async vs non async, which [isn't supported by rust right now](https://blog.rust-lang.org/inside-rust/2022/07/27/keyword-generics.html). The other alternative would be to do a first pass where we cache the data into memory, and only then to dump it. * the third use is in writing code, inside a loop that copies from one file to another. It is synchronous and we'd like to keep it that way (for now?). Part of #4743 --- pageserver/ctl/src/layers.rs | 2 +- pageserver/src/tenant/blob_io.rs | 6 +- pageserver/src/tenant/ephemeral_file.rs | 23 ++- .../src/tenant/storage_layer/delta_layer.rs | 24 ++- .../src/tenant/storage_layer/image_layer.rs | 1 + .../tenant/storage_layer/inmemory_layer.rs | 28 +-- pageserver/src/tenant/timeline.rs | 189 ++++++++++-------- .../src/tenant/timeline/layer_manager.rs | 4 +- 8 files changed, 152 insertions(+), 125 deletions(-) diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs index e3ddefc661978..2d178a42bd2d5 100644 --- a/pageserver/ctl/src/layers.rs +++ b/pageserver/ctl/src/layers.rs @@ -72,7 +72,7 @@ async fn read_delta_file(path: impl AsRef) -> Result<()> { .await?; let cursor = BlockCursor::new(&file); for (k, v) in all { - let value = cursor.read_blob(v.pos())?; + let value = cursor.read_blob(v.pos()).await?; println!("key:{} value_len:{}", k, value.len()); } // TODO(chi): special handling for last key? diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index 7dd53407e73f9..fd211e8276606 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -21,14 +21,14 @@ where R: BlockReader, { /// Read a blob into a new buffer. - pub fn read_blob(&self, offset: u64) -> Result, std::io::Error> { + pub async fn read_blob(&self, offset: u64) -> Result, std::io::Error> { let mut buf = Vec::new(); - self.read_blob_into_buf(offset, &mut buf)?; + self.read_blob_into_buf(offset, &mut buf).await?; Ok(buf) } /// Read blob into the given buffer. Any previous contents in the buffer /// are overwritten.
- pub fn read_blob_into_buf( + pub async fn read_blob_into_buf( &self, offset: u64, dstbuf: &mut Vec, diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index b088a3b6028ee..38a491f931c6a 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -401,17 +401,26 @@ mod tests { Ok(()) } - #[test] - fn test_ephemeral_blobs() -> Result<(), io::Error> { + #[tokio::test] + async fn test_ephemeral_blobs() -> Result<(), io::Error> { let (conf, tenant_id, timeline_id) = harness("ephemeral_blobs")?; let mut file = EphemeralFile::create(conf, tenant_id, timeline_id)?; let pos_foo = file.write_blob(b"foo")?; - assert_eq!(b"foo", file.block_cursor().read_blob(pos_foo)?.as_slice()); + assert_eq!( + b"foo", + file.block_cursor().read_blob(pos_foo).await?.as_slice() + ); let pos_bar = file.write_blob(b"bar")?; - assert_eq!(b"foo", file.block_cursor().read_blob(pos_foo)?.as_slice()); - assert_eq!(b"bar", file.block_cursor().read_blob(pos_bar)?.as_slice()); + assert_eq!( + b"foo", + file.block_cursor().read_blob(pos_foo).await?.as_slice() + ); + assert_eq!( + b"bar", + file.block_cursor().read_blob(pos_bar).await?.as_slice() + ); let mut blobs = Vec::new(); for i in 0..10000 { @@ -428,7 +437,7 @@ mod tests { let cursor = BlockCursor::new(&file); for (pos, expected) in blobs { - let actual = cursor.read_blob(pos)?; + let actual = cursor.read_blob(pos).await?; assert_eq!(actual, expected); } @@ -437,7 +446,7 @@ mod tests { large_data.resize(20000, 0); thread_rng().fill_bytes(&mut large_data); let pos_large = file.write_blob(&large_data)?; - let result = file.block_cursor().read_blob(pos_large)?; + let result = file.block_cursor().read_blob(pos_large).await?; assert_eq!(result, large_data); Ok(()) diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 9a7108c4c8653..a0562ddc10898 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -51,6 +51,7 @@ use std::ops::Range; use std::os::unix::fs::FileExt; use std::path::{Path, PathBuf}; use std::sync::Arc; +use tokio::runtime::Handle; use tokio::sync::OnceCell; use tracing::*; @@ -280,7 +281,8 @@ impl Layer for DeltaLayer { // A subroutine to dump a single blob let dump_blob = |blob_ref: BlobRef| -> anyhow::Result { - let buf = cursor.read_blob(blob_ref.pos())?; + // TODO this is not ideal, but on the other hand we are in dumping code... 
+ let buf = Handle::current().block_on(cursor.read_blob(blob_ref.pos()))?; let val = Value::des(&buf)?; let desc = match val { Value::Image(img) => { @@ -335,7 +337,6 @@ impl Layer for DeltaLayer { let inner = self .load(LayerAccessKind::GetValueReconstructData, ctx) .await?; - inner .get_value_reconstruct_data(key, lsn_range, reconstruct_state) .await @@ -912,12 +913,15 @@ impl DeltaLayerInner { let cursor = file.block_cursor(); let mut buf = Vec::new(); for (entry_lsn, pos) in offsets { - cursor.read_blob_into_buf(pos, &mut buf).with_context(|| { - format!( - "Failed to read blob from virtual file {}", - file.file.path.display() - ) - })?; + cursor + .read_blob_into_buf(pos, &mut buf) + .await + .with_context(|| { + format!( + "Failed to read blob from virtual file {}", + file.file.path.display() + ) + })?; let val = Value::des(&buf).with_context(|| { format!( "Failed to deserialize file blob from virtual file {}", @@ -1026,9 +1030,9 @@ pub struct ValueRef> { impl> ValueRef { /// Loads the value from disk - pub fn load(&self) -> Result { + pub async fn load(&self) -> Result { // theoretically we *could* record an access time for each, but it does not really matter - let buf = self.reader.read_blob(self.blob_ref.pos())?; + let buf = self.reader.read_blob(self.blob_ref.pos()).await?; let val = Value::des(&buf)?; Ok(val) } diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 2d5ef402ae2dd..3fe600e5fffae 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -470,6 +470,7 @@ impl ImageLayerInner { let blob = file .block_cursor() .read_blob(offset) + .await .with_context(|| format!("failed to read value from offset {}", offset))?; let value = Bytes::from(blob); diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 5b825eee6cfac..aa9d0884e000c 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -28,7 +28,7 @@ use utils::{ // while being able to use std::fmt::Write's methods use std::fmt::Write as _; use std::ops::Range; -use std::sync::RwLock; +use tokio::sync::RwLock; use super::{DeltaLayer, DeltaLayerWriter, Layer}; @@ -125,7 +125,7 @@ impl Layer for InMemoryLayer { /// debugging function to print out the contents of the layer async fn dump(&self, verbose: bool, _ctx: &RequestContext) -> Result<()> { - let inner = self.inner.read().unwrap(); + let inner = self.inner.read().await; let end_str = self.end_lsn_or_max(); @@ -143,7 +143,7 @@ impl Layer for InMemoryLayer { for (key, vec_map) in inner.index.iter() { for (lsn, pos) in vec_map.as_slice() { let mut desc = String::new(); - cursor.read_blob_into_buf(*pos, &mut buf)?; + cursor.read_blob_into_buf(*pos, &mut buf).await?; let val = Value::des(&buf); match val { Ok(Value::Image(img)) => { @@ -181,7 +181,7 @@ impl Layer for InMemoryLayer { ensure!(lsn_range.start >= self.start_lsn); let mut need_image = true; - let inner = self.inner.read().unwrap(); + let inner = self.inner.read().await; let reader = inner.file.block_cursor(); @@ -189,7 +189,7 @@ impl Layer for InMemoryLayer { if let Some(vec_map) = inner.index.get(&key) { let slice = vec_map.slice_range(lsn_range); for (entry_lsn, pos) in slice.iter().rev() { - let buf = reader.read_blob(*pos)?; + let buf = reader.read_blob(*pos).await?; let value = Value::des(&buf)?; match value { Value::Image(img) => { 
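A note on the `RwLock` swap visible in these hunks: once `read_blob` is async, the guard on `inner` is held across an `.await`, which is why the patch moves `InMemoryLayer` from `std::sync::RwLock` to `tokio::sync::RwLock`. A minimal sketch of the pattern, using a hypothetical `Layer`/`Inner` pair rather than the real `InMemoryLayer`:

```rust
use tokio::sync::RwLock;

struct Inner {
    bytes: Vec<u8>, // stand-in for the real index + ephemeral file
}

struct Layer {
    inner: RwLock<Inner>,
}

// Stand-in for the now-async BlockCursor::read_blob.
async fn read_blob(buf: &[u8]) -> Vec<u8> {
    buf.to_vec()
}

impl Layer {
    // Holding the tokio guard across the .await below is fine; a
    // std::sync::RwLock guard held across an await point would make
    // this future !Send and could deadlock a single-threaded runtime.
    async fn dump(&self) -> usize {
        let inner = self.inner.read().await;
        read_blob(&inner.bytes).await.len()
    }
}

#[tokio::main]
async fn main() {
    let layer = Layer {
        inner: RwLock::new(Inner { bytes: b"foo".to_vec() }),
    };
    println!("dumped {} bytes", layer.dump().await);
}
```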
@@ -232,8 +232,8 @@ impl InMemoryLayer { /// /// Get layer size on the disk /// - pub fn size(&self) -> Result { - let inner = self.inner.read().unwrap(); + pub async fn size(&self) -> Result { + let inner = self.inner.read().await; Ok(inner.file.size) } @@ -267,9 +267,9 @@ impl InMemoryLayer { /// Common subroutine of the public put_wal_record() and put_page_image() functions. /// Adds the page version to the in-memory tree - pub fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> { + pub async fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> { trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn); - let mut inner = self.inner.write().unwrap(); + let mut inner = self.inner.write().await; self.assert_writable(); let off = { @@ -301,8 +301,8 @@ impl InMemoryLayer { /// Make the layer non-writeable. Only call once. /// Records the end_lsn for non-dropped layers. /// `end_lsn` is exclusive - pub fn freeze(&self, end_lsn: Lsn) { - let inner = self.inner.write().unwrap(); + pub async fn freeze(&self, end_lsn: Lsn) { + let inner = self.inner.write().await; assert!(self.start_lsn < end_lsn); self.end_lsn.set(end_lsn).expect("end_lsn set only once"); @@ -317,7 +317,7 @@ impl InMemoryLayer { /// Write this frozen in-memory layer to disk. /// /// Returns a new delta layer with all the same data as this in-memory layer - pub fn write_to_disk(&self) -> Result { + pub async fn write_to_disk(&self) -> Result { // Grab the lock in read-mode. We hold it over the I/O, but because this // layer is not writeable anymore, no one should be trying to acquire the // write lock on it, so we shouldn't block anyone. There's one exception @@ -327,7 +327,7 @@ impl InMemoryLayer { // lock, it will see that it's not writeable anymore and retry, but it // would have to wait until we release it. That race condition is very // rare though, so we just accept the potential latency hit for now. - let inner = self.inner.read().unwrap(); + let inner = self.inner.read().await; let end_lsn = *self.end_lsn.get().unwrap(); @@ -350,7 +350,7 @@ impl InMemoryLayer { let key = **key; // Write all page versions for (lsn, pos) in vec_map.as_slice() { - cursor.read_blob_into_buf(*pos, &mut buf)?; + cursor.read_blob_into_buf(*pos, &mut buf).await?; let will_init = Value::des(&buf)?.will_init(); delta_layer_writer.put_value_bytes(key, *lsn, &buf, will_init)?; } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 95e86c6bcc4c4..2deeacdc64e84 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -882,7 +882,7 @@ impl Timeline { let Some(open_layer) = layers.open_layer.as_ref() else { return Ok(()); }; - open_layer.size()? + open_layer.size().await? }; let last_freeze_at = self.last_freeze_at.load(); let last_freeze_ts = *(self.last_freeze_ts.read().unwrap()); @@ -2654,7 +2654,7 @@ impl Timeline { async fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> anyhow::Result<()> { //info!("PUT: key {} at {}", key, lsn); let layer = self.get_layer_for_write(lsn).await?; - layer.put_value(key, lsn, val)?; + layer.put_value(key, lsn, val).await?; Ok(()) } @@ -2680,7 +2680,9 @@ impl Timeline { Some(self.write_lock.lock().await) }; let mut guard = self.layers.write().await; - guard.try_freeze_in_memory_layer(self.get_last_record_lsn(), &self.last_freeze_at); + guard + .try_freeze_in_memory_layer(self.get_last_record_lsn(), &self.last_freeze_at) + .await; } /// Layer flusher task's main loop. 
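The hunks that follow bridge the remaining synchronous paths (flushing a frozen layer, compaction) into the async API with `tokio::runtime::Handle::current().block_on(..)` from inside `spawn_blocking`, so executor threads are not blocked on the I/O. A minimal standalone sketch of that bridging pattern, assuming a hypothetical `write_to_disk` function in place of the real `InMemoryLayer` method:

```rust
use tokio::runtime::Handle;

// Stand-in for the now-async InMemoryLayer::write_to_disk.
async fn write_to_disk(data: Vec<u8>) -> std::io::Result<usize> {
    tokio::fs::write("/tmp/example-layer", &data).await?; // illustrative path
    Ok(data.len())
}

#[tokio::main]
async fn main() -> std::io::Result<()> {
    let data = b"frozen layer contents".to_vec();
    // Hop onto a blocking-friendly thread, then re-enter the runtime for
    // the async call, mirroring the flush path in this patch.
    let written = tokio::task::spawn_blocking(move || {
        Handle::current().block_on(write_to_disk(data))
    })
    .await
    .expect("blocking task panicked")?;
    println!("wrote {written} bytes");
    Ok(())
}
```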
@@ -2962,7 +2964,11 @@ impl Timeline { let frozen_layer = Arc::clone(frozen_layer); move || { // Write it out - let new_delta = frozen_layer.write_to_disk()?; + // Keep this inside `spawn_blocking` and `Handle::current` + // as long as the write path is still sync and the read impl + // is still not fully async. Otherwise executor threads would + // be blocked. + let new_delta = Handle::current().block_on(frozen_layer.write_to_disk())?; let new_delta_path = new_delta.path(); // Sync it to disk. @@ -3653,98 +3659,105 @@ impl Timeline { let mut key_values_total_size = 0u64; let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key - for (key, lsn, value_ref) in all_values_iter { - let value = value_ref.load()?; - let same_key = prev_key.map_or(false, |prev_key| prev_key == key); - // We need to check key boundaries once we reach next key or end of layer with the same key - if !same_key || lsn == dup_end_lsn { - let mut next_key_size = 0u64; - let is_dup_layer = dup_end_lsn.is_valid(); - dup_start_lsn = Lsn::INVALID; - if !same_key { - dup_end_lsn = Lsn::INVALID; - } - // Determine size occupied by this key. We stop at next key or when size becomes larger than target_file_size - for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() { - next_key_size = next_size; - if key != next_key { - if dup_end_lsn.is_valid() { - // We are writting segment with duplicates: - // place all remaining values of this key in separate segment - dup_start_lsn = dup_end_lsn; // new segments starts where old stops - dup_end_lsn = lsn_range.end; // there are no more values of this key till end of LSN range + + // TODO remove this block_on wrapper once we fully go async + Handle::current().block_on(async { + for (key, lsn, value_ref) in all_values_iter { + let value = value_ref.load().await?; + let same_key = prev_key.map_or(false, |prev_key| prev_key == key); + // We need to check key boundaries once we reach next key or end of layer with the same key + if !same_key || lsn == dup_end_lsn { + let mut next_key_size = 0u64; + let is_dup_layer = dup_end_lsn.is_valid(); + dup_start_lsn = Lsn::INVALID; + if !same_key { + dup_end_lsn = Lsn::INVALID; + } + // Determine size occupied by this key. We stop at next key or when size becomes larger than target_file_size + for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() { + next_key_size = next_size; + if key != next_key { + if dup_end_lsn.is_valid() { + // We are writting segment with duplicates: + // place all remaining values of this key in separate segment + dup_start_lsn = dup_end_lsn; // new segments starts where old stops + dup_end_lsn = lsn_range.end; // there are no more values of this key till end of LSN range + } + break; + } + key_values_total_size += next_size; + // Check if it is time to split segment: if total keys size is larger than target file size. + // We need to avoid generation of empty segments if next_size > target_file_size. 
+ if key_values_total_size > target_file_size && lsn != next_lsn { + // Split key between multiple layers: such layer can contain only single key + dup_start_lsn = if dup_end_lsn.is_valid() { + dup_end_lsn // new segment with duplicates starts where old one stops + } else { + lsn // start with the first LSN for this key + }; + dup_end_lsn = next_lsn; // upper LSN boundary is exclusive + break; } - break; } - key_values_total_size += next_size; - // Check if it is time to split segment: if total keys size is larger than target file size. - // We need to avoid generation of empty segments if next_size > target_file_size. - if key_values_total_size > target_file_size && lsn != next_lsn { - // Split key between multiple layers: such layer can contain only single key - dup_start_lsn = if dup_end_lsn.is_valid() { - dup_end_lsn // new segment with duplicates starts where old one stops - } else { - lsn // start with the first LSN for this key - }; - dup_end_lsn = next_lsn; // upper LSN boundary is exclusive - break; + // handle case when loop reaches last key: in this case dup_end is non-zero but dup_start is not set. + if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() { + dup_start_lsn = dup_end_lsn; + dup_end_lsn = lsn_range.end; } - } - // handle case when loop reaches last key: in this case dup_end is non-zero but dup_start is not set. - if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() { - dup_start_lsn = dup_end_lsn; - dup_end_lsn = lsn_range.end; - } - if writer.is_some() { - let written_size = writer.as_mut().unwrap().size(); - let contains_hole = - next_hole < holes.len() && key >= holes[next_hole].key_range.end; - // check if key cause layer overflow or contains hole... - if is_dup_layer - || dup_end_lsn.is_valid() - || written_size + key_values_total_size > target_file_size - || contains_hole - { - // ... if so, flush previous layer and prepare to write new one - new_layers.push(Arc::new( - writer.take().unwrap().finish(prev_key.unwrap().next())?, - )); - writer = None; - - if contains_hole { - // skip hole - next_hole += 1; + if writer.is_some() { + let written_size = writer.as_mut().unwrap().size(); + let contains_hole = + next_hole < holes.len() && key >= holes[next_hole].key_range.end; + // check if key cause layer overflow or contains hole... + if is_dup_layer + || dup_end_lsn.is_valid() + || written_size + key_values_total_size > target_file_size + || contains_hole + { + // ... 
if so, flush previous layer and prepare to write new one + new_layers.push(Arc::new( + writer.take().unwrap().finish(prev_key.unwrap().next())?, + )); + writer = None; + + if contains_hole { + // skip hole + next_hole += 1; + } } } + // Remember size of key value because at next iteration we will access next item + key_values_total_size = next_key_size; + } + if writer.is_none() { + // Create writer if not initiaized yet + writer = Some(DeltaLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_id, + key, + if dup_end_lsn.is_valid() { + // this is a layer containing slice of values of the same key + debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn); + dup_start_lsn..dup_end_lsn + } else { + debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end); + lsn_range.clone() + }, + )?); } - // Remember size of key value because at next iteration we will access next item - key_values_total_size = next_key_size; - } - if writer.is_none() { - // Create writer if not initiaized yet - writer = Some(DeltaLayerWriter::new( - self.conf, - self.timeline_id, - self.tenant_id, - key, - if dup_end_lsn.is_valid() { - // this is a layer containing slice of values of the same key - debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn); - dup_start_lsn..dup_end_lsn - } else { - debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end); - lsn_range.clone() - }, - )?); - } - fail_point!("delta-layer-writer-fail-before-finish", |_| { - Err(anyhow::anyhow!("failpoint delta-layer-writer-fail-before-finish").into()) - }); + fail_point!("delta-layer-writer-fail-before-finish", |_| { + Result::<_>::Err(anyhow::anyhow!( + "failpoint delta-layer-writer-fail-before-finish" + )) + }); - writer.as_mut().unwrap().put_value(key, lsn, value)?; - prev_key = Some(key); - } + writer.as_mut().unwrap().put_value(key, lsn, value)?; + prev_key = Some(key); + } + Ok(()) + })?; if let Some(writer) = writer { new_layers.push(Arc::new(writer.finish(prev_key.unwrap().next())?)); } diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index 824d869bec275..40212265cf2d0 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -163,7 +163,7 @@ impl LayerManager { } /// Called from `freeze_inmem_layer`, returns true if successfully frozen. - pub fn try_freeze_in_memory_layer( + pub async fn try_freeze_in_memory_layer( &mut self, Lsn(last_record_lsn): Lsn, last_freeze_at: &AtomicLsn, @@ -173,7 +173,7 @@ impl LayerManager { if let Some(open_layer) = &self.layer_map.open_layer { let open_layer_rc = Arc::clone(open_layer); // Does this layer need freezing? - open_layer.freeze(end_lsn); + open_layer.freeze(end_lsn).await; // The layer is no longer open, update the layer map to reflect this. // We will replace it with on-disk historics below.
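Stepping back to the remote-storage download change earlier in this series: the hand-rolled loop it removes is, in essence, a generic "retry with exponential backoff unless the error is permanent" helper, which is the shape of the `backoff::retry` call that replaces it. A minimal sketch of such a combinator, with illustrative constants and a simplified error type rather than the real `utils::backoff` module:

```rust
use std::future::Future;
use std::time::Duration;

#[derive(Debug)]
enum DownloadError {
    BadInput(String),
    NotFound,
    Other(String),
}

// Illustrative limits; the real code has its own thresholds.
const WARN_THRESHOLD: u32 = 3;
const MAX_RETRIES: u32 = 10;

async fn retry<T, O, F>(
    mut op: O,
    is_permanent: impl Fn(&DownloadError) -> bool,
    description: &str,
) -> Result<T, DownloadError>
where
    O: FnMut() -> F,
    F: Future<Output = Result<T, DownloadError>>,
{
    let mut attempts = 0u32;
    loop {
        match op().await {
            Ok(v) => return Ok(v),
            // Permanent errors and an exhausted retry budget are returned as-is.
            Err(e) if is_permanent(&e) || attempts >= MAX_RETRIES => return Err(e),
            Err(e) => {
                if attempts >= WARN_THRESHOLD {
                    eprintln!("{description} failed (attempt {attempts}): {e:?}");
                }
            }
        }
        // Exponential backoff, capped so the delay stays bounded.
        tokio::time::sleep(Duration::from_millis(100 * 2u64.pow(attempts.min(6)))).await;
        attempts += 1;
    }
}

#[tokio::main]
async fn main() {
    let mut calls = 0;
    let result = retry(
        || {
            calls += 1;
            async move {
                if calls < 3 {
                    Err(DownloadError::Other("simulated failure".into()))
                } else {
                    Ok("index_part.json contents".to_string())
                }
            }
        },
        |e| matches!(e, DownloadError::BadInput(_) | DownloadError::NotFound),
        "download index part",
    )
    .await;
    println!("{result:?}");
}
```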