diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 69b5e2262008e..853f5f291910e 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -814,7 +814,7 @@ jobs: run: shell: sh -eu {0} env: - VM_BUILDER_VERSION: v0.15.0-alpha1 + VM_BUILDER_VERSION: v0.15.4 steps: - name: Checkout diff --git a/Dockerfile b/Dockerfile index 9467e41ae40f1..1c447b2db99a3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -51,6 +51,7 @@ RUN set -e \ --bin safekeeper \ --bin storage_broker \ --bin proxy \ + --bin neon_local \ --locked --release \ && cachepot -s @@ -76,6 +77,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin +COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/ COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/ diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs index da5d519ab51fd..1b5e4cf974f88 100644 --- a/compute_tools/src/extension_server.rs +++ b/compute_tools/src/extension_server.rs @@ -156,7 +156,7 @@ pub async fn get_available_extensions( let ext_index_full = serde_json::from_slice::(&ext_idx_buffer)?; let mut enabled_extensions = ext_index_full.public_extensions; enabled_extensions.extend_from_slice(custom_extensions); - let library_index = ext_index_full.library_index; + let mut library_index = ext_index_full.library_index; let all_extension_data = ext_index_full.extension_data; info!("library_index: {:?}", library_index); @@ -179,6 +179,8 @@ pub async fn get_available_extensions( file_create_tasks.push(tokio::fs::write(control_path, control_contents)); } else { warn!("control file {:?} exists both locally and remotely. 
ignoring the remote version.", control_file); + // also delete this from library index + library_index.retain(|_, value| value != extension_name); } } } diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index eff7c93b461af..74e7796be78d6 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -270,7 +270,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> { } RoleAction::Create => { let mut query: String = format!( - "CREATE ROLE {} CREATEROLE CREATEDB IN ROLE neon_superuser", + "CREATE ROLE {} CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser", name.pg_quote() ); info!("role create query: '{}'", &query); diff --git a/libs/utils/src/backoff.rs b/libs/utils/src/backoff.rs new file mode 100644 index 0000000000000..4507efe64701c --- /dev/null +++ b/libs/utils/src/backoff.rs @@ -0,0 +1,188 @@ +use std::fmt::{Debug, Display}; + +use futures::Future; + +pub const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1; +pub const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0; + +pub async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) { + let backoff_duration_seconds = + exponential_backoff_duration_seconds(n, base_increment, max_seconds); + if backoff_duration_seconds > 0.0 { + tracing::info!( + "Backoff: waiting {backoff_duration_seconds} seconds before proceeding with the task", + ); + tokio::time::sleep(std::time::Duration::from_secs_f64(backoff_duration_seconds)).await; + } +} + +pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 { + if n == 0 { + 0.0 + } else { + (1.0 + base_increment).powf(f64::from(n)).min(max_seconds) + } +} + +/// Retries the passed operation until one of the following conditions is met: +/// the encountered error is considered permanent (non-retryable), +/// or retries have been exhausted. +/// The `is_permanent` closure should be used to distinguish permanent errors from transient ones. +/// When attempts cross `warn_threshold`, the function starts to emit log warnings. +/// The `description` argument is added to log messages; its value should identify what the `op` is doing. +pub async fn retry<T, O, F, E>( + mut op: O, + is_permanent: impl Fn(&E) -> bool, + warn_threshold: u32, + max_retries: u32, + description: &str, +) -> Result<T, E> +where + // Not std::error::Error because anyhow::Error doesn't implement it. + // For context see https://github.com/dtolnay/anyhow/issues/63 + E: Display + Debug, + O: FnMut() -> F, + F: Future<Output = Result<T, E>>, +{ + let mut attempts = 0; + loop { + let result = op().await; + match result { + Ok(_) => { + if attempts > 0 { + tracing::info!("{description} succeeded after {attempts} retries"); + } + return result; + } + + // These are "permanent" errors that should not be retried. + Err(ref e) if is_permanent(e) => { + return result; + } + // Assume that any other failure might be transient, and the operation might + // succeed if we just keep trying. + Err(err) if attempts < warn_threshold => { + tracing::info!("{description} failed, will retry (attempt {attempts}): {err:#}"); + } + Err(err) if attempts < max_retries => { + tracing::warn!("{description} failed, will retry (attempt {attempts}): {err:#}"); + } + Err(ref err) => { + // Operation failed `max_retries` times. Time to give up.
+ tracing::warn!( + "{description} still failed after {attempts} retries, giving up: {err:?}" + ); + return result; + } + } + // sleep and retry + exponential_backoff( + attempts, + DEFAULT_BASE_BACKOFF_SECONDS, + DEFAULT_MAX_BACKOFF_SECONDS, + ) + .await; + attempts += 1; + } +} + +#[cfg(test)] +mod tests { + use std::io; + + use tokio::sync::Mutex; + + use super::*; + + #[test] + fn backoff_defaults_produce_growing_backoff_sequence() { + let mut current_backoff_value = None; + + for i in 0..10_000 { + let new_backoff_value = exponential_backoff_duration_seconds( + i, + DEFAULT_BASE_BACKOFF_SECONDS, + DEFAULT_MAX_BACKOFF_SECONDS, + ); + + if let Some(old_backoff_value) = current_backoff_value.replace(new_backoff_value) { + assert!( + old_backoff_value <= new_backoff_value, + "{i}th backoff value {new_backoff_value} is smaller than the previous one {old_backoff_value}" + ) + } + } + + assert_eq!( + current_backoff_value.expect("Should have produced backoff values to compare"), + DEFAULT_MAX_BACKOFF_SECONDS, + "Given big enough of retries, backoff should reach its allowed max value" + ); + } + + #[tokio::test(start_paused = true)] + async fn retry_always_error() { + let count = Mutex::new(0); + let err_result = retry( + || async { + *count.lock().await += 1; + Result::<(), io::Error>::Err(io::Error::from(io::ErrorKind::Other)) + }, + |_e| false, + 1, + 1, + "work", + ) + .await; + + assert!(err_result.is_err()); + + assert_eq!(*count.lock().await, 2); + } + + #[tokio::test(start_paused = true)] + async fn retry_ok_after_err() { + let count = Mutex::new(0); + retry( + || async { + let mut locked = count.lock().await; + if *locked > 1 { + Ok(()) + } else { + *locked += 1; + Err(io::Error::from(io::ErrorKind::Other)) + } + }, + |_e| false, + 2, + 2, + "work", + ) + .await + .unwrap(); + } + + #[tokio::test(start_paused = true)] + async fn dont_retry_permanent_errors() { + let count = Mutex::new(0); + let _ = retry( + || async { + let mut locked = count.lock().await; + if *locked > 1 { + Ok(()) + } else { + *locked += 1; + Err(io::Error::from(io::ErrorKind::Other)) + } + }, + |_e| true, + 2, + 2, + "work", + ) + .await + .unwrap_err(); + + assert_eq!(*count.lock().await, 1); + } +} diff --git a/libs/utils/src/fs_ext.rs b/libs/utils/src/fs_ext.rs index 090912d2765da..dfb7d5abbfa48 100644 --- a/libs/utils/src/fs_ext.rs +++ b/libs/utils/src/fs_ext.rs @@ -24,6 +24,20 @@ pub async fn is_directory_empty(path: impl AsRef<Path>) -> anyhow::Result<bool> Ok(dir.next_entry().await?.is_none()) } +pub async fn list_dir(path: impl AsRef<Path>) -> anyhow::Result<Vec<String>> { + let mut dir = tokio::fs::read_dir(&path) + .await + .context(format!("read_dir({})", path.as_ref().display()))?; + + let mut content = vec![]; + while let Some(next) = dir.next_entry().await?
{ + let file_name = next.file_name(); + content.push(file_name.to_string_lossy().to_string()); + } + + Ok(content) +} + pub fn ignore_not_found(e: io::Error) -> io::Result<()> { if e.kind() == io::ErrorKind::NotFound { Ok(()) @@ -43,7 +57,7 @@ where mod test { use std::path::PathBuf; - use crate::fs_ext::is_directory_empty; + use crate::fs_ext::{is_directory_empty, list_dir}; use super::ignore_absent_files; @@ -109,4 +123,25 @@ mod test { assert!(!file_path.exists()); } + + #[tokio::test] + async fn list_dir_works() { + let dir = tempfile::tempdir().unwrap(); + let dir_path = dir.path(); + + assert!(list_dir(dir_path).await.unwrap().is_empty()); + + let file_path: PathBuf = dir_path.join("testfile"); + let _ = std::fs::File::create(&file_path).unwrap(); + + assert_eq!(&list_dir(dir_path).await.unwrap(), &["testfile"]); + + let another_dir_path: PathBuf = dir_path.join("testdir"); + std::fs::create_dir(another_dir_path).unwrap(); + + let expected = &["testdir", "testfile"]; + let mut actual = list_dir(dir_path).await.unwrap(); + actual.sort(); + assert_eq!(actual, expected); + } } diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index b591cc611aa52..5c8e01f41a513 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -1,6 +1,8 @@ //! `utils` is intended to be a place to put code that is shared //! between other crates in this repository. +pub mod backoff; + /// `Lsn` type implements common tasks on Log Sequence Numbers pub mod lsn; /// SeqWait allows waiting for a future sequence number to arrive diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs index e3ddefc661978..2d178a42bd2d5 100644 --- a/pageserver/ctl/src/layers.rs +++ b/pageserver/ctl/src/layers.rs @@ -72,7 +72,7 @@ async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> { .await?; let cursor = BlockCursor::new(&file); for (k, v) in all { - let value = cursor.read_blob(v.pos())?; + let value = cursor.read_blob(v.pos()).await?; println!("key:{} value_len:{}", k, value.len()); } // TODO(chi): special handling for last key?
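
The `retry`/`exponential_backoff` pair moved into `libs/utils` above is now the shared retry primitive for the repository. With the default constants, the computed delay is 0s for attempt 0 and (1.0 + 0.1)^n seconds afterwards, capped at 3.0s, so the sequence is 0, 1.1, 1.21, 1.33, ..., 3.0. Below is a minimal sketch of how a caller might use it; `flaky_fetch` and the thresholds are hypothetical, not part of this diff:

use utils::backoff;

// Hypothetical flaky operation; stands in for a real network call.
async fn flaky_fetch() -> Result<Vec<u8>, std::io::Error> {
    Ok(vec![])
}

async fn fetch_with_retries() -> Result<Vec<u8>, std::io::Error> {
    backoff::retry(
        flaky_fetch, // op: re-invoked for every attempt (FnMut() -> Future)
        |e| e.kind() == std::io::ErrorKind::NotFound, // treat NotFound as permanent: fail immediately
        3,  // warn_threshold: attempts past this log at WARN instead of INFO
        10, // max_retries: give up and return the last error after this
        "fetch remote object",
    )
    .await
}
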
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index a86993d7f3907..e0c969279ee7d 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -373,7 +373,7 @@ fn start_pageserver( let order = pageserver::InitializationOrder { initial_tenant_load: Some(init_done_tx), initial_logical_size_can_start: init_done_rx.clone(), - initial_logical_size_attempt: init_logical_size_done_tx, + initial_logical_size_attempt: Some(init_logical_size_done_tx), background_jobs_can_start: background_jobs_barrier.clone(), }; diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index be806c77ec05e..f2aa2f365eb4c 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -31,7 +31,9 @@ use utils::{ use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig; use crate::tenant::config::TenantConf; use crate::tenant::config::TenantConfOpt; -use crate::tenant::{TENANT_ATTACHING_MARKER_FILENAME, TIMELINES_SEGMENT_NAME}; +use crate::tenant::{ + TENANT_ATTACHING_MARKER_FILENAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME, +}; use crate::{ IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX, @@ -613,6 +615,11 @@ impl PageServerConf { ) } + pub fn tenant_deleted_mark_file_path(&self, tenant_id: &TenantId) -> PathBuf { + self.tenant_path(tenant_id) + .join(TENANT_DELETED_MARKER_FILE_NAME) + } + pub fn traces_path(&self) -> PathBuf { self.workdir.join("traces") } diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs index a1a5c30ae9aeb..2953208d1e9f2 100644 --- a/pageserver/src/context.rs +++ b/pageserver/src/context.rs @@ -85,6 +85,7 @@ //! The solution is that all code paths are infected with precisely one //! [`RequestContext`] argument. Functions in the middle of the call chain //! only need to pass it on. + use crate::task_mgr::TaskKind; // The main structure of this module, see module-level comment. @@ -92,6 +93,7 @@ use crate::task_mgr::TaskKind; pub struct RequestContext { task_kind: TaskKind, download_behavior: DownloadBehavior, + access_stats_behavior: AccessStatsBehavior, } /// Desired behavior if the operation requires an on-demand download @@ -109,6 +111,67 @@ pub enum DownloadBehavior { Error, } +/// Whether this request should update access times used in LRU eviction +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +pub(crate) enum AccessStatsBehavior { + /// Update access times: this request's access to data should be taken + /// as a hint that the accessed layer is likely to be accessed again + Update, + + /// Do not update access times: this request is accessing the layer + /// but does not want to indicate that the layer should be retained in cache, + /// perhaps because the requestor is a compaction routine that will soon cover + /// this layer with another. + Skip, +} + +pub struct RequestContextBuilder { + inner: RequestContext, +} + +impl RequestContextBuilder { + /// A new builder with default settings + pub fn new(task_kind: TaskKind) -> Self { + Self { + inner: RequestContext { + task_kind, + download_behavior: DownloadBehavior::Download, + access_stats_behavior: AccessStatsBehavior::Update, + }, + } + } + + pub fn extend(original: &RequestContext) -> Self { + Self { + // This is like a Copy, but avoid implementing Copy because ordinary users of + // RequestContext should always move or ref it. 
+ inner: RequestContext { + task_kind: original.task_kind, + download_behavior: original.download_behavior, + access_stats_behavior: original.access_stats_behavior, + }, + } + } + + /// Configure the DownloadBehavior of the context: whether to + /// download missing layers, and/or warn on the download. + pub fn download_behavior(mut self, b: DownloadBehavior) -> Self { + self.inner.download_behavior = b; + self + } + + /// Configure the AccessStatsBehavior of the context: whether layer + /// accesses should update the access time of the layer. + pub(crate) fn access_stats_behavior(mut self, b: AccessStatsBehavior) -> Self { + self.inner.access_stats_behavior = b; + self + } + + pub fn build(self) -> RequestContext { + self.inner + } +} + impl RequestContext { /// Create a new RequestContext that has no parent. /// @@ -123,10 +186,9 @@ impl RequestContext { /// because someone explicitly canceled it. /// It has no parent, so it cannot inherit cancellation from there. pub fn new(task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self { - RequestContext { - task_kind, - download_behavior, - } + RequestContextBuilder::new(task_kind) + .download_behavior(download_behavior) + .build() } /// Create a detached child context for a task that may outlive `self`. @@ -187,10 +249,7 @@ impl RequestContext { } fn child_impl(&self, task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self { - RequestContext { - task_kind, - download_behavior, - } + Self::new(task_kind, download_behavior) } pub fn task_kind(&self) -> TaskKind { @@ -200,4 +259,8 @@ impl RequestContext { pub fn download_behavior(&self) -> DownloadBehavior { self.download_behavior } + + pub(crate) fn access_stats_behavior(&self) -> AccessStatsBehavior { + self.access_stats_behavior + } } diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 290492203a2b1..38e07f172d3d7 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -93,6 +93,47 @@ paths: application/json: schema: $ref: "#/components/schemas/Error" + delete: + description: | + Attempts to delete specified tenant. 500 and 409 errors should be retried until 404 is retrieved. 
+ 404 means that deletion finished successfully. + responses: + "400": + description: Error when no tenant id found in path + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + "401": + description: Unauthorized Error + content: + application/json: + schema: + $ref: "#/components/schemas/UnauthorizedError" + "403": + description: Forbidden Error + content: + application/json: + schema: + $ref: "#/components/schemas/ForbiddenError" + "404": + description: Tenant not found + content: + application/json: + schema: + $ref: "#/components/schemas/NotFoundError" + "409": + description: Deletion is already in progress, continue polling + content: + application/json: + schema: + $ref: "#/components/schemas/ConflictError" + "500": + description: Generic operation error + content: + application/json: + schema: + $ref: "#/components/schemas/Error" /v1/tenant/{tenant_id}/timeline: parameters: @@ -820,6 +861,7 @@ paths: application/json: schema: $ref: "#/components/schemas/Error" + /v1/tenant/config: put: description: | diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index c10223beedefa..1e8dada85eb10 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -187,7 +187,7 @@ impl From<DeleteTimelineError> for ApiError { format!("Cannot delete timeline which has child timelines: {children:?}") .into_boxed_str(), ), - a @ AlreadyInProgress => ApiError::Conflict(a.to_string()), + a @ AlreadyInProgress(_) => ApiError::Conflict(a.to_string()), Other(e) => ApiError::InternalServerError(e), } } @@ -208,6 +208,19 @@ impl From for ApiError { } } +impl From<crate::tenant::delete::DeleteTenantError> for ApiError { + fn from(value: crate::tenant::delete::DeleteTenantError) -> Self { + use crate::tenant::delete::DeleteTenantError::*; + match value { + Get(g) => ApiError::from(g), + e @ AlreadyInProgress => ApiError::Conflict(e.to_string()), + Timeline(t) => ApiError::from(t), + Other(o) => ApiError::InternalServerError(o), + e @ InvalidState(_) => ApiError::PreconditionFailed(e.to_string().into_boxed_str()), + } + } +} + // Helper function to construct a TimelineInfo struct for a timeline async fn build_timeline_info( timeline: &Arc<Timeline>, @@ -617,6 +630,23 @@ async fn tenant_status( json_response(StatusCode::OK, tenant_info) } +async fn tenant_delete_handler( + request: Request<Body>, + _cancel: CancellationToken, +) -> Result<Response<Body>, ApiError> { + // TODO openapi spec + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + check_permission(&request, Some(tenant_id))?; + + let state = get_state(&request); + + mgr::delete_tenant(state.conf, state.remote_storage.clone(), tenant_id) + .instrument(info_span!("tenant_delete_handler", %tenant_id)) + .await?; + + json_response(StatusCode::ACCEPTED, ()) +} + /// HTTP endpoint to query the current tenant_size of a tenant.
/// /// This is not used by consumption metrics under [`crate::consumption_metrics`], but can be used @@ -1345,6 +1375,9 @@ pub fn make_router( .get("/v1/tenant", |r| api_handler(r, tenant_list_handler)) .post("/v1/tenant", |r| api_handler(r, tenant_create_handler)) .get("/v1/tenant/:tenant_id", |r| api_handler(r, tenant_status)) + .delete("/v1/tenant/:tenant_id", |r| { + api_handler(r, tenant_delete_handler) + }) .get("/v1/tenant/:tenant_id/synthetic_size", |r| { api_handler(r, tenant_size_handler) }) diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index a3a9497d57f33..4ff8ffcc4fc8b 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -95,28 +95,6 @@ pub async fn shutdown_pageserver(exit_code: i32) { std::process::exit(exit_code); } -const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1; -const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0; - -async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) { - let backoff_duration_seconds = - exponential_backoff_duration_seconds(n, base_increment, max_seconds); - if backoff_duration_seconds > 0.0 { - info!( - "Backoff: waiting {backoff_duration_seconds} seconds before processing with the task", - ); - tokio::time::sleep(std::time::Duration::from_secs_f64(backoff_duration_seconds)).await; - } -} - -pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 { - if n == 0 { - 0.0 - } else { - (1.0 + base_increment).powf(f64::from(n)).min(max_seconds) - } -} - /// The name of the metadata file pageserver creates per timeline. /// Full path: `tenants/<tenant_id>/timelines/<timeline_id>/metadata`. pub const METADATA_FILE_NAME: &str = "metadata"; @@ -190,7 +168,7 @@ pub struct InitializationOrder { /// Each timeline owns a clone of this to be consumed on the initial logical size calculation /// attempt. It is important to drop this once the attempt has completed. - pub initial_logical_size_attempt: utils::completion::Completion, + pub initial_logical_size_attempt: Option<utils::completion::Completion>, /// Barrier for when we can start any background jobs.
/// @@ -226,6 +204,7 @@ async fn timed( let ret = fut.await; + // this has a global allowed_errors tracing::warn!( task = name, elapsed_ms = started.elapsed().as_millis(), @@ -237,37 +216,6 @@ async fn timed( } } -#[cfg(test)] -mod backoff_defaults_tests { - use super::*; - - #[test] - fn backoff_defaults_produce_growing_backoff_sequence() { - let mut current_backoff_value = None; - - for i in 0..10_000 { - let new_backoff_value = exponential_backoff_duration_seconds( - i, - DEFAULT_BASE_BACKOFF_SECONDS, - DEFAULT_MAX_BACKOFF_SECONDS, - ); - - if let Some(old_backoff_value) = current_backoff_value.replace(new_backoff_value) { - assert!( - old_backoff_value <= new_backoff_value, - "{i}th backoff value {new_backoff_value} is smaller than the previous one {old_backoff_value}" - ) - } - } - - assert_eq!( - current_backoff_value.expect("Should have produced backoff values to compare"), - DEFAULT_MAX_BACKOFF_SECONDS, - "Given big enough of retries, backoff should reach its allowed max value" - ); - } -} - #[cfg(test)] mod timed_tests { use super::timed; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index a9885a5f700c1..524e3a60a1c07 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -28,6 +28,7 @@ use std::cmp::min; use std::collections::hash_map::Entry; use std::collections::BTreeSet; use std::collections::HashMap; +use std::fmt::Debug; use std::fs; use std::fs::File; use std::fs::OpenOptions; @@ -46,8 +47,10 @@ use std::sync::{Mutex, RwLock}; use std::time::{Duration, Instant}; use self::config::TenantConf; +use self::delete::DeleteTenantFlow; use self::metadata::LoadMetadataError; use self::metadata::TimelineMetadata; +use self::mgr::TenantsMap; use self::remote_timeline_client::RemoteTimelineClient; use self::timeline::uninit::TimelineUninitMark; use self::timeline::uninit::UninitializedTimeline; @@ -106,6 +109,7 @@ macro_rules! pausable_failpoint { pub mod blob_io; pub mod block_io; + pub mod disk_btree; pub(crate) mod ephemeral_file; pub mod layer_map; @@ -118,6 +122,7 @@ mod remote_timeline_client; pub mod storage_layer; pub mod config; +pub mod delete; pub mod mgr; pub mod tasks; pub mod upload_queue; @@ -145,6 +150,8 @@ pub const TIMELINES_SEGMENT_NAME: &str = "timelines"; pub const TENANT_ATTACHING_MARKER_FILENAME: &str = "attaching"; +pub const TENANT_DELETED_MARKER_FILE_NAME: &str = "deleted"; + /// /// Tenant consists of multiple timelines. Keep them in a hash table. /// @@ -183,6 +190,8 @@ pub struct Tenant { cached_synthetic_tenant_size: Arc<AtomicU64>, eviction_task_tenant_state: tokio::sync::Mutex<EvictionTaskTenantState>, + + pub(crate) delete_progress: Arc<tokio::sync::Mutex<DeleteTenantFlow>>, } // We should not blindly overwrite local metadata with remote one.
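
The `delete_progress` field added to `Tenant` above is the synchronization point for the whole deletion flow: `DeleteTenantFlow` takes an owned lock on it, so at most one deletion can run per tenant, and a concurrent request observes "already in progress" instead of queueing. A standalone sketch of that guard pattern, with illustrative names rather than the real pageserver types:

use std::sync::Arc;
use tokio::sync::{Mutex, OwnedMutexGuard};

#[derive(Default)]
enum Flow {
    #[default]
    NotStarted,
    InProgress,
    Finished,
}

// First caller gets the owned guard and drives the state machine; later
// callers see the lock held and report "already in progress" immediately.
fn try_begin(progress: &Arc<Mutex<Flow>>) -> Result<OwnedMutexGuard<Flow>, &'static str> {
    Arc::clone(progress)
        .try_lock_owned()
        .map_err(|_| "deletion already in progress")
}

#[tokio::main]
async fn main() {
    let progress = Arc::new(Mutex::new(Flow::default()));
    let mut guard = try_begin(&progress).expect("first caller wins");
    *guard = Flow::InProgress;
    assert!(try_begin(&progress).is_err()); // concurrent caller is rejected
    *guard = Flow::Finished;
}

Because the guard is an *owned* mutex guard, it can be moved into the spawned background task and released only when deletion finishes or fails, which is exactly what makes the 409/retry behavior of the HTTP handler observable.
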
@@ -274,7 +283,7 @@ pub enum LoadLocalTimelineError { ResumeDeletion(#[source] anyhow::Error), } -#[derive(Debug, thiserror::Error)] +#[derive(thiserror::Error)] pub enum DeleteTimelineError { #[error("NotFound")] NotFound, HasChildren(Vec<TimelineId>), #[error("Timeline deletion is already in progress")] - AlreadyInProgress, + AlreadyInProgress(Arc<tokio::sync::Mutex<DeleteTimelineFlow>>), #[error(transparent)] Other(#[from] anyhow::Error), } +impl Debug for DeleteTimelineError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::NotFound => write!(f, "NotFound"), + Self::HasChildren(c) => f.debug_tuple("HasChildren").field(c).finish(), + Self::AlreadyInProgress(_) => f.debug_tuple("AlreadyInProgress").finish(), + Self::Other(e) => f.debug_tuple("Other").field(e).finish(), + } + } +} + pub enum SetStoppingError { AlreadyStopping(completion::Barrier), Broken, } +impl Debug for SetStoppingError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::AlreadyStopping(_) => f.debug_tuple("AlreadyStopping").finish(), + Self::Broken => write!(f, "Broken"), + } + } +} + struct RemoteStartupData { index_part: IndexPart, remote_metadata: TimelineMetadata, @@ -616,7 +645,7 @@ impl Tenant { // For every timeline, download the metadata file, scan the local directory, // and build a layer map that contains an entry for each remote and local // layer file. - let sorted_timelines = tree_sort_timelines(timeline_ancestors)?; + let sorted_timelines = tree_sort_timelines(timeline_ancestors, |m| m.ancestor_timeline())?; for (timeline_id, remote_metadata) in sorted_timelines { let (index_part, remote_client) = remote_index_and_client .remove(&timeline_id) @@ -740,12 +769,13 @@ impl Tenant { /// If the loading fails for some reason, the Tenant will go into Broken /// state. #[instrument(skip_all, fields(tenant_id=%tenant_id))] - pub fn spawn_load( + pub(crate) fn spawn_load( conf: &'static PageServerConf, tenant_id: TenantId, broker_client: storage_broker::BrokerClientChannel, remote_storage: Option<GenericRemoteStorage>, init_order: Option<InitializationOrder>, + tenants: &'static tokio::sync::RwLock<TenantsMap>, ctx: &RequestContext, ) -> Arc<Tenant> { span::debug_assert_current_span_has_tenant_id(); @@ -765,7 +795,7 @@ impl Tenant { tenant_conf, wal_redo_manager, tenant_id, - remote_storage, + remote_storage.clone(), ); let tenant = Arc::new(tenant); @@ -781,27 +811,83 @@ impl Tenant { "initial tenant load", false, async move { + let make_broken = |t: &Tenant, err: anyhow::Error| { + error!("load failed, setting tenant state to Broken: {err:?}"); + t.state.send_modify(|state| { + assert!( + matches!(*state, TenantState::Loading | TenantState::Stopping { .. }), + "the loading task owns the tenant state until activation is complete" + ); + *state = TenantState::broken_from_reason(err.to_string()); + }); + }; + let mut init_order = init_order; // take the completion because initial tenant loading will complete when all of // these tasks complete.
- let _completion = init_order.as_mut().and_then(|x| x.initial_tenant_load.take()); + let _completion = init_order + .as_mut() + .and_then(|x| x.initial_tenant_load.take()); + + // Don't block pageserver startup on figuring out deletion status + let pending_deletion = { + match DeleteTenantFlow::should_resume_deletion( + conf, + remote_storage.as_ref(), + &tenant_clone, + ) + .await + { + Ok(should_resume_deletion) => should_resume_deletion, + Err(err) => { + make_broken(&tenant_clone, anyhow::anyhow!(err)); + return Ok(()); + } + } + }; + + info!("pending deletion {}", pending_deletion.is_some()); + + if let Some(deletion) = pending_deletion { + // as we are no longer loading, signal completion by dropping + // the completion while we resume deletion + drop(_completion); + // do not hold on to initial_logical_size_attempt, as it would prevent loading from proceeding without a timeout + let _ = init_order + .as_mut() + .and_then(|x| x.initial_logical_size_attempt.take()); + + match DeleteTenantFlow::resume( + deletion, + &tenant_clone, + init_order.as_ref(), + tenants, + &ctx, + ) + .await + { + Err(err) => { + make_broken(&tenant_clone, anyhow::anyhow!(err)); + return Ok(()); + } + Ok(()) => return Ok(()), + } + } + + let background_jobs_can_start = + init_order.as_ref().map(|x| &x.background_jobs_can_start); match tenant_clone.load(init_order.as_ref(), &ctx).await { Ok(()) => { - debug!("load finished, activating"); - let background_jobs_can_start = init_order.as_ref().map(|x| &x.background_jobs_can_start); + debug!("load finished"); + tenant_clone.activate(broker_client, background_jobs_can_start, &ctx); } - Err(err) => { - error!("load failed, setting tenant state to Broken: {err:?}"); - tenant_clone.state.send_modify(|state| { - assert_eq!(*state, TenantState::Loading, "the loading task owns the tenant state until activation is complete"); - *state = TenantState::broken_from_reason(err.to_string()); - }); - } + Err(err) => make_broken(&tenant_clone, err), } - Ok(()) + + Ok(()) } .instrument({ let span = tracing::info_span!(parent: None, "load", tenant_id=%tenant_id); @@ -877,6 +963,8 @@ impl Tenant { ) })?; + info!("Found deletion mark for timeline {}", timeline_id); + match load_metadata(self.conf, &self.tenant_id, &timeline_id) { Ok(metadata) => { timelines_to_resume_deletion.push((timeline_id, Some(metadata))) @@ -966,9 +1054,11 @@ impl Tenant { // Sort the array of timeline IDs into tree-order, so that parent comes before // all its children. - tree_sort_timelines(timelines_to_load).map(|sorted_timelines| TenantDirectoryScan { - sorted_timelines_to_load: sorted_timelines, - timelines_to_resume_deletion, + tree_sort_timelines(timelines_to_load, |m| m.ancestor_timeline()).map(|sorted_timelines| { + TenantDirectoryScan { + sorted_timelines_to_load: sorted_timelines, + timelines_to_resume_deletion, + } }) } @@ -1682,7 +1772,7 @@ impl Tenant { // It's messed up. // we just ignore the failure to stop - match self.set_stopping(shutdown_progress).await { + match self.set_stopping(shutdown_progress, false).await { Ok(()) => {} Err(SetStoppingError::Broken) => { // assume that this is acceptable @@ -1722,18 +1812,25 @@ impl Tenant { /// This function waits for the tenant to become active if it isn't already, before transitioning it into Stopping state. /// /// This function is not cancel-safe! - async fn set_stopping(&self, progress: completion::Barrier) -> Result<(), SetStoppingError> { + /// + /// `allow_transition_from_loading` is needed for the special case of the loading task deleting the tenant.
+ async fn set_stopping( + &self, + progress: completion::Barrier, + allow_transition_from_loading: bool, + ) -> Result<(), SetStoppingError> { let mut rx = self.state.subscribe(); // cannot stop before we're done activating, so wait out until we're done activating rx.wait_for(|state| match state { - TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => { + TenantState::Activating(_) | TenantState::Attaching => { info!( "waiting for {} to turn Active|Broken|Stopping", <&'static str>::from(state) ); false } + TenantState::Loading => allow_transition_from_loading, TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping { .. } => true, }) .await @@ -1742,9 +1839,16 @@ impl Tenant { // we now know we're done activating, let's see whether this task is the winner to transition into Stopping let mut err = None; let stopping = self.state.send_if_modified(|current_state| match current_state { - TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => { + TenantState::Activating(_) | TenantState::Attaching => { unreachable!("we ensured above that we're done with activation, and, there is no re-activation") } + TenantState::Loading => { + if !allow_transition_from_loading { + unreachable!("we ensured above that we're done with activation, and, there is no re-activation") + }; + *current_state = TenantState::Stopping { progress }; + true + } TenantState::Active => { // FIXME: due to time-of-check vs time-of-use issues, it can happen that new timelines // are created after the transition to Stopping. That's harmless, as the Timelines @@ -1813,6 +1917,10 @@ impl Tenant { .expect("cannot drop self.state while on a &self method"); // we now know we're done activating, let's see whether this task is the winner to transition into Broken + self.set_broken_no_wait(reason) + } + + pub(crate) fn set_broken_no_wait(&self, reason: String) { self.state.send_modify(|current_state| { match *current_state { TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => { @@ -1878,22 +1986,28 @@ impl Tenant { /// Given a Vec of timelines and their ancestors (timeline_id, ancestor_id), /// perform a topological sort, so that the parent of each timeline comes /// before the children. -fn tree_sort_timelines( - timelines: HashMap<TimelineId, TimelineMetadata>, -) -> anyhow::Result<Vec<(TimelineId, TimelineMetadata)>> { +/// `E` extracts the ancestor from `T`. +/// This allows `T` to vary: it can be `TimelineMetadata`, `Timeline` itself, etc.
+fn tree_sort_timelines<T, E>( + timelines: HashMap<TimelineId, T>, + extractor: E, +) -> anyhow::Result<Vec<(TimelineId, T)>> +where + E: Fn(&T) -> Option<TimelineId>, +{ let mut result = Vec::with_capacity(timelines.len()); let mut now = Vec::with_capacity(timelines.len()); // (ancestor, children) - let mut later: HashMap<TimelineId, Vec<(TimelineId, TimelineMetadata)>> = + let mut later: HashMap<TimelineId, Vec<(TimelineId, T)>> = HashMap::with_capacity(timelines.len()); - for (timeline_id, metadata) in timelines { - if let Some(ancestor_id) = metadata.ancestor_timeline() { + for (timeline_id, value) in timelines { + if let Some(ancestor_id) = extractor(&value) { let children = later.entry(ancestor_id).or_default(); - children.push((timeline_id, metadata)); + children.push((timeline_id, value)); } else { - now.push((timeline_id, metadata)); + now.push((timeline_id, value)); } } @@ -2062,7 +2176,7 @@ impl Tenant { remote_client, pg_version, initial_logical_size_can_start.cloned(), - initial_logical_size_attempt.cloned(), + initial_logical_size_attempt.cloned().flatten(), state, ); @@ -2146,6 +2260,7 @@ impl Tenant { cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()), cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)), eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()), + delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTenantFlow::default())), } } @@ -2162,6 +2277,7 @@ impl Tenant { // FIXME If the config file is not found, assume that we're attaching // a detached tenant and config is passed via attach command. // https://github.com/neondatabase/neon/issues/1555 + // OR: we're loading after incomplete deletion that managed to remove config. if !target_config_path.exists() { info!("tenant config not found in {target_config_display}"); return Ok(TenantConfOpt::default()); diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index 7dd53407e73f9..fd211e8276606 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -21,14 +21,14 @@ where R: BlockReader, { /// Read a blob into a new buffer. - pub fn read_blob(&self, offset: u64) -> Result<Vec<u8>, std::io::Error> { + pub async fn read_blob(&self, offset: u64) -> Result<Vec<u8>, std::io::Error> { let mut buf = Vec::new(); - self.read_blob_into_buf(offset, &mut buf)?; + self.read_blob_into_buf(offset, &mut buf).await?; Ok(buf) } /// Read blob into the given buffer. Any previous contents in the buffer /// are overwritten.
- pub fn read_blob_into_buf( + pub async fn read_blob_into_buf( &self, offset: u64, dstbuf: &mut Vec<u8>, diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs new file mode 100644 index 0000000000000..38fc31f69c1cc --- /dev/null +++ b/pageserver/src/tenant/delete.rs @@ -0,0 +1,559 @@ +use std::{ + path::{Path, PathBuf}, + sync::Arc, +}; + +use anyhow::Context; +use pageserver_api::models::TenantState; +use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath}; +use tokio::sync::OwnedMutexGuard; +use tracing::{error, info, instrument, warn, Instrument, Span}; + +use utils::{ + backoff, completion, crashsafe, fs_ext, + id::{TenantId, TimelineId}, +}; + +use crate::{ + config::PageServerConf, + context::RequestContext, + task_mgr::{self, TaskKind}, + InitializationOrder, +}; + +use super::{ + mgr::{GetTenantError, TenantsMap}, + remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD}, + span, + timeline::delete::DeleteTimelineFlow, + tree_sort_timelines, DeleteTimelineError, Tenant, +}; + +const SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS: u32 = 3; + +#[derive(Debug, thiserror::Error)] +pub enum DeleteTenantError { + #[error("GetTenant {0}")] + Get(#[from] GetTenantError), + + #[error("Invalid state {0}. Expected Active or Broken")] + InvalidState(TenantState), + + #[error("Tenant deletion is already in progress")] + AlreadyInProgress, + + #[error("Timeline {0}")] + Timeline(#[from] DeleteTimelineError), + + #[error(transparent)] + Other(#[from] anyhow::Error), +} + +type DeletionGuard = tokio::sync::OwnedMutexGuard<DeleteTenantFlow>; + +fn remote_tenant_delete_mark_path( + conf: &PageServerConf, + tenant_id: &TenantId, +) -> anyhow::Result<RemotePath> { + let tenant_remote_path = conf + .tenant_path(tenant_id) + .strip_prefix(&conf.workdir) + .context("Failed to strip workdir prefix") + .and_then(RemotePath::new) + .context("tenant path")?; + Ok(tenant_remote_path.join(Path::new("deleted"))) +} + +async fn create_remote_delete_mark( + conf: &PageServerConf, + remote_storage: &GenericRemoteStorage, + tenant_id: &TenantId, +) -> Result<(), DeleteTenantError> { + let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_id)?; + + let data: &[u8] = &[]; + backoff::retry( + || async { + remote_storage + .upload(data, 0, &remote_mark_path, None) + .await + }, + |_e| false, + FAILED_UPLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, + "mark_upload", + ) + .await + .context("mark_upload")?; + + Ok(()) +} + +async fn create_local_delete_mark( + conf: &PageServerConf, + tenant_id: &TenantId, +) -> Result<(), DeleteTenantError> { + let marker_path = conf.tenant_deleted_mark_file_path(tenant_id); + + // Note: we're ok to replace existing file. + let _ = std::fs::OpenOptions::new() + .write(true) + .create(true) + .open(&marker_path) + .with_context(|| format!("could not create delete marker file {marker_path:?}"))?; + + crashsafe::fsync_file_and_parent(&marker_path).context("sync_mark")?; + + Ok(()) +} + +async fn schedule_ordered_timeline_deletions( + tenant: &Arc<Tenant>, +) -> Result<Vec<(Arc<tokio::sync::Mutex<DeleteTimelineFlow>>, TimelineId)>, DeleteTenantError> { + // Tenant is stopping at this point. We know it will be deleted. + // No new timelines should be created. + // Tree sort timelines to delete from leaves to the root. + // NOTE: by calling clone we release the mutex which creates a possibility for a race: pending deletion + // can complete and remove timeline from the map in between our call to clone + // and `DeleteTimelineFlow::run`, so `run` won't find the timeline in the `timelines` map.
+ // timelines.lock is currently synchronous so we can't hold it across an await point. + // So just ignore NotFound error if we get it from `run`. + // Beware: in case it becomes async and we try to hold it here, `run` also locks it, which can create a deadlock. + let timelines = tenant.timelines.lock().unwrap().clone(); + let sorted = + tree_sort_timelines(timelines, |t| t.get_ancestor_timeline_id()).context("tree sort")?; + + let mut already_running_deletions = vec![]; + + for (timeline_id, _) in sorted.into_iter().rev() { + if let Err(e) = DeleteTimelineFlow::run(tenant, timeline_id, true).await { + match e { + DeleteTimelineError::NotFound => { + // Timeline deletion finished after call to clone above but before call + // to `DeleteTimelineFlow::run` and removed timeline from the map. + continue; + } + DeleteTimelineError::AlreadyInProgress(guard) => { + already_running_deletions.push((guard, timeline_id)); + continue; + } + e => return Err(DeleteTenantError::Timeline(e)), + } + } + } + + Ok(already_running_deletions) +} + +async fn ensure_timelines_dir_empty(timelines_path: &Path) -> Result<(), DeleteTenantError> { + // Assert timelines dir is empty. + if !fs_ext::is_directory_empty(timelines_path).await? { + // Display up to the first 10 items in the directory; a fixed [..10] slice would panic when fewer entries remain + let list: Vec<String> = fs_ext::list_dir(timelines_path) + .await + .context("list_dir")? + .into_iter() + .take(10) + .collect(); + return Err(DeleteTenantError::Other(anyhow::anyhow!( + "Timelines directory is not empty after all timelines deletion: {list:?}" + ))); + } + + Ok(()) +} + +async fn remove_tenant_remote_delete_mark( + conf: &PageServerConf, + remote_storage: Option<&GenericRemoteStorage>, + tenant_id: &TenantId, +) -> Result<(), DeleteTenantError> { + if let Some(remote_storage) = remote_storage { + let path = remote_tenant_delete_mark_path(conf, tenant_id)?; + backoff::retry( + || async { remote_storage.delete(&path).await }, + |_e| false, + FAILED_UPLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, + "remove_tenant_remote_delete_mark", + ) + .await + .context("remove_tenant_remote_delete_mark")?; + } + Ok(()) +} + +// Cleanup fs traces: tenant config, timelines dir local delete mark, tenant dir +async fn cleanup_remaining_fs_traces( + conf: &PageServerConf, + tenant_id: &TenantId, +) -> Result<(), DeleteTenantError> { + let rm = |p: PathBuf, is_dir: bool| async move { + if is_dir { + tokio::fs::remove_dir(&p).await + } else { + tokio::fs::remove_file(&p).await + } + .or_else(fs_ext::ignore_not_found) + .with_context(|| { + let to_display = p.display(); + format!("failed to delete {to_display}") + }) + }; + + rm(conf.tenant_config_path(tenant_id), false).await?; + + fail::fail_point!("tenant-delete-before-remove-timelines-dir", |_| { + Err(anyhow::anyhow!( + "failpoint: tenant-delete-before-remove-timelines-dir" + ))? + }); + + rm(conf.timelines_path(tenant_id), true).await?; + + fail::fail_point!("tenant-delete-before-remove-deleted-mark", |_| { + Err(anyhow::anyhow!( + "failpoint: tenant-delete-before-remove-deleted-mark" + ))? + }); + + rm(conf.tenant_deleted_mark_file_path(tenant_id), false).await?; + + fail::fail_point!("tenant-delete-before-remove-tenant-dir", |_| { + Err(anyhow::anyhow!( + "failpoint: tenant-delete-before-remove-tenant-dir" + ))? + }); + + rm(conf.tenant_path(tenant_id), true).await?; + + Ok(()) +} + +/// Orchestrates tenant shutdown of all tasks, removes its in-memory structures, +/// and deletes its data from both disk and S3. +/// The sequence of steps: +/// 1. Upload remote deletion mark. +/// 2. Create local mark file. +/// 3. Shutdown tasks +/// 4.
Run ordered timeline deletions +/// 5. Wait for timeline deletion operations that were scheduled before tenant deletion was requested +/// 6. Remove remote mark +/// 7. Cleanup remaining fs traces, tenant dir, config, timelines dir, local delete mark +/// It is resumable from any step in case a crash/restart occurs. +/// There are two entrypoints to the process: +/// 1. [`DeleteTenantFlow::run`] is the main one, called by the management api handler. +/// 2. [`DeleteTenantFlow::resume`] is called during restarts when local or remote deletion marks are still there. +/// Note the only other place that messes around with the tenant delete mark is the `Tenant::spawn_load` function. +#[derive(Default)] +pub enum DeleteTenantFlow { + #[default] + NotStarted, + InProgress, + Finished, +} + +impl DeleteTenantFlow { + // These steps are run in the context of the management api request handler. + // Long-running steps continue to run in the background. + // NB: If this fails half-way through, and is retried, the retry will go through + // all the same steps again. Make sure the code here is idempotent, and don't + // error out if some of the shutdown tasks have already been completed! + // NOTE: static needed for background part. + // We assume that calling code sets up the span with tenant_id. + #[instrument(skip_all)] + pub(crate) async fn run( + conf: &'static PageServerConf, + remote_storage: Option<GenericRemoteStorage>, + tenants: &'static tokio::sync::RwLock<TenantsMap>, + tenant_id: TenantId, + ) -> Result<(), DeleteTenantError> { + span::debug_assert_current_span_has_tenant_id(); + + let (tenant, mut guard) = Self::prepare(tenants, tenant_id).await?; + + if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await { + tenant.set_broken(format!("{e:#}")).await; + return Err(e); + } + + Self::schedule_background(guard, conf, remote_storage, tenants, tenant); + + Ok(()) + } + + // Helper function needed to be able to match once on the returned error and transition the tenant into the broken state. + // This is needed because tenant.shutdown is not idempotent. If tenant state is set to stopping, another call to tenant.shutdown + // will result in an error, but here we need to be able to retry shutdown when tenant deletion is retried. + // So the solution is to set tenant state to broken. + async fn run_inner( + guard: &mut OwnedMutexGuard<DeleteTenantFlow>, + conf: &'static PageServerConf, + remote_storage: Option<&GenericRemoteStorage>, + tenant: &Tenant, + ) -> Result<(), DeleteTenantError> { + guard.mark_in_progress()?; + + fail::fail_point!("tenant-delete-before-create-remote-mark", |_| { + Err(anyhow::anyhow!( + "failpoint: tenant-delete-before-create-remote-mark" + ))? + }); + + // IDEA: implement detach as delete without remote storage. Then they would use the same lock (deletion_progress) so they won't contend. + // Though sounds scary, different mark name? + // Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state. + if let Some(remote_storage) = &remote_storage { + create_remote_delete_mark(conf, remote_storage, &tenant.tenant_id) + .await + .context("remote_mark")? + } + + fail::fail_point!("tenant-delete-before-create-local-mark", |_| { + Err(anyhow::anyhow!( + "failpoint: tenant-delete-before-create-local-mark" + ))? + }); + + create_local_delete_mark(conf, &tenant.tenant_id) + .await + .context("local delete mark")?; + + fail::fail_point!("tenant-delete-before-background", |_| { + Err(anyhow::anyhow!( + "failpoint: tenant-delete-before-background" + ))?
+ }); + + Ok(()) + } + + fn mark_in_progress(&mut self) -> anyhow::Result<()> { + match self { + Self::Finished => anyhow::bail!("Bug. Is in finished state"), + Self::InProgress { .. } => { /* We're in a retry */ } + Self::NotStarted => { /* Fresh start */ } + } + + *self = Self::InProgress; + + Ok(()) + } + + pub async fn should_resume_deletion( + conf: &'static PageServerConf, + remote_storage: Option<&GenericRemoteStorage>, + tenant: &Tenant, + ) -> Result<Option<DeletionGuard>, DeleteTenantError> { + let acquire = |t: &Tenant| { + Some( + Arc::clone(&t.delete_progress) + .try_lock_owned() + .expect("we're the only owner during init"), + ) + }; + + let tenant_id = tenant.tenant_id; + // Check the local mark first; if it's there, there is no need to go to S3 to check whether the remote one exists. + if conf.tenant_deleted_mark_file_path(&tenant_id).exists() { + return Ok(acquire(tenant)); + } + + let remote_storage = match remote_storage { + Some(remote_storage) => remote_storage, + None => return Ok(None), + }; + + // If remote storage is there we rely on it + let remote_mark_path = remote_tenant_delete_mark_path(conf, &tenant_id)?; + + let result = backoff::retry( + || async { remote_storage.download(&remote_mark_path).await }, + |e| matches!(e, DownloadError::NotFound), + SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS, + SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS, + "fetch_tenant_deletion_mark", + ) + .await; + + match result { + Ok(_) => Ok(acquire(tenant)), + Err(DownloadError::NotFound) => Ok(None), + Err(e) => Err(anyhow::anyhow!(e)).context("should_resume_deletion")?, + } + } + + pub(crate) async fn resume( + guard: DeletionGuard, + tenant: &Arc<Tenant>, + init_order: Option<&InitializationOrder>, + tenants: &'static tokio::sync::RwLock<TenantsMap>, + ctx: &RequestContext, + ) -> Result<(), DeleteTenantError> { + let (_, progress) = completion::channel(); + + tenant + .set_stopping(progress, true) + .await + .expect("can't be stopping or broken"); + + // Do not consume valuable resources during the load phase, continue deletion once init phase is complete. + let background_jobs_can_start = init_order.as_ref().map(|x| &x.background_jobs_can_start); + if let Some(background) = background_jobs_can_start { + info!("waiting for background jobs barrier"); + background.clone().wait().await; + info!("ready for background jobs barrier"); + } + + // Tenant may not be loadable if we fail late in cleanup_remaining_fs_traces (e.g. removing the timelines dir) + let timelines_path = tenant.conf.timelines_path(&tenant.tenant_id); + if timelines_path.exists() { + tenant.load(init_order, ctx).await.context("load")?; + } + + Self::background( + guard, + tenant.conf, + tenant.remote_storage.clone(), + tenants, + tenant, + ) + .await + } + + async fn prepare( + tenants: &tokio::sync::RwLock<TenantsMap>, + tenant_id: TenantId, + ) -> Result<(Arc<Tenant>, tokio::sync::OwnedMutexGuard<DeleteTenantFlow>), DeleteTenantError> { + let m = tenants.read().await; + + let tenant = m + .get(&tenant_id) + .ok_or(GetTenantError::NotFound(tenant_id))?; + + // FIXME: unsure about active only. Our init jobs may not be cancellable properly, + // so at least for now allow deletions only for active tenants. TODO recheck + // Broken and Stopping are needed for retries. + if !matches!( + tenant.current_state(), + TenantState::Active | TenantState::Broken { ..
} + ) { + return Err(DeleteTenantError::InvalidState(tenant.current_state())); + } + + let guard = Arc::clone(&tenant.delete_progress) + .try_lock_owned() + .map_err(|_| DeleteTenantError::AlreadyInProgress)?; + + fail::fail_point!("tenant-delete-before-shutdown", |_| { + Err(anyhow::anyhow!("failpoint: tenant-delete-before-shutdown"))? + }); + + // make pageserver shutdown not wait for our completion + let (_, progress) = completion::channel(); + + // It would be good to only set stopping here and continue shutdown in the background, but shutdown is not idempotent. + // i.e. it is an error to do: + // tenant.set_stopping + // tenant.shutdown + // It's also bad that we're holding tenants.read here. + // TODO relax set_stopping to be idempotent? + if tenant.shutdown(progress, false).await.is_err() { + return Err(DeleteTenantError::Other(anyhow::anyhow!( + "tenant shutdown is already in progress" + ))); + } + + Ok((Arc::clone(tenant), guard)) + } + + fn schedule_background( + guard: OwnedMutexGuard<DeleteTenantFlow>, + conf: &'static PageServerConf, + remote_storage: Option<GenericRemoteStorage>, + tenants: &'static tokio::sync::RwLock<TenantsMap>, + tenant: Arc<Tenant>, + ) { + let tenant_id = tenant.tenant_id; + + task_mgr::spawn( + task_mgr::BACKGROUND_RUNTIME.handle(), + TaskKind::TimelineDeletionWorker, + Some(tenant_id), + None, + "tenant_delete", + false, + async move { + if let Err(err) = + Self::background(guard, conf, remote_storage, tenants, &tenant).await + { + error!("Error: {err:#}"); + tenant.set_broken(format!("{err:#}")).await; + }; + Ok(()) + } + .instrument({ + let span = tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_id); + span.follows_from(Span::current()); + span + }), + ); + } + + async fn background( + mut guard: OwnedMutexGuard<DeleteTenantFlow>, + conf: &PageServerConf, + remote_storage: Option<GenericRemoteStorage>, + tenants: &'static tokio::sync::RwLock<TenantsMap>, + tenant: &Arc<Tenant>, + ) -> Result<(), DeleteTenantError> { + // Tree sort timelines, schedule delete for them. Mention retries from the console side. + // Note that if deletion fails we don't mark timelines as broken; + // the whole tenant will become broken by the `Self::schedule_background` logic + let already_running_timeline_deletions = schedule_ordered_timeline_deletions(tenant) + .await + .context("schedule_ordered_timeline_deletions")?; + + fail::fail_point!("tenant-delete-before-polling-ongoing-deletions", |_| { + Err(anyhow::anyhow!( + "failpoint: tenant-delete-before-polling-ongoing-deletions" + ))? + }); + + // Wait for deletions that were already running at the moment when tenant deletion was requested. + // When we can lock the deletion guard, it means the corresponding timeline deletion has finished. + for (guard, timeline_id) in already_running_timeline_deletions { + let flow = guard.lock().await; + if !flow.is_finished() { + return Err(DeleteTenantError::Other(anyhow::anyhow!( + "already running timeline deletion failed: {timeline_id}" + ))); + } + } + + let timelines_path = conf.timelines_path(&tenant.tenant_id); + // May not exist if we fail in cleanup_remaining_fs_traces after removing it + if timelines_path.exists() { + // sanity check to guard against layout changes + ensure_timelines_dir_empty(&timelines_path) + .await + .context("timelines dir not empty")?; + } + + remove_tenant_remote_delete_mark(conf, remote_storage.as_ref(), &tenant.tenant_id).await?; + + fail::fail_point!("tenant-delete-before-cleanup-remaining-fs-traces", |_| { + Err(anyhow::anyhow!( + "failpoint: tenant-delete-before-cleanup-remaining-fs-traces" + ))?
+ }); + + cleanup_remaining_fs_traces(conf, &tenant.tenant_id) + .await + .context("cleanup_remaining_fs_traces")?; + + let mut locked = tenants.write().await; + if locked.remove(&tenant.tenant_id).is_none() { + warn!("Tenant got removed from tenants map during deletion"); + }; + + *guard = Self::Finished; + + Ok(()) + } +} diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs index b163d27b421e5..d0ebd3eb4e69b 100644 --- a/pageserver/src/tenant/disk_btree.rs +++ b/pageserver/src/tenant/disk_btree.rs @@ -20,6 +20,7 @@ //! use byteorder::{ReadBytesExt, BE}; use bytes::{BufMut, Bytes, BytesMut}; +use either::Either; use hex; use std::{cmp::Ordering, io, result}; use thiserror::Error; @@ -256,103 +257,77 @@ where where V: FnMut(&[u8], u64) -> bool, { - self.search_recurse(self.root_blk, search_key, dir, &mut visitor) - } - - fn search_recurse( - &self, - node_blknum: u32, - search_key: &[u8; L], - dir: VisitDirection, - visitor: &mut V, - ) -> Result - where - V: FnMut(&[u8], u64) -> bool, - { - // Locate the node. - let node_buf = self.reader.read_blk(self.start_blk + node_blknum)?; - - let node = OnDiskNode::deparse(node_buf.as_ref())?; - let prefix_len = node.prefix_len as usize; - let suffix_len = node.suffix_len as usize; - - assert!(node.num_children > 0); - - let mut keybuf = Vec::new(); - keybuf.extend(node.prefix); - keybuf.resize(prefix_len + suffix_len, 0); - - if dir == VisitDirection::Forwards { - // Locate the first match - let mut idx = match node.binary_search(search_key, keybuf.as_mut_slice()) { - Ok(idx) => idx, - Err(idx) => { - if node.level == 0 { - // Imagine that the node contains the following keys: - // - // 1 - // 3 <-- idx - // 5 - // - // If the search key is '2' and there is exact match, - // the binary search would return the index of key - // '3'. That's cool, '3' is the first key to return. - idx - } else { - // This is an internal page, so each key represents a lower - // bound for what's in the child page. If there is no exact - // match, we have to return the *previous* entry. - // - // 1 <-- return this - // 3 <-- idx - // 5 - idx.saturating_sub(1) + let mut stack = Vec::new(); + stack.push((self.root_blk, None)); + while let Some((node_blknum, opt_iter)) = stack.pop() { + // Locate the node. + let node_buf = self.reader.read_blk(self.start_blk + node_blknum)?; + + let node = OnDiskNode::deparse(node_buf.as_ref())?; + let prefix_len = node.prefix_len as usize; + let suffix_len = node.suffix_len as usize; + + assert!(node.num_children > 0); + + let mut keybuf = Vec::new(); + keybuf.extend(node.prefix); + keybuf.resize(prefix_len + suffix_len, 0); + + let mut iter = if let Some(iter) = opt_iter { + iter + } else if dir == VisitDirection::Forwards { + // Locate the first match + let idx = match node.binary_search(search_key, keybuf.as_mut_slice()) { + Ok(idx) => idx, + Err(idx) => { + if node.level == 0 { + // Imagine that the node contains the following keys: + // + // 1 + // 3 <-- idx + // 5 + // + // If the search key is '2' and there is exact match, + // the binary search would return the index of key + // '3'. That's cool, '3' is the first key to return. + idx + } else { + // This is an internal page, so each key represents a lower + // bound for what's in the child page. If there is no exact + // match, we have to return the *previous* entry. + // + // 1 <-- return this + // 3 <-- idx + // 5 + idx.saturating_sub(1) + } } - } - }; - // idx points to the first match now. 
Keep going from there - let mut key_off = idx * suffix_len; - while idx < node.num_children as usize { - let suffix = &node.keys[key_off..key_off + suffix_len]; - keybuf[prefix_len..].copy_from_slice(suffix); - let value = node.value(idx); - #[allow(clippy::collapsible_if)] - if node.level == 0 { - // leaf - if !visitor(&keybuf, value.to_u64()) { - return Ok(false); + }; + Either::Left(idx..node.num_children.into()) + } else { + let idx = match node.binary_search(search_key, keybuf.as_mut_slice()) { + Ok(idx) => { + // Exact match. That's the first entry to return, and walk + // backwards from there. + idx } - } else { - #[allow(clippy::collapsible_if)] - if !self.search_recurse(value.to_blknum(), search_key, dir, visitor)? { - return Ok(false); + Err(idx) => { + // No exact match. The binary search returned the index of the + // first key that's > search_key. Back off by one, and walk + // backwards from there. + if let Some(idx) = idx.checked_sub(1) { + idx + } else { + return Ok(false); + } } - } - idx += 1; - key_off += suffix_len; - } - } else { - let mut idx = match node.binary_search(search_key, keybuf.as_mut_slice()) { - Ok(idx) => { - // Exact match. That's the first entry to return, and walk - // backwards from there. (The loop below starts from 'idx - - // 1', so add one here to compensate.) - idx + 1 - } - Err(idx) => { - // No exact match. The binary search returned the index of the - // first key that's > search_key. Back off by one, and walk - // backwards from there. (The loop below starts from idx - 1, - // so we don't need to subtract one here) - idx - } + }; + Either::Right((0..=idx).rev()) }; - // idx points to the first match + 1 now. Keep going from there. - let mut key_off = idx * suffix_len; - while idx > 0 { - idx -= 1; - key_off -= suffix_len; + // idx points to the first match now. Keep going from there + while let Some(idx) = iter.next() { + let key_off = idx * suffix_len; let suffix = &node.keys[key_off..key_off + suffix_len]; keybuf[prefix_len..].copy_from_slice(suffix); let value = node.value(idx); @@ -363,12 +338,8 @@ where return Ok(false); } } else { - #[allow(clippy::collapsible_if)] - if !self.search_recurse(value.to_blknum(), search_key, dir, visitor)? 
{ - return Ok(false); - } - } - if idx == 0 { + stack.push((node_blknum, Some(iter))); + stack.push((value.to_blknum(), None)); break; } } diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index b088a3b6028ee..38a491f931c6a 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -401,17 +401,26 @@ mod tests { Ok(()) } - #[test] - fn test_ephemeral_blobs() -> Result<(), io::Error> { + #[tokio::test] + async fn test_ephemeral_blobs() -> Result<(), io::Error> { let (conf, tenant_id, timeline_id) = harness("ephemeral_blobs")?; let mut file = EphemeralFile::create(conf, tenant_id, timeline_id)?; let pos_foo = file.write_blob(b"foo")?; - assert_eq!(b"foo", file.block_cursor().read_blob(pos_foo)?.as_slice()); + assert_eq!( + b"foo", + file.block_cursor().read_blob(pos_foo).await?.as_slice() + ); let pos_bar = file.write_blob(b"bar")?; - assert_eq!(b"foo", file.block_cursor().read_blob(pos_foo)?.as_slice()); - assert_eq!(b"bar", file.block_cursor().read_blob(pos_bar)?.as_slice()); + assert_eq!( + b"foo", + file.block_cursor().read_blob(pos_foo).await?.as_slice() + ); + assert_eq!( + b"bar", + file.block_cursor().read_blob(pos_bar).await?.as_slice() + ); let mut blobs = Vec::new(); for i in 0..10000 { @@ -428,7 +437,7 @@ mod tests { let cursor = BlockCursor::new(&file); for (pos, expected) in blobs { - let actual = cursor.read_blob(pos)?; + let actual = cursor.read_blob(pos).await?; assert_eq!(actual, expected); } @@ -437,7 +446,7 @@ mod tests { large_data.resize(20000, 0); thread_rng().fill_bytes(&mut large_data); let pos_large = file.write_blob(&large_data)?; - let result = file.block_cursor().read_blob(pos_large)?; + let result = file.block_cursor().read_blob(pos_large).await?; assert_eq!(result, large_data); Ok(()) diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 2635953e6dabb..ae6d237066c42 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -20,17 +20,19 @@ use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::{self, TaskKind}; use crate::tenant::config::TenantConfOpt; +use crate::tenant::delete::DeleteTenantFlow; use crate::tenant::{create_tenant_files, CreateTenantFilesMode, Tenant, TenantState}; use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME}; use utils::fs_ext::PathExt; use utils::id::{TenantId, TimelineId}; +use super::delete::DeleteTenantError; use super::timeline::delete::DeleteTimelineFlow; /// The tenants known to the pageserver. /// The enum variants are used to distinguish the different states that the pageserver can be in. -enum TenantsMap { +pub(crate) enum TenantsMap { /// [`init_tenant_mgr`] is not done yet. Initializing, /// [`init_tenant_mgr`] is done, all on-disk tenants have been loaded. 
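For orientation before the accessor hunk below: a minimal sketch of the map shape this file works with, using toy stand-ins for the real `Tenant` and `TenantId` types. The point to notice is that both map-backed states answer lookups through a single or-pattern arm, so tenants stay addressable while shutdown is in progress:

```rust
use std::collections::HashMap;
use std::sync::Arc;

// Toy stand-ins; the real types live in the pageserver crates.
struct Tenant;
type TenantId = u64;

enum TenantsMap {
    Initializing,
    Open(HashMap<TenantId, Arc<Tenant>>),
    ShuttingDown(HashMap<TenantId, Arc<Tenant>>),
}

impl TenantsMap {
    fn get(&self, tenant_id: &TenantId) -> Option<&Arc<Tenant>> {
        match self {
            // No lookups are possible before init_tenant_mgr finishes.
            TenantsMap::Initializing => None,
            // Or-pattern: both map-backed states share one lookup path.
            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m.get(tenant_id),
        }
    }
}
```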
@@ -42,13 +44,13 @@ enum TenantsMap { } impl TenantsMap { - fn get(&self, tenant_id: &TenantId) -> Option<&Arc> { + pub(crate) fn get(&self, tenant_id: &TenantId) -> Option<&Arc> { match self { TenantsMap::Initializing => None, TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m.get(tenant_id), } } - fn remove(&mut self, tenant_id: &TenantId) -> Option> { + pub(crate) fn remove(&mut self, tenant_id: &TenantId) -> Option> { match self { TenantsMap::Initializing => None, TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m.remove(tenant_id), @@ -97,7 +99,9 @@ pub async fn init_tenant_mgr( ); } } else { - // This case happens if we crash during attach before creating the attach marker file + // This case happens if we: + // * crash during attach before creating the attach marker file + // * crash during tenant delete before removing tenant directory let is_empty = tenant_dir_path.is_empty_dir().with_context(|| { format!("Failed to check whether {tenant_dir_path:?} is an empty dir") })?; @@ -124,6 +128,7 @@ pub async fn init_tenant_mgr( broker_client.clone(), remote_storage.clone(), Some(init_order.clone()), + &TENANTS, &ctx, ) { Ok(tenant) => { @@ -154,12 +159,13 @@ pub async fn init_tenant_mgr( Ok(()) } -pub fn schedule_local_tenant_processing( +pub(crate) fn schedule_local_tenant_processing( conf: &'static PageServerConf, tenant_path: &Path, broker_client: storage_broker::BrokerClientChannel, remote_storage: Option, init_order: Option, + tenants: &'static tokio::sync::RwLock, ctx: &RequestContext, ) -> anyhow::Result> { anyhow::ensure!( @@ -219,6 +225,7 @@ pub fn schedule_local_tenant_processing( broker_client, remote_storage, init_order, + tenants, ctx, ) }; @@ -356,7 +363,7 @@ pub async fn create_tenant( // See https://github.com/neondatabase/neon/issues/4233 let created_tenant = - schedule_local_tenant_processing(conf, &tenant_directory, broker_client, remote_storage, None, ctx)?; + schedule_local_tenant_processing(conf, &tenant_directory, broker_client, remote_storage, None, &TENANTS, ctx)?; // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here. 
// See https://github.com/neondatabase/neon/issues/4233 @@ -417,6 +424,14 @@ pub async fn get_tenant( } } +pub async fn delete_tenant( + conf: &'static PageServerConf, + remote_storage: Option, + tenant_id: TenantId, +) -> Result<(), DeleteTenantError> { + DeleteTenantFlow::run(conf, remote_storage, &TENANTS, tenant_id).await +} + #[derive(Debug, thiserror::Error)] pub enum DeleteTimelineError { #[error("Tenant {0}")] @@ -432,7 +447,7 @@ pub async fn delete_timeline( _ctx: &RequestContext, ) -> Result<(), DeleteTimelineError> { let tenant = get_tenant(tenant_id, true).await?; - DeleteTimelineFlow::run(&tenant, timeline_id).await?; + DeleteTimelineFlow::run(&tenant, timeline_id, false).await?; Ok(()) } @@ -507,7 +522,7 @@ pub async fn load_tenant( .with_context(|| format!("Failed to remove tenant ignore mark {tenant_ignore_mark:?} during tenant loading"))?; } - let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, broker_client, remote_storage, None, ctx) + let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, broker_client, remote_storage, None, &TENANTS, ctx) .with_context(|| { format!("Failed to schedule tenant processing in path {tenant_path:?}") })?; @@ -588,7 +603,7 @@ pub async fn attach_tenant( .context("check for attach marker file existence")?; anyhow::ensure!(marker_file_exists, "create_tenant_files should have created the attach marker file"); - let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, broker_client, Some(remote_storage), None, ctx)?; + let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, broker_client, Some(remote_storage), None, &TENANTS, ctx)?; // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here. // See https://github.com/neondatabase/neon/issues/4233 diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 8d002a8570617..1d04d748396b1 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -211,6 +211,9 @@ use chrono::{NaiveDateTime, Utc}; // re-export these pub use download::{is_temp_download_file, list_remote_timelines}; use scopeguard::ScopeGuard; +use utils::backoff::{ + self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, +}; use std::collections::{HashMap, VecDeque}; use std::path::Path; @@ -241,7 +244,6 @@ use crate::{ tenant::upload_queue::{ UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, UploadTask, }, - {exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}, }; use utils::id::{TenantId, TimelineId}; @@ -256,12 +258,12 @@ use super::upload_queue::SetDeletedFlagProgress; // But after FAILED_DOWNLOAD_WARN_THRESHOLD retries, we start to log it at WARN // level instead, as repeated failures can mean a more serious problem. If it // fails more than FAILED_DOWNLOAD_RETRIES times, we give up -const FAILED_DOWNLOAD_WARN_THRESHOLD: u32 = 3; -const FAILED_DOWNLOAD_RETRIES: u32 = 10; +pub(crate) const FAILED_DOWNLOAD_WARN_THRESHOLD: u32 = 3; +pub(crate) const FAILED_REMOTE_OP_RETRIES: u32 = 10; // Similarly log failed uploads and deletions at WARN level, after this many // retries. Uploads and deletions are retried forever, though. 
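The hunks that follow route one-shot remote operations through the shared `utils::backoff::retry` helper instead of open-coded loops. As a quick illustration of the calling convention, here is a toy sketch; the operation, its error handling, and the local constants are made up, and it assumes the `utils` crate from this repository is available:

```rust
use utils::backoff;

// Illustrative mirrors of the thresholds defined above.
const WARN_THRESHOLD: u32 = 3;
const MAX_RETRIES: u32 = 10;

// A stand-in for a flaky remote call.
async fn flaky_op() -> Result<(), std::io::Error> {
    Err(std::io::Error::new(std::io::ErrorKind::Other, "transient"))
}

async fn demo() -> Result<(), std::io::Error> {
    backoff::retry(
        // FnMut closure: produces a fresh future for every attempt.
        || async { flaky_op().await },
        // Nothing is treated as permanent here, so every failure retries.
        |_e| false,
        WARN_THRESHOLD,
        MAX_RETRIES,
        "flaky_op",
    )
    .await
}
```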
-const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
+pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
 
 pub enum MaybeDeletedIndexPart {
     IndexPart(IndexPart),
@@ -752,12 +754,24 @@ impl RemoteTimelineClient {
 
         pausable_failpoint!("persist_deleted_index_part");
 
-        upload::upload_index_part(
-            self.conf,
-            &self.storage_impl,
-            &self.tenant_id,
-            &self.timeline_id,
-            &index_part_with_deleted_at,
+        backoff::retry(
+            || async {
+                upload::upload_index_part(
+                    self.conf,
+                    &self.storage_impl,
+                    &self.tenant_id,
+                    &self.timeline_id,
+                    &index_part_with_deleted_at,
+                )
+                .await
+            },
+            |_e| false,
+            1,
+            // Have just a couple of attempts: when executed as part of timeline
+            // deletion, this runs in the context of an API call; as part of
+            // tenant deletion, it runs in the background.
+            2,
+            "persist_index_part_with_deleted_flag",
         )
         .await?;
 
@@ -834,10 +848,19 @@ impl RemoteTimelineClient {
         let timeline_path = self.conf.timeline_path(&self.tenant_id, &self.timeline_id);
         let timeline_storage_path = self.conf.remote_path(&timeline_path)?;
 
-        let remaining = self
-            .storage_impl
-            .list_prefixes(Some(&timeline_storage_path))
-            .await?;
+        let remaining = backoff::retry(
+            || async {
+                self.storage_impl
+                    .list_prefixes(Some(&timeline_storage_path))
+                    .await
+            },
+            |_e| false,
+            FAILED_DOWNLOAD_WARN_THRESHOLD,
+            FAILED_REMOTE_OP_RETRIES,
+            "list_prefixes",
+        )
+        .await
+        .context("list prefixes")?;
 
         let remaining: Vec<RemotePath> = remaining
             .into_iter()
@@ -852,7 +875,15 @@ impl RemoteTimelineClient {
             .collect();
 
         if !remaining.is_empty() {
-            self.storage_impl.delete_objects(&remaining).await?;
+            backoff::retry(
+                || async { self.storage_impl.delete_objects(&remaining).await },
+                |_e| false,
+                FAILED_UPLOAD_WARN_THRESHOLD,
+                FAILED_REMOTE_OP_RETRIES,
+                "delete_objects",
+            )
+            .await
+            .context("delete_objects")?;
         }
 
         fail::fail_point!("timeline-delete-before-index-delete", |_| {
@@ -864,7 +895,16 @@ impl RemoteTimelineClient {
         let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME));
 
         debug!("deleting index part");
-        self.storage_impl.delete(&index_file_path).await?;
+
+        backoff::retry(
+            || async { self.storage_impl.delete(&index_file_path).await },
+            |_e| false,
+            FAILED_UPLOAD_WARN_THRESHOLD,
+            FAILED_REMOTE_OP_RETRIES,
+            "delete_index",
+        )
+        .await
+        .context("delete_index")?;
 
         fail::fail_point!("timeline-delete-after-index-delete", |_| {
             Err(anyhow::anyhow!(
diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs
index 64f4a0a113675..0a6fd03887405 100644
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -11,19 +11,17 @@ use std::time::Duration;
 use anyhow::{anyhow, Context};
 use tokio::fs;
 use tokio::io::AsyncWriteExt;
-
-use tracing::{info, warn};
+use utils::backoff;
 
 use crate::config::PageServerConf;
 use crate::tenant::storage_layer::LayerFileName;
 use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
-use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS};
 use remote_storage::{DownloadError, GenericRemoteStorage};
 use utils::crashsafe::path_with_suffix_extension;
 use utils::id::{TenantId, TimelineId};
 
 use super::index::{IndexPart, LayerFileMetadata};
-use super::{FAILED_DOWNLOAD_RETRIES, FAILED_DOWNLOAD_WARN_THRESHOLD};
+use super::{FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES};
 
 async fn fsync_path(path: impl AsRef<Path>) -> Result<(), std::io::Error> {
fs::File::open(path).await?.sync_all().await @@ -268,7 +266,6 @@ pub(super) async fn download_index_part( Ok(index_part) } -/// /// Helper function to handle retries for a download operation. /// /// Remote operations can fail due to rate limits (IAM, S3), spurious network @@ -276,47 +273,17 @@ pub(super) async fn download_index_part( /// with backoff. /// /// (See similar logic for uploads in `perform_upload_task`) -async fn download_retry(mut op: O, description: &str) -> Result +async fn download_retry(op: O, description: &str) -> Result where O: FnMut() -> F, F: Future>, { - let mut attempts = 0; - loop { - let result = op().await; - match result { - Ok(_) => { - if attempts > 0 { - info!("{description} succeeded after {attempts} retries"); - } - return result; - } - - // These are "permanent" errors that should not be retried. - Err(DownloadError::BadInput(_)) | Err(DownloadError::NotFound) => { - return result; - } - // Assume that any other failure might be transient, and the operation might - // succeed if we just keep trying. - Err(DownloadError::Other(err)) if attempts < FAILED_DOWNLOAD_WARN_THRESHOLD => { - info!("{description} failed, will retry (attempt {attempts}): {err:#}"); - } - Err(DownloadError::Other(err)) if attempts < FAILED_DOWNLOAD_RETRIES => { - warn!("{description} failed, will retry (attempt {attempts}): {err:#}"); - } - Err(DownloadError::Other(ref err)) => { - // Operation failed FAILED_DOWNLOAD_RETRIES times. Time to give up. - warn!("{description} still failed after {attempts} retries, giving up: {err:?}"); - return result; - } - } - // sleep and retry - exponential_backoff( - attempts, - DEFAULT_BASE_BACKOFF_SECONDS, - DEFAULT_MAX_BACKOFF_SECONDS, - ) - .await; - attempts += 1; - } + backoff::retry( + op, + |e| matches!(e, DownloadError::BadInput(_) | DownloadError::NotFound), + FAILED_DOWNLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, + description, + ) + .await } diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 951fa38d8d49c..4107161aa1839 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -8,7 +8,7 @@ mod layer_desc; mod remote_layer; use crate::config::PageServerConf; -use crate::context::RequestContext; +use crate::context::{AccessStatsBehavior, RequestContext}; use crate::repository::Key; use crate::task_mgr::TaskKind; use crate::walrecord::NeonWalRecord; @@ -241,10 +241,14 @@ impl LayerAccessStats { }); } - fn record_access(&self, access_kind: LayerAccessKind, task_kind: TaskKind) { + fn record_access(&self, access_kind: LayerAccessKind, ctx: &RequestContext) { + if ctx.access_stats_behavior() == AccessStatsBehavior::Skip { + return; + } + let this_access = LayerAccessStatFullDetails { when: SystemTime::now(), - task_kind, + task_kind: ctx.task_kind(), access_kind, }; @@ -252,7 +256,7 @@ impl LayerAccessStats { locked.iter_mut().for_each(|inner| { inner.first_access.get_or_insert(this_access); inner.count_by_access_kind[access_kind] += 1; - inner.task_kind_flag |= task_kind; + inner.task_kind_flag |= ctx.task_kind(); inner.last_accesses.write(this_access); }) } diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 037f56663d116..725ce7072ca5b 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -51,6 +51,7 @@ use std::ops::Range; use std::os::unix::fs::FileExt; use std::path::{Path, PathBuf}; use std::sync::Arc; 
+use tokio::runtime::Handle; use tokio::sync::OnceCell; use tracing::*; @@ -280,7 +281,8 @@ impl Layer for DeltaLayer { // A subroutine to dump a single blob let dump_blob = |blob_ref: BlobRef| -> anyhow::Result { - let buf = cursor.read_blob(blob_ref.pos())?; + // TODO this is not ideal, but on the other hand we are in dumping code... + let buf = Handle::current().block_on(cursor.read_blob(blob_ref.pos()))?; let val = Value::des(&buf)?; let desc = match val { Value::Image(img) => { @@ -335,7 +337,6 @@ impl Layer for DeltaLayer { let inner = self .load(LayerAccessKind::GetValueReconstructData, ctx) .await?; - inner .get_value_reconstruct_data(key, lsn_range, reconstruct_state) .await @@ -452,8 +453,7 @@ impl DeltaLayer { access_kind: LayerAccessKind, ctx: &RequestContext, ) -> Result<&Arc> { - self.access_stats - .record_access(access_kind, ctx.task_kind()); + self.access_stats.record_access(access_kind, ctx); // Quick exit if already loaded self.inner .get_or_try_init(|| self.load_inner()) @@ -909,12 +909,15 @@ impl DeltaLayerInner { let cursor = file.block_cursor(); let mut buf = Vec::new(); for (entry_lsn, pos) in offsets { - cursor.read_blob_into_buf(pos, &mut buf).with_context(|| { - format!( - "Failed to read blob from virtual file {}", - file.file.path.display() - ) - })?; + cursor + .read_blob_into_buf(pos, &mut buf) + .await + .with_context(|| { + format!( + "Failed to read blob from virtual file {}", + file.file.path.display() + ) + })?; let val = Value::des(&buf).with_context(|| { format!( "Failed to deserialize file blob from virtual file {}", @@ -1014,9 +1017,9 @@ pub struct ValueRef> { impl> ValueRef { /// Loads the value from disk - pub fn load(&self) -> Result { + pub async fn load(&self) -> Result { // theoretically we *could* record an access time for each, but it does not really matter - let buf = self.reader.read_blob(self.blob_ref.pos())?; + let buf = self.reader.read_blob(self.blob_ref.pos()).await?; let val = Value::des(&buf)?; Ok(val) } diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 2824abba754b4..3fe600e5fffae 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -323,8 +323,7 @@ impl ImageLayer { access_kind: LayerAccessKind, ctx: &RequestContext, ) -> Result<&ImageLayerInner> { - self.access_stats - .record_access(access_kind, ctx.task_kind()); + self.access_stats.record_access(access_kind, ctx); self.inner .get_or_try_init(|| self.load_inner()) .await @@ -471,6 +470,7 @@ impl ImageLayerInner { let blob = file .block_cursor() .read_blob(offset) + .await .with_context(|| format!("failed to read value from offset {}", offset))?; let value = Bytes::from(blob); diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 3d222fcb1ea32..aa9d0884e000c 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -16,6 +16,7 @@ use anyhow::{ensure, Result}; use pageserver_api::models::InMemoryLayerInfo; use std::cell::RefCell; use std::collections::HashMap; +use std::sync::OnceLock; use tracing::*; use utils::{ bin_ser::BeSer, @@ -27,7 +28,7 @@ use utils::{ // while being able to use std::fmt::Write's methods use std::fmt::Write as _; use std::ops::Range; -use std::sync::RwLock; +use tokio::sync::RwLock; use super::{DeltaLayer, DeltaLayerWriter, Layer}; @@ -42,14 +43,16 @@ pub struct 
InMemoryLayer { tenant_id: TenantId, timeline_id: TimelineId, - /// /// This layer contains all the changes from 'start_lsn'. The /// start is inclusive. - /// start_lsn: Lsn, - /// The above fields never change. The parts that do change are in 'inner', - /// and protected by mutex. + /// Frozen layers have an exclusive end LSN. + /// Writes are only allowed when this is `None`. + end_lsn: OnceLock, + + /// The above fields never change, except for `end_lsn`, which is only set once. + /// All other changing parts are in `inner`, and protected by a mutex. inner: RwLock, } @@ -57,21 +60,16 @@ impl std::fmt::Debug for InMemoryLayer { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("InMemoryLayer") .field("start_lsn", &self.start_lsn) + .field("end_lsn", &self.end_lsn) .field("inner", &self.inner) .finish() } } pub struct InMemoryLayerInner { - /// Frozen layers have an exclusive end LSN. - /// Writes are only allowed when this is None - end_lsn: Option, - - /// /// All versions of all pages in the layer are kept here. Indexed /// by block number and LSN. The value is an offset into the /// ephemeral file where the page version is stored. - /// index: HashMap>, /// The values are stored in a serialized format in this file. @@ -82,15 +80,7 @@ pub struct InMemoryLayerInner { impl std::fmt::Debug for InMemoryLayerInner { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("InMemoryLayerInner") - .field("end_lsn", &self.end_lsn) - .finish() - } -} - -impl InMemoryLayerInner { - fn assert_writeable(&self) { - assert!(self.end_lsn.is_none()); + f.debug_struct("InMemoryLayerInner").finish() } } @@ -101,13 +91,21 @@ impl InMemoryLayer { pub fn info(&self) -> InMemoryLayerInfo { let lsn_start = self.start_lsn; - let lsn_end = self.inner.read().unwrap().end_lsn; - match lsn_end { - Some(lsn_end) => InMemoryLayerInfo::Frozen { lsn_start, lsn_end }, - None => InMemoryLayerInfo::Open { lsn_start }, + if let Some(&lsn_end) = self.end_lsn.get() { + InMemoryLayerInfo::Frozen { lsn_start, lsn_end } + } else { + InMemoryLayerInfo::Open { lsn_start } } } + + fn assert_writable(&self) { + assert!(self.end_lsn.get().is_none()); + } + + fn end_lsn_or_max(&self) -> Lsn { + self.end_lsn.get().copied().unwrap_or(Lsn::MAX) + } } #[async_trait::async_trait] @@ -117,14 +115,7 @@ impl Layer for InMemoryLayer { } fn get_lsn_range(&self) -> Range { - let inner = self.inner.read().unwrap(); - - let end_lsn = if let Some(end_lsn) = inner.end_lsn { - end_lsn - } else { - Lsn(u64::MAX) - }; - self.start_lsn..end_lsn + self.start_lsn..self.end_lsn_or_max() } fn is_incremental(&self) -> bool { @@ -134,13 +125,9 @@ impl Layer for InMemoryLayer { /// debugging function to print out the contents of the layer async fn dump(&self, verbose: bool, _ctx: &RequestContext) -> Result<()> { - let inner = self.inner.read().unwrap(); + let inner = self.inner.read().await; - let end_str = inner - .end_lsn - .as_ref() - .map(Lsn::to_string) - .unwrap_or_default(); + let end_str = self.end_lsn_or_max(); println!( "----- in-memory layer for tli {} LSNs {}-{} ----", @@ -156,7 +143,7 @@ impl Layer for InMemoryLayer { for (key, vec_map) in inner.index.iter() { for (lsn, pos) in vec_map.as_slice() { let mut desc = String::new(); - cursor.read_blob_into_buf(*pos, &mut buf)?; + cursor.read_blob_into_buf(*pos, &mut buf).await?; let val = Value::des(&buf); match val { Ok(Value::Image(img)) => { @@ -194,7 +181,7 @@ impl Layer for InMemoryLayer { ensure!(lsn_range.start >= 
self.start_lsn); let mut need_image = true; - let inner = self.inner.read().unwrap(); + let inner = self.inner.read().await; let reader = inner.file.block_cursor(); @@ -202,7 +189,7 @@ impl Layer for InMemoryLayer { if let Some(vec_map) = inner.index.get(&key) { let slice = vec_map.slice_range(lsn_range); for (entry_lsn, pos) in slice.iter().rev() { - let buf = reader.read_blob(*pos)?; + let buf = reader.read_blob(*pos).await?; let value = Value::des(&buf)?; match value { Value::Image(img) => { @@ -236,9 +223,7 @@ impl Layer for InMemoryLayer { impl std::fmt::Display for InMemoryLayer { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let inner = self.inner.read().unwrap(); - - let end_lsn = inner.end_lsn.unwrap_or(Lsn(u64::MAX)); + let end_lsn = self.end_lsn_or_max(); write!(f, "inmem-{:016X}-{:016X}", self.start_lsn.0, end_lsn.0) } } @@ -247,8 +232,8 @@ impl InMemoryLayer { /// /// Get layer size on the disk /// - pub fn size(&self) -> Result { - let inner = self.inner.read().unwrap(); + pub async fn size(&self) -> Result { + let inner = self.inner.read().await; Ok(inner.file.size) } @@ -270,8 +255,8 @@ impl InMemoryLayer { timeline_id, tenant_id, start_lsn, + end_lsn: OnceLock::new(), inner: RwLock::new(InMemoryLayerInner { - end_lsn: None, index: HashMap::new(), file, }), @@ -282,10 +267,10 @@ impl InMemoryLayer { /// Common subroutine of the public put_wal_record() and put_page_image() functions. /// Adds the page version to the in-memory tree - pub fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> { + pub async fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> { trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn); - let mut inner = self.inner.write().unwrap(); - inner.assert_writeable(); + let mut inner = self.inner.write().await; + self.assert_writable(); let off = { SER_BUFFER.with(|x| -> Result<_> { @@ -316,11 +301,11 @@ impl InMemoryLayer { /// Make the layer non-writeable. Only call once. /// Records the end_lsn for non-dropped layers. /// `end_lsn` is exclusive - pub fn freeze(&self, end_lsn: Lsn) { - let mut inner = self.inner.write().unwrap(); + pub async fn freeze(&self, end_lsn: Lsn) { + let inner = self.inner.write().await; assert!(self.start_lsn < end_lsn); - inner.end_lsn = Some(end_lsn); + self.end_lsn.set(end_lsn).expect("end_lsn set only once"); for vec_map in inner.index.values() { for (lsn, _pos) in vec_map.as_slice() { @@ -332,7 +317,7 @@ impl InMemoryLayer { /// Write this frozen in-memory layer to disk. /// /// Returns a new delta layer with all the same data as this in-memory layer - pub fn write_to_disk(&self) -> Result { + pub async fn write_to_disk(&self) -> Result { // Grab the lock in read-mode. We hold it over the I/O, but because this // layer is not writeable anymore, no one should be trying to acquire the // write lock on it, so we shouldn't block anyone. There's one exception @@ -342,14 +327,16 @@ impl InMemoryLayer { // lock, it will see that it's not writeable anymore and retry, but it // would have to wait until we release it. That race condition is very // rare though, so we just accept the potential latency hit for now. 
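The structural change in this file: `end_lsn` leaves the lock-protected `inner` and becomes a `OnceLock` on the layer itself, so freeze-state questions need no lock at all. A standalone sketch of that set-once pattern, with a toy `FrozenMarker` type and plain `u64` standing in for `Lsn`:

```rust
use std::sync::OnceLock;

struct FrozenMarker {
    start: u64,
    // Empty while the layer is open; set exactly once on freeze.
    end: OnceLock<u64>,
}

impl FrozenMarker {
    fn freeze(&self, end: u64) {
        // A second call panics, matching the "only call once" contract.
        self.end.set(end).expect("end set only once");
    }

    fn range(&self) -> std::ops::Range<u64> {
        // Readers never block: an open layer just reports an unbounded end.
        self.start..self.end.get().copied().unwrap_or(u64::MAX)
    }
}
```

An open layer simply reports an unbounded end, which is the same answer `end_lsn_or_max` above gives.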
- let inner = self.inner.read().unwrap(); + let inner = self.inner.read().await; + + let end_lsn = *self.end_lsn.get().unwrap(); let mut delta_layer_writer = DeltaLayerWriter::new( self.conf, self.timeline_id, self.tenant_id, Key::MIN, - self.start_lsn..inner.end_lsn.unwrap(), + self.start_lsn..end_lsn, )?; let mut buf = Vec::new(); @@ -363,7 +350,7 @@ impl InMemoryLayer { let key = **key; // Write all page versions for (lsn, pos) in vec_map.as_slice() { - cursor.read_blob_into_buf(*pos, &mut buf)?; + cursor.read_blob_into_buf(*pos, &mut buf).await?; let will_init = Value::des(&buf)?.will_init(); delta_layer_writer.put_value_bytes(key, *lsn, &buf, will_init)?; } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 0074933902879..9b7402b7c758a 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -35,7 +35,9 @@ use std::sync::atomic::Ordering as AtomicOrdering; use std::sync::{Arc, Mutex, RwLock, Weak}; use std::time::{Duration, Instant, SystemTime}; -use crate::context::{DownloadBehavior, RequestContext}; +use crate::context::{ + AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder, +}; use crate::tenant::remote_timeline_client::{self, index::LayerFileMetadata}; use crate::tenant::storage_layer::delta_layer::DeltaEntry; use crate::tenant::storage_layer::{ @@ -800,10 +802,15 @@ impl Timeline { .await { Ok((partitioning, lsn)) => { + // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them + let image_ctx = RequestContextBuilder::extend(ctx) + .access_stats_behavior(AccessStatsBehavior::Skip) + .build(); + // 2. Create new image layers for partitions that have been modified // "enough". let layer_paths_to_upload = self - .create_image_layers(&partitioning, lsn, false, ctx) + .create_image_layers(&partitioning, lsn, false, &image_ctx) .await .map_err(anyhow::Error::from)?; if let Some(remote_client) = &self.remote_client { @@ -876,7 +883,7 @@ impl Timeline { let Some(open_layer) = layers.open_layer.as_ref() else { return Ok(()); }; - open_layer.size()? + open_layer.size().await? }; let last_freeze_at = self.last_freeze_at.load(); let last_freeze_ts = *(self.last_freeze_ts.read().unwrap()); @@ -920,7 +927,7 @@ impl Timeline { pub fn set_state(&self, new_state: TimelineState) { match (self.current_state(), new_state) { (equal_state_1, equal_state_2) if equal_state_1 == equal_state_2 => { - warn!("Ignoring new state, equal to the existing one: {equal_state_2:?}"); + info!("Ignoring new state, equal to the existing one: {equal_state_2:?}"); } (st, TimelineState::Loading) => { error!("ignoring transition from {st:?} into Loading state"); @@ -2648,7 +2655,7 @@ impl Timeline { async fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> anyhow::Result<()> { //info!("PUT: key {} at {}", key, lsn); let layer = self.get_layer_for_write(lsn).await?; - layer.put_value(key, lsn, val)?; + layer.put_value(key, lsn, val).await?; Ok(()) } @@ -2674,7 +2681,9 @@ impl Timeline { Some(self.write_lock.lock().await) }; let mut guard = self.layers.write().await; - guard.try_freeze_in_memory_layer(self.get_last_record_lsn(), &self.last_freeze_at); + guard + .try_freeze_in_memory_layer(self.get_last_record_lsn(), &self.last_freeze_at) + .await; } /// Layer flusher task's main loop. 
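Because `write_to_disk` is now async while the flush task still runs on a blocking thread, the flush path below drives it through the runtime handle. A hedged sketch of that bridge with placeholder I/O (`do_async_io` and `flush` are made-up names); it assumes a multi-threaded tokio runtime, since `block_on` must never run on an executor thread:

```rust
use tokio::runtime::Handle;

// Placeholder for the real async write-out.
async fn do_async_io() -> anyhow::Result<u64> {
    Ok(42)
}

async fn flush() -> anyhow::Result<u64> {
    // Capture the runtime handle while still on an executor thread.
    let handle = Handle::current();
    // spawn_blocking moves the work to a dedicated blocking thread, where
    // it is safe to block on a future via the captured handle.
    tokio::task::spawn_blocking(move || handle.block_on(do_async_io())).await?
}
```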
@@ -2956,7 +2965,11 @@ impl Timeline {
             let frozen_layer = Arc::clone(frozen_layer);
             move || {
                 // Write it out
-                let new_delta = frozen_layer.write_to_disk()?;
+                // Keep this inside `spawn_blocking` and `Handle::current`
+                // as long as the write path is still sync and the read impl
+                // is still not fully async. Otherwise executor threads would
+                // be blocked.
+                let new_delta = Handle::current().block_on(frozen_layer.write_to_disk())?;
                 let new_delta_path = new_delta.path();
 
                 // Sync it to disk.
@@ -3656,101 +3669,108 @@ impl Timeline {
         let mut key_values_total_size = 0u64;
         let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key
         let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key
-        for &DeltaEntry {
-            key, lsn, ref val, ..
-        } in all_values_iter
-        {
-            let value = val.load()?;
-            let same_key = prev_key.map_or(false, |prev_key| prev_key == key);
-            // We need to check key boundaries once we reach next key or end of layer with the same key
-            if !same_key || lsn == dup_end_lsn {
-                let mut next_key_size = 0u64;
-                let is_dup_layer = dup_end_lsn.is_valid();
-                dup_start_lsn = Lsn::INVALID;
-                if !same_key {
-                    dup_end_lsn = Lsn::INVALID;
-                }
-                // Determine size occupied by this key. We stop at next key or when size becomes larger than target_file_size
-                for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() {
-                    next_key_size = next_size;
-                    if key != next_key {
-                        if dup_end_lsn.is_valid() {
-                            // We are writting segment with duplicates:
-                            // place all remaining values of this key in separate segment
-                            dup_start_lsn = dup_end_lsn; // new segments starts where old stops
-                            dup_end_lsn = lsn_range.end; // there are no more values of this key till end of LSN range
+
+        // TODO remove this block_on wrapper once we fully go async
+        Handle::current().block_on(async {
+            for &DeltaEntry {
+                key, lsn, ref val, ..
+            } in all_values_iter
+            {
+                let value = val.load().await?;
+                let same_key = prev_key.map_or(false, |prev_key| prev_key == key);
+                // We need to check key boundaries once we reach next key or end of layer with the same key
+                if !same_key || lsn == dup_end_lsn {
+                    let mut next_key_size = 0u64;
+                    let is_dup_layer = dup_end_lsn.is_valid();
+                    dup_start_lsn = Lsn::INVALID;
+                    if !same_key {
+                        dup_end_lsn = Lsn::INVALID;
+                    }
+                    // Determine size occupied by this key. We stop at next key or when size becomes larger than target_file_size
+                    for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() {
+                        next_key_size = next_size;
+                        if key != next_key {
+                            if dup_end_lsn.is_valid() {
+                                // We are writing a segment with duplicates:
+                                // place all remaining values of this key in a separate segment
+                                dup_start_lsn = dup_end_lsn; // the new segment starts where the old one stops
+                                dup_end_lsn = lsn_range.end; // there are no more values of this key till end of LSN range
+                            }
+                            break;
+                        }
+                        key_values_total_size += next_size;
+                        // Check if it is time to split segment: if total keys size is larger than target file size.
+                        // We need to avoid generation of empty segments if next_size > target_file_size.
+ if key_values_total_size > target_file_size && lsn != next_lsn { + // Split key between multiple layers: such layer can contain only single key + dup_start_lsn = if dup_end_lsn.is_valid() { + dup_end_lsn // new segment with duplicates starts where old one stops + } else { + lsn // start with the first LSN for this key + }; + dup_end_lsn = next_lsn; // upper LSN boundary is exclusive + break; } - break; } - key_values_total_size += next_size; - // Check if it is time to split segment: if total keys size is larger than target file size. - // We need to avoid generation of empty segments if next_size > target_file_size. - if key_values_total_size > target_file_size && lsn != next_lsn { - // Split key between multiple layers: such layer can contain only single key - dup_start_lsn = if dup_end_lsn.is_valid() { - dup_end_lsn // new segment with duplicates starts where old one stops - } else { - lsn // start with the first LSN for this key - }; - dup_end_lsn = next_lsn; // upper LSN boundary is exclusive - break; + // handle case when loop reaches last key: in this case dup_end is non-zero but dup_start is not set. + if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() { + dup_start_lsn = dup_end_lsn; + dup_end_lsn = lsn_range.end; } - } - // handle case when loop reaches last key: in this case dup_end is non-zero but dup_start is not set. - if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() { - dup_start_lsn = dup_end_lsn; - dup_end_lsn = lsn_range.end; - } - if writer.is_some() { - let written_size = writer.as_mut().unwrap().size(); - let contains_hole = - next_hole < holes.len() && key >= holes[next_hole].key_range.end; - // check if key cause layer overflow or contains hole... - if is_dup_layer - || dup_end_lsn.is_valid() - || written_size + key_values_total_size > target_file_size - || contains_hole - { - // ... if so, flush previous layer and prepare to write new one - new_layers.push(Arc::new( - writer.take().unwrap().finish(prev_key.unwrap().next())?, - )); - writer = None; - - if contains_hole { - // skip hole - next_hole += 1; + if writer.is_some() { + let written_size = writer.as_mut().unwrap().size(); + let contains_hole = + next_hole < holes.len() && key >= holes[next_hole].key_range.end; + // check if key cause layer overflow or contains hole... + if is_dup_layer + || dup_end_lsn.is_valid() + || written_size + key_values_total_size > target_file_size + || contains_hole + { + // ... 
if so, flush previous layer and prepare to write new one
+                        new_layers.push(Arc::new(
+                            writer.take().unwrap().finish(prev_key.unwrap().next())?,
+                        ));
+                        writer = None;
+
+                        if contains_hole {
+                            // skip hole
+                            next_hole += 1;
+                        }
                     }
                 }
+                // Remember size of key value because at next iteration we will access next item
+                key_values_total_size = next_key_size;
+            }
+            if writer.is_none() {
+                // Create writer if not initialized yet
+                writer = Some(DeltaLayerWriter::new(
+                    self.conf,
+                    self.timeline_id,
+                    self.tenant_id,
+                    key,
+                    if dup_end_lsn.is_valid() {
+                        // this is a layer containing a slice of values of the same key
+                        debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn);
+                        dup_start_lsn..dup_end_lsn
+                    } else {
+                        debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
+                        lsn_range.clone()
+                    },
+                )?);
            }
-            // Remember size of key value because at next iteration we will access next item
-            key_values_total_size = next_key_size;
-        }
-        if writer.is_none() {
-            // Create writer if not initiaized yet
-            writer = Some(DeltaLayerWriter::new(
-                self.conf,
-                self.timeline_id,
-                self.tenant_id,
-                key,
-                if dup_end_lsn.is_valid() {
-                    // this is a layer containing slice of values of the same key
-                    debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn);
-                    dup_start_lsn..dup_end_lsn
-                } else {
-                    debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
-                    lsn_range.clone()
-                },
-            )?);
-        }
-        fail_point!("delta-layer-writer-fail-before-finish", |_| {
-            Err(anyhow::anyhow!("failpoint delta-layer-writer-fail-before-finish").into())
-        });
+            fail_point!("delta-layer-writer-fail-before-finish", |_| {
+                Result::<_>::Err(anyhow::anyhow!(
+                    "failpoint delta-layer-writer-fail-before-finish"
+                ))
+            });
-            writer.as_mut().unwrap().put_value(key, lsn, value)?;
-            prev_key = Some(key);
-        }
+            writer.as_mut().unwrap().put_value(key, lsn, value)?;
+            prev_key = Some(key);
+            }
+            Ok(())
+        })?;
     if let Some(writer) = writer {
         new_layers.push(Arc::new(writer.finish(prev_key.unwrap().next())?));
     }
diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs
index d99ca2d2927c2..dba6475c27587 100644
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -219,27 +219,13 @@ async fn delete_local_layer_files(
             }
         };
 
-        let r = if metadata.is_dir() {
-            // There shouldnt be any directories inside timeline dir as of current layout.
+        if metadata.is_dir() {
+            warn!(path=%entry.path().display(), "unexpected directory under timeline dir");
             tokio::fs::remove_dir(entry.path()).await
         } else {
             tokio::fs::remove_file(entry.path()).await
-        };
-
-        if let Err(e) = r {
-            if e.kind() == std::io::ErrorKind::NotFound {
-                warn!(
-                    timeline_dir=?local_timeline_directory,
-                    path=?entry.path().display(),
-                    "got not found err while removing timeline dir, proceeding anyway"
-                );
-                continue;
-            }
-            anyhow::bail!(anyhow::anyhow!(
-                "Failed to remove: {}. Error: {e}",
-                entry.path().display()
-            ));
         }
+        .with_context(|| format!("Failed to remove: {}", entry.path().display()))?;
     }
 
     info!("finished deleting layer files, releasing layer_removal_cs.lock()");
@@ -359,10 +345,11 @@ impl DeleteTimelineFlow {
     // NB: If this fails half-way through, and is retried, the retry will go through
     // all the same steps again. Make sure the code here is idempotent, and don't
     // error out if some of the shutdown tasks have already been completed!
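The NB comment above is the contract every deletion step has to meet: when the whole flow is retried, a step that already ran must succeed again. A small sketch of one retry-safe step, using a hypothetical `remove_if_exists` helper:

```rust
use std::io;
use std::path::Path;

// Removing a file is retry-safe if "already gone" counts as success.
async fn remove_if_exists(path: &Path) -> io::Result<()> {
    match tokio::fs::remove_file(path).await {
        Ok(()) => Ok(()),
        // A previous, interrupted attempt may have removed it already.
        Err(e) if e.kind() == io::ErrorKind::NotFound => Ok(()),
        Err(e) => Err(e),
    }
}
```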
-    #[instrument(skip_all, fields(tenant_id=%tenant.tenant_id, %timeline_id))]
+    #[instrument(skip(tenant), fields(tenant_id=%tenant.tenant_id))]
     pub async fn run(
         tenant: &Arc<Tenant>,
         timeline_id: TimelineId,
+        inplace: bool,
     ) -> Result<(), DeleteTimelineError> {
         let (timeline, mut guard) = Self::prepare(tenant, timeline_id)?;
 
@@ -380,7 +367,11 @@ impl DeleteTimelineFlow {
             ))?
         });
 
-        Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);
+        if inplace {
+            Self::background(guard, tenant.conf, tenant, &timeline).await?
+        } else {
+            Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);
+        }
 
         Ok(())
     }
@@ -398,6 +389,8 @@ impl DeleteTimelineFlow {
     }
 
     /// Shortcut to create Timeline in stopping state and spawn deletion task.
+    /// See corresponding parts of [`crate::tenant::delete::DeleteTenantFlow`]
+    #[instrument(skip_all, fields(%timeline_id))]
     pub async fn resume_deletion(
         tenant: Arc<Tenant>,
         timeline_id: TimelineId,
@@ -444,11 +437,15 @@ impl DeleteTimelineFlow {
         Ok(())
     }
 
+    #[instrument(skip_all, fields(%timeline_id))]
     pub async fn cleanup_remaining_timeline_fs_traces(
         tenant: &Tenant,
         timeline_id: TimelineId,
     ) -> anyhow::Result<()> {
-        cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_id, timeline_id).await
+        let r =
+            cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_id, timeline_id).await;
+        info!("Done");
+        r
     }
 
     fn prepare(
@@ -494,11 +491,17 @@ impl DeleteTimelineFlow {
         // At the end of the operation we're holding the guard and need to lock timelines map
         // to remove the timeline from it.
         // Always if you have two locks that are taken in different order this can result in a deadlock.
-        let delete_lock_guard = DeletionGuard(
-            Arc::clone(&timeline.delete_progress)
-                .try_lock_owned()
-                .map_err(|_| DeleteTimelineError::AlreadyInProgress)?,
-        );
+
+        let delete_progress = Arc::clone(&timeline.delete_progress);
+        let delete_lock_guard = match delete_progress.try_lock_owned() {
+            Ok(guard) => DeletionGuard(guard),
+            Err(_) => {
+                // Unfortunately, try_lock_owned consumes the Arc on failure.
+                return Err(DeleteTimelineError::AlreadyInProgress(Arc::clone(
+                    &timeline.delete_progress,
+                )));
+            }
+        };
 
         timeline.set_state(TimelineState::Stopping);
 
@@ -553,10 +556,14 @@ impl DeleteTimelineFlow {
 
         remove_timeline_from_tenant(tenant, timeline.timeline_id, &guard).await?;
 
-        *guard.0 = Self::Finished;
+        *guard = Self::Finished;
 
         Ok(())
     }
+
+    pub(crate) fn is_finished(&self) -> bool {
+        matches!(self, Self::Finished)
+    }
 }
 
 struct DeletionGuard(OwnedMutexGuard<DeleteTimelineFlow>);
diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs
index 824d869bec275..40212265cf2d0 100644
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -163,7 +163,7 @@ impl LayerManager {
     }
 
     /// Called from `freeze_inmem_layer`, returns true if successfully frozen.
-    pub fn try_freeze_in_memory_layer(
+    pub async fn try_freeze_in_memory_layer(
         &mut self,
         Lsn(last_record_lsn): Lsn,
         last_freeze_at: &AtomicLsn,
@@ -173,7 +173,7 @@ impl LayerManager {
         if let Some(open_layer) = &self.layer_map.open_layer {
             let open_layer_rc = Arc::clone(open_layer);
             // Does this layer need freezing?
-            open_layer.freeze(end_lsn);
+            open_layer.freeze(end_lsn).await;
 
             // The layer is no longer open, update the layer map to reflect this.
             // We will replace it with on-disk historics below.
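Both the timeline and tenant deletion flows use the single-flight trick visible in `prepare` above: an `Arc`'d tokio mutex whose owned guard marks a deletion in progress, acquired with `try_lock_owned` so a concurrent caller fails fast instead of queueing. A reduced sketch with a toy progress enum and a string error standing in for the real types:

```rust
use std::sync::Arc;
use tokio::sync::{Mutex, OwnedMutexGuard};

enum Progress {
    NotStarted,
    Finished,
}

struct DeletionGuard(OwnedMutexGuard<Progress>);

fn try_start(progress: &Arc<Mutex<Progress>>) -> Result<DeletionGuard, &'static str> {
    // try_lock_owned takes the Arc by value and consumes it on failure,
    // which is why the flow above clones the Arc before trying.
    Arc::clone(progress)
        .try_lock_owned()
        .map(DeletionGuard)
        .map_err(|_| "delete already in progress")
}
```

Holding the guard across the whole flow means the lock doubles as the progress record: whoever owns it may freely write `Finished` through it at the end.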
diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index e96aa41da0b79..d52b29f1756ef 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -31,8 +31,10 @@ use storage_broker::Streaming; use tokio::select; use tracing::*; -use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}; use postgres_connection::{parse_host_port, PgConnectionConfig}; +use utils::backoff::{ + exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, +}; use utils::{ id::{NodeId, TenantTimelineId}, lsn::Lsn, diff --git a/pgxn/neon/walproposer_utils.c b/pgxn/neon/walproposer_utils.c index e1dcaa081d04f..9e1fc11756a20 100644 --- a/pgxn/neon/walproposer_utils.c +++ b/pgxn/neon/walproposer_utils.c @@ -37,68 +37,14 @@ static XLogSegNo walpropSegNo = 0; /* START cloned file-local variables and functions from walsender.c */ -/* - * xlogreader used for replication. Note that a WAL sender doing physical - * replication does not need xlogreader to read WAL, but it needs one to - * keep a state of its work. - */ -static XLogReaderState *xlogreader = NULL; - -/* - * These variables keep track of the state of the timeline we're currently - * sending. sendTimeLine identifies the timeline. If sendTimeLineIsHistoric, - * the timeline is not the latest timeline on this server, and the server's - * history forked off from that timeline at sendTimeLineValidUpto. - */ -static TimeLineID sendTimeLine = 0; -static TimeLineID sendTimeLineNextTLI = 0; -static bool sendTimeLineIsHistoric = false; -static XLogRecPtr sendTimeLineValidUpto = InvalidXLogRecPtr; - -/* - * Timestamp of last ProcessRepliesIfAny() that saw a reply from the - * standby. Set to 0 if wal_sender_timeout doesn't need to be active. - */ -static TimestampTz last_reply_timestamp = 0; - -/* Have we sent a heartbeat message asking for reply, since last reply? */ -static bool waiting_for_ping_response = false; - -static bool streamingDoneSending; -static bool streamingDoneReceiving; - -/* Are we there yet? */ -static bool WalSndCaughtUp = false; - -/* Flags set by signal handlers for later service in main loop */ -static volatile sig_atomic_t got_STOPPING = false; - /* * How far have we sent WAL already? This is also advertised in * MyWalSnd->sentPtr. (Actually, this is the next WAL location to send.) */ static XLogRecPtr sentPtr = InvalidXLogRecPtr; -/* - * This is set while we are streaming. When not set - * PROCSIG_WALSND_INIT_STOPPING signal will be handled like SIGTERM. When set, - * the main loop is responsible for checking got_STOPPING and terminating when - * it's set (after streaming any remaining WAL). 
- */ -static volatile sig_atomic_t replication_active = false; - -typedef void (*WalSndSendDataCallback) (void); -static void WalSndLoop(WalSndSendDataCallback send_data); -static void XLogSendPhysical(void); -#if PG_VERSION_NUM >= 150000 -static XLogRecPtr GetStandbyFlushRecPtr(TimeLineID *tli); -#else -static XLogRecPtr GetStandbyFlushRecPtr(void); -#endif - -static void WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo, - TimeLineID *tli_p); - +static void WalSndLoop(void); +static void XLogBroadcastWalProposer(void); /* END cloned file-level variables and functions from walsender.c */ int @@ -506,7 +452,7 @@ XLogWalPropClose(XLogRecPtr recptr) /* START of cloned functions from walsender.c */ /* - * Handle START_REPLICATION command. + * Subscribe for new WAL and stream it in the loop to safekeepers. * * At the moment, this never returns, but an ereport(ERROR) will take us back * to the main loop. @@ -524,18 +470,6 @@ StartProposerReplication(StartReplicationCmd *cmd) errmsg("IDENTIFY_SYSTEM has not been run before START_REPLICATION"))); #endif - /* create xlogreader for physical replication */ - xlogreader = - XLogReaderAllocate(wal_segment_size, NULL, - XL_ROUTINE(.segment_open = WalSndSegmentOpen, - .segment_close = wal_segment_close), - NULL); - - if (!xlogreader) - ereport(ERROR, - (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of memory"))); - /* * We assume here that we're logging enough information in the WAL for * log-shipping, since this is checked in PostmasterMain(). @@ -569,341 +503,61 @@ StartProposerReplication(StartReplicationCmd *cmd) * we keep this code around to lighten the load for when we need it. */ #if PG_VERSION_NUM >= 150000 - if (am_cascading_walsender) - { - /* this also updates ThisTimeLineID */ - FlushPtr = GetStandbyFlushRecPtr(&currTLI); - } - else - FlushPtr = GetFlushRecPtr(&currTLI); + FlushPtr = GetFlushRecPtr(&currTLI); #else - if (am_cascading_walsender) - { - /* this also updates ThisTimeLineID */ - FlushPtr = GetStandbyFlushRecPtr(); - } - else - FlushPtr = GetFlushRecPtr(); - + FlushPtr = GetFlushRecPtr(); currTLI = ThisTimeLineID; #endif + /* + * When we first start replication the standby will be behind the + * primary. For some applications, for example synchronous + * replication, it is important to have a clear state for this initial + * catchup mode, so we can trigger actions when we change streaming + * state later. We may stay in this state for a long time, which is + * exactly why we want to be able to monitor whether or not we are + * still here. + */ + WalSndSetState(WALSNDSTATE_CATCHUP); - if (cmd->timeline != 0) - { - XLogRecPtr switchpoint; - - sendTimeLine = cmd->timeline; - if (sendTimeLine == currTLI) - { - sendTimeLineIsHistoric = false; - sendTimeLineValidUpto = InvalidXLogRecPtr; - } - else - { - List *timeLineHistory; - - sendTimeLineIsHistoric = true; - - /* - * Check that the timeline the client requested exists, and the - * requested start location is on that timeline. - */ - timeLineHistory = readTimeLineHistory(currTLI); - switchpoint = tliSwitchPoint(cmd->timeline, timeLineHistory, - &sendTimeLineNextTLI); - list_free_deep(timeLineHistory); - - /* - * Found the requested timeline in the history. Check that - * requested startpoint is on that timeline in our history. - * - * This is quite loose on purpose. We only check that we didn't - * fork off the requested timeline before the switchpoint. We - * don't check that we switched *to* it before the requested - * starting point. 
This is because the client can legitimately - * request to start replication from the beginning of the WAL - * segment that contains switchpoint, but on the new timeline, so - * that it doesn't end up with a partial segment. If you ask for - * too old a starting point, you'll get an error later when we - * fail to find the requested WAL segment in pg_wal. - * - * XXX: we could be more strict here and only allow a startpoint - * that's older than the switchpoint, if it's still in the same - * WAL segment. - */ - if (!XLogRecPtrIsInvalid(switchpoint) && - switchpoint < cmd->startpoint) - { - ereport(ERROR, - (errmsg("requested starting point %X/%X on timeline %u is not in this server's history", - LSN_FORMAT_ARGS(cmd->startpoint), - cmd->timeline), - errdetail("This server's history forked from timeline %u at %X/%X.", - cmd->timeline, - LSN_FORMAT_ARGS(switchpoint)))); - } - sendTimeLineValidUpto = switchpoint; - } - } - else + /* + * Don't allow a request to stream from a future point in WAL that + * hasn't been flushed to disk in this server yet. + */ + if (FlushPtr < cmd->startpoint) { - sendTimeLine = currTLI; - sendTimeLineValidUpto = InvalidXLogRecPtr; - sendTimeLineIsHistoric = false; + ereport(ERROR, + (errmsg("requested starting point %X/%X is ahead of the WAL flush position of this server %X/%X", + LSN_FORMAT_ARGS(cmd->startpoint), + LSN_FORMAT_ARGS(FlushPtr)))); } - streamingDoneSending = streamingDoneReceiving = false; + /* Start streaming from the requested point */ + sentPtr = cmd->startpoint; - /* If there is nothing to stream, don't even enter COPY mode */ - if (!sendTimeLineIsHistoric || cmd->startpoint < sendTimeLineValidUpto) - { - /* - * When we first start replication the standby will be behind the - * primary. For some applications, for example synchronous - * replication, it is important to have a clear state for this initial - * catchup mode, so we can trigger actions when we change streaming - * state later. We may stay in this state for a long time, which is - * exactly why we want to be able to monitor whether or not we are - * still here. - */ - WalSndSetState(WALSNDSTATE_CATCHUP); - - /* - * Don't allow a request to stream from a future point in WAL that - * hasn't been flushed to disk in this server yet. - */ - if (FlushPtr < cmd->startpoint) - { - ereport(ERROR, - (errmsg("requested starting point %X/%X is ahead of the WAL flush position of this server %X/%X", - LSN_FORMAT_ARGS(cmd->startpoint), - LSN_FORMAT_ARGS(FlushPtr)))); - } + /* Initialize shared memory status, too */ + SpinLockAcquire(&MyWalSnd->mutex); + MyWalSnd->sentPtr = sentPtr; + SpinLockRelease(&MyWalSnd->mutex); - /* Start streaming from the requested point */ - sentPtr = cmd->startpoint; + SyncRepInitConfig(); - /* Initialize shared memory status, too */ - SpinLockAcquire(&MyWalSnd->mutex); - MyWalSnd->sentPtr = sentPtr; - SpinLockRelease(&MyWalSnd->mutex); + /* Infinite send loop, never returns */ + WalSndLoop(); - SyncRepInitConfig(); - - /* Main loop of walsender */ - replication_active = true; - - WalSndLoop(XLogSendPhysical); - - replication_active = false; - if (got_STOPPING) - proc_exit(0); - WalSndSetState(WALSNDSTATE_STARTUP); - - Assert(streamingDoneSending && streamingDoneReceiving); - } + WalSndSetState(WALSNDSTATE_STARTUP); if (cmd->slotname) ReplicationSlotRelease(); - - /* - * Copy is finished now. Send a single-row result set indicating the next - * timeline. 
- */ - if (sendTimeLineIsHistoric) - { - char startpos_str[8 + 1 + 8 + 1]; - DestReceiver *dest; - TupOutputState *tstate; - TupleDesc tupdesc; - Datum values[2]; - bool nulls[2]; - - snprintf(startpos_str, sizeof(startpos_str), "%X/%X", - LSN_FORMAT_ARGS(sendTimeLineValidUpto)); - - dest = CreateDestReceiver(DestRemoteSimple); - MemSet(nulls, false, sizeof(nulls)); - - /* - * Need a tuple descriptor representing two columns. int8 may seem - * like a surprising data type for this, but in theory int4 would not - * be wide enough for this, as TimeLineID is unsigned. - */ - tupdesc = CreateTemplateTupleDesc(2); - TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 1, "next_tli", - INT8OID, -1, 0); - TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 2, "next_tli_startpos", - TEXTOID, -1, 0); - - /* prepare for projection of tuple */ - tstate = begin_tup_output_tupdesc(dest, tupdesc, &TTSOpsVirtual); - - values[0] = Int64GetDatum((int64) sendTimeLineNextTLI); - values[1] = CStringGetTextDatum(startpos_str); - - /* send it to dest */ - do_tup_output(tstate, values, nulls); - - end_tup_output(tstate); - } - - /* Send CommandComplete message */ - EndReplicationCommand("START_STREAMING"); } -#if PG_VERSION_NUM >= 150000 -static XLogRecPtr -GetStandbyFlushRecPtr(TimeLineID *tli) -{ - XLogRecPtr replayPtr; - TimeLineID replayTLI; - XLogRecPtr receivePtr; - TimeLineID receiveTLI; - XLogRecPtr result; - - /* - * We can safely send what's already been replayed. Also, if walreceiver - * is streaming WAL from the same timeline, we can send anything that it - * has streamed, but hasn't been replayed yet. - */ - - receivePtr = GetWalRcvFlushRecPtr(NULL, &receiveTLI); - replayPtr = GetXLogReplayRecPtr(&replayTLI); - - *tli = replayTLI; - - result = replayPtr; - if (receiveTLI == replayTLI && receivePtr > replayPtr) - result = receivePtr; - - return result; -} -#else /* - * Returns the latest point in WAL that has been safely flushed to disk, and - * can be sent to the standby. This should only be called when in recovery, - * ie. we're streaming to a cascaded standby. - * - * As a side-effect, ThisTimeLineID is updated to the TLI of the last - * replayed WAL record. + * Main loop that waits for LSN updates and calls the walproposer. + * Synchronous replication sets latch in WalSndWakeup at walsender.c */ -static XLogRecPtr -GetStandbyFlushRecPtr(void) -{ - XLogRecPtr replayPtr; - TimeLineID replayTLI; - XLogRecPtr receivePtr; - TimeLineID receiveTLI; - XLogRecPtr result; - - /* - * We can safely send what's already been replayed. Also, if walreceiver - * is streaming WAL from the same timeline, we can send anything that it - * has streamed, but hasn't been replayed yet. - */ - - receivePtr = GetWalRcvFlushRecPtr(NULL, &receiveTLI); - replayPtr = GetXLogReplayRecPtr(&replayTLI); - - ThisTimeLineID = replayTLI; - - result = replayPtr; - if (receiveTLI == ThisTimeLineID && receivePtr > replayPtr) - result = receivePtr; - - return result; -} -#endif - - - -/* XLogReaderRoutine->segment_open callback */ -static void -WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo, - TimeLineID *tli_p) -{ - char path[MAXPGPATH]; - - /*------- - * When reading from a historic timeline, and there is a timeline switch - * within this segment, read from the WAL segment belonging to the new - * timeline. - * - * For example, imagine that this server is currently on timeline 5, and - * we're streaming timeline 4. The switch from timeline 4 to 5 happened at - * 0/13002088. In pg_wal, we have these files: - * - * ... 
- * 000000040000000000000012 - * 000000040000000000000013 - * 000000050000000000000013 - * 000000050000000000000014 - * ... - * - * In this situation, when requested to send the WAL from segment 0x13, on - * timeline 4, we read the WAL from file 000000050000000000000013. Archive - * recovery prefers files from newer timelines, so if the segment was - * restored from the archive on this server, the file belonging to the old - * timeline, 000000040000000000000013, might not exist. Their contents are - * equal up to the switchpoint, because at a timeline switch, the used - * portion of the old segment is copied to the new file. ------- - */ - *tli_p = sendTimeLine; - if (sendTimeLineIsHistoric) - { - XLogSegNo endSegNo; - - XLByteToSeg(sendTimeLineValidUpto, endSegNo, state->segcxt.ws_segsize); - if (nextSegNo == endSegNo) - *tli_p = sendTimeLineNextTLI; - } - - XLogFilePath(path, *tli_p, nextSegNo, state->segcxt.ws_segsize); - state->seg.ws_file = BasicOpenFile(path, O_RDONLY | PG_BINARY); - if (state->seg.ws_file >= 0) - return; - - /* - * If the file is not found, assume it's because the standby asked for a - * too old WAL segment that has already been removed or recycled. - */ - if (errno == ENOENT) - { - char xlogfname[MAXFNAMELEN]; - int save_errno = errno; - - XLogFileName(xlogfname, *tli_p, nextSegNo, wal_segment_size); - errno = save_errno; - ereport(ERROR, - (errcode_for_file_access(), - errmsg("requested WAL segment %s has already been removed", - xlogfname))); - } - else - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not open file \"%s\": %m", - path))); -} - - -/* Main loop of walsender process that streams the WAL over Copy messages. */ static void -WalSndLoop(WalSndSendDataCallback send_data) +WalSndLoop(void) { - /* - * Initialize the last reply timestamp. That enables timeout processing - * from hereon. - */ - last_reply_timestamp = GetCurrentTimestamp(); - waiting_for_ping_response = false; - - /* - * Loop until we reach the end of this timeline or the client requests to - * stop streaming. - */ for (;;) { /* Clear any already-pending wakeups */ @@ -911,153 +565,41 @@ WalSndLoop(WalSndSendDataCallback send_data) CHECK_FOR_INTERRUPTS(); - /* Process any requests or signals received recently */ - if (ConfigReloadPending) - { - ConfigReloadPending = false; - ProcessConfigFile(PGC_SIGHUP); - SyncRepInitConfig(); - } + XLogBroadcastWalProposer(); - /* always true */ - if (am_wal_proposer) - { - send_data(); - if (WalSndCaughtUp) - { - if (MyWalSnd->state == WALSNDSTATE_CATCHUP) - WalSndSetState(WALSNDSTATE_STREAMING); - WalProposerPoll(); - WalSndCaughtUp = false; - } - continue; - } + if (MyWalSnd->state == WALSNDSTATE_CATCHUP) + WalSndSetState(WALSNDSTATE_STREAMING); + WalProposerPoll(); } } /* - * Send out the WAL in its normal physical/stored form. - * - * Read up to MAX_SEND_SIZE bytes of WAL that's been flushed to disk, - * but not yet sent to the client, and buffer it in the libpq output - * buffer. - * - * If there is no unsent WAL remaining, WalSndCaughtUp is set to true, - * otherwise WalSndCaughtUp is set to false. + * Notify walproposer about the new WAL position. */ static void -XLogSendPhysical(void) +XLogBroadcastWalProposer(void) { - XLogRecPtr SendRqstPtr; XLogRecPtr startptr; XLogRecPtr endptr; - Size nbytes PG_USED_FOR_ASSERTS_ONLY; - TimeLineID currTLI; - - /* If requested switch the WAL sender to the stopping state. 
*/ - if (got_STOPPING) - WalSndSetState(WALSNDSTATE_STOPPING); - - if (streamingDoneSending) - { - WalSndCaughtUp = true; - return; - } - /* Figure out how far we can safely send the WAL. */ - if (sendTimeLineIsHistoric) - { - /* - * Streaming an old timeline that's in this server's history, but is - * not the one we're currently inserting or replaying. It can be - * streamed up to the point where we switched off that timeline. - */ - SendRqstPtr = sendTimeLineValidUpto; - } - else if (am_cascading_walsender) - { - /* - * Streaming the latest timeline on a standby. - * - * Attempt to send all WAL that has already been replayed, so that we - * know it's valid. If we're receiving WAL through streaming - * replication, it's also OK to send any WAL that has been received - * but not replayed. - * - * The timeline we're recovering from can change, or we can be - * promoted. In either case, the current timeline becomes historic. We - * need to detect that so that we don't try to stream past the point - * where we switched to another timeline. We check for promotion or - * timeline switch after calculating FlushPtr, to avoid a race - * condition: if the timeline becomes historic just after we checked - * that it was still current, it's still be OK to stream it up to the - * FlushPtr that was calculated before it became historic. - */ - bool becameHistoric = false; -#if PG_VERSION_NUM >= 150000 - SendRqstPtr = GetStandbyFlushRecPtr(&currTLI); -#else - SendRqstPtr = GetStandbyFlushRecPtr(); - currTLI = ThisTimeLineID; -#endif - if (!RecoveryInProgress()) - { - /* - * We have been promoted. RecoveryInProgress() updated - * ThisTimeLineID to the new current timeline. - */ - am_cascading_walsender = false; - becameHistoric = true; - } - else - { - /* - * Still a cascading standby. But is the timeline we're sending - * still the one recovery is recovering from? currTLI was updated - * by the GetStandbyFlushRecPtr() call above. - */ - if (sendTimeLine != currTLI) - becameHistoric = true; - } - - if (becameHistoric) - { - /* - * The timeline we were sending has become historic. Read the - * timeline history file of the new timeline to see where exactly - * we forked off from the timeline we were sending. - */ - List *history; - - history = readTimeLineHistory(currTLI); - sendTimeLineValidUpto = tliSwitchPoint(sendTimeLine, history, &sendTimeLineNextTLI); - - Assert(sendTimeLine < sendTimeLineNextTLI); - list_free_deep(history); - - sendTimeLineIsHistoric = true; + /* Start from the last sent position */ + startptr = sentPtr; - SendRqstPtr = sendTimeLineValidUpto; - } - } - else - { - /* - * Streaming the current timeline on a primary. - * - * Attempt to send all data that's already been written out and - * fsync'd to disk. We cannot go further than what's been written out - * given the current implementation of WALRead(). And in any case - * it's unsafe to send WAL that is not securely down to disk on the - * primary: if the primary subsequently crashes and restarts, standbys - * must not have applied any WAL that got lost on the primary. - */ + /* + * Streaming the current timeline on a primary. + * + * Attempt to send all data that's already been written out and + * fsync'd to disk. We cannot go further than what's been written out + * given the current implementation of WALRead(). And in any case + * it's unsafe to send WAL that is not securely down to disk on the + * primary: if the primary subsequently crashes and restarts, standbys + * must not have applied any WAL that got lost on the primary. 
+ */ #if PG_VERSION_NUM >= 150000 - SendRqstPtr = GetFlushRecPtr(NULL); + endptr = GetFlushRecPtr(NULL); #else - SendRqstPtr = GetFlushRecPtr(); + endptr = GetFlushRecPtr(); #endif - } /* * Record the current system time as an approximation of the time at which @@ -1083,91 +625,14 @@ XLogSendPhysical(void) * that arbitrary LSN is eventually reported as written, flushed and * applied, so that it can measure the elapsed time. */ - LagTrackerWrite(SendRqstPtr, GetCurrentTimestamp()); - - /* - * If this is a historic timeline and we've reached the point where we - * forked to the next timeline, stop streaming. - * - * Note: We might already have sent WAL > sendTimeLineValidUpto. The - * startup process will normally replay all WAL that has been received - * from the primary, before promoting, but if the WAL streaming is - * terminated at a WAL page boundary, the valid portion of the timeline - * might end in the middle of a WAL record. We might've already sent the - * first half of that partial WAL record to the cascading standby, so that - * sentPtr > sendTimeLineValidUpto. That's OK; the cascading standby can't - * replay the partial WAL record either, so it can still follow our - * timeline switch. - */ - if (sendTimeLineIsHistoric && sendTimeLineValidUpto <= sentPtr) - { - /* close the current file. */ - if (xlogreader->seg.ws_file >= 0) - wal_segment_close(xlogreader); - - /* Send CopyDone */ - pq_putmessage_noblock('c', NULL, 0); - streamingDoneSending = true; - - WalSndCaughtUp = true; - - elog(DEBUG1, "walsender reached end of timeline at %X/%X (sent up to %X/%X)", - LSN_FORMAT_ARGS(sendTimeLineValidUpto), - LSN_FORMAT_ARGS(sentPtr)); - return; - } + LagTrackerWrite(endptr, GetCurrentTimestamp()); /* Do we have any work to do? */ - Assert(sentPtr <= SendRqstPtr); - if (SendRqstPtr <= sentPtr) - { - WalSndCaughtUp = true; + Assert(startptr <= endptr); + if (endptr <= startptr) return; - } - - /* - * Figure out how much to send in one message. If there's no more than - * MAX_SEND_SIZE bytes to send, send everything. Otherwise send - * MAX_SEND_SIZE bytes, but round back to logfile or page boundary. - * - * The rounding is not only for performance reasons. Walreceiver relies on - * the fact that we never split a WAL record across two messages. Since a - * long WAL record is split at page boundary into continuation records, - * page boundary is always a safe cut-off point. We also assume that - * SendRqstPtr never points to the middle of a WAL record. - */ - startptr = sentPtr; - endptr = startptr; - endptr += MAX_SEND_SIZE; - /* if we went beyond SendRqstPtr, back off */ - if (SendRqstPtr <= endptr) - { - endptr = SendRqstPtr; - if (sendTimeLineIsHistoric) - WalSndCaughtUp = false; - else - WalSndCaughtUp = true; - } - else - { - /* round down to page boundary. 
*/ - endptr -= (endptr % XLOG_BLCKSZ); - WalSndCaughtUp = false; - } - - nbytes = endptr - startptr; - Assert(nbytes <= MAX_SEND_SIZE); - - /* always true */ - if (am_wal_proposer) - { - WalProposerBroadcast(startptr, endptr); - } - else - { - /* code removed for brevity */ - } + WalProposerBroadcast(startptr, endptr); sentPtr = endptr; /* Update shared memory status */ diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 22e766b5f13b9..3f4cee6e3483c 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -8,6 +8,7 @@ use super::{ use crate::{auth::ClientCredentials, compute, http, scram}; use async_trait::async_trait; use futures::TryFutureExt; +use tokio::time::Instant; use tokio_postgres::config::SslMode; use tracing::{error, info, info_span, warn, Instrument}; @@ -47,7 +48,9 @@ impl Api { .build()?; info!(url = request.url().as_str(), "sending http request"); + let start = Instant::now(); let response = self.endpoint.execute(request).await?; + info!(duration = ?start.elapsed(), "received http response"); let body = match parse_body::(response).await { Ok(body) => body, // Error 404 is special: it's ok not to have a secret. @@ -88,7 +91,9 @@ impl Api { .build()?; info!(url = request.url().as_str(), "sending http request"); + let start = Instant::now(); let response = self.endpoint.execute(request).await?; + info!(duration = ?start.elapsed(), "received http response"); let body = parse_body::(response).await?; // Unfortunately, ownership won't let us use `Option::ok_or` here. diff --git a/proxy/src/http.rs b/proxy/src/http.rs index e2812403806d5..755137807bc49 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -7,11 +7,14 @@ pub mod server; pub mod sql_over_http; pub mod websocket; -use std::time::Duration; +use std::{sync::Arc, time::Duration}; +use futures::FutureExt; pub use reqwest::{Request, Response, StatusCode}; pub use reqwest_middleware::{ClientWithMiddleware, Error}; pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; +use tokio::time::Instant; +use tracing::trace; use crate::url::ApiUrl; use reqwest_middleware::RequestBuilder; @@ -20,13 +23,21 @@ use reqwest_middleware::RequestBuilder; /// because it takes care of observability (OpenTelemetry). /// We deliberately don't want to replace this with a public static. pub fn new_client() -> ClientWithMiddleware { - reqwest_middleware::ClientBuilder::new(reqwest::Client::new()) + let client = reqwest::ClientBuilder::new() + .dns_resolver(Arc::new(GaiResolver::default())) + .connection_verbose(true) + .build() + .expect("Failed to create http client"); + + reqwest_middleware::ClientBuilder::new(client) .with(reqwest_tracing::TracingMiddleware::default()) .build() } pub fn new_client_with_timeout(default_timout: Duration) -> ClientWithMiddleware { let timeout_client = reqwest::ClientBuilder::new() + .dns_resolver(Arc::new(GaiResolver::default())) + .connection_verbose(true) .timeout(default_timout) .build() .expect("Failed to create http client with timeout"); @@ -39,6 +50,10 @@ pub fn new_client_with_timeout(default_timout: Duration) -> ClientWithMiddleware // As per docs, "This middleware always errors when given requests with streaming bodies". // That's all right because we only use this client to send `serde_json::RawValue`, which // is not a stream. + // + // ex-maintainer note: + // this limitation can be fixed if streaming is necessary. 
+ // retries will still not be performed, but it won't error immediately .with(RetryTransientMiddleware::new_with_policy(retry_policy)) .build() } @@ -81,6 +96,37 @@ impl Endpoint { } } +/// https://docs.rs/reqwest/0.11.18/src/reqwest/dns/gai.rs.html +use hyper::{ + client::connect::dns::{GaiResolver as HyperGaiResolver, Name}, + service::Service, +}; +use reqwest::dns::{Addrs, Resolve, Resolving}; +#[derive(Debug)] +pub struct GaiResolver(HyperGaiResolver); + +impl Default for GaiResolver { + fn default() -> Self { + Self(HyperGaiResolver::new()) + } +} + +impl Resolve for GaiResolver { + fn resolve(&self, name: Name) -> Resolving { + let this = &mut self.0.clone(); + let start = Instant::now(); + Box::pin( + Service::<Name>::call(this, name.clone()).map(move |result| { + let resolve_duration = start.elapsed(); + trace!(duration = ?resolve_duration, addr = %name, "resolve host complete"); + result + .map(|addrs| -> Addrs { Box::new(addrs) }) + .map_err(|err| -> Box<dyn std::error::Error + Send + Sync> { Box::new(err) }) + }), + ) + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/proxy/src/http/sql_over_http.rs b/proxy/src/http/sql_over_http.rs index a4eb13a10ba07..aa06a91bbae9d 100644 --- a/proxy/src/http/sql_over_http.rs +++ b/proxy/src/http/sql_over_http.rs @@ -27,14 +27,19 @@ struct QueryData { params: Vec<serde_json::Value>, } +#[derive(serde::Deserialize)] +struct BatchQueryData { + queries: Vec<QueryData>, +} + #[derive(serde::Deserialize)] #[serde(untagged)] enum Payload { Single(QueryData), - Batch(Vec<QueryData>), + Batch(BatchQueryData), } -pub const MAX_RESPONSE_SIZE: usize = 1024 * 1024; // 1 MB +pub const MAX_RESPONSE_SIZE: usize = 10 * 1024 * 1024; // 10 MB const MAX_REQUEST_SIZE: u64 = 1024 * 1024; // 1 MB static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output"); @@ -214,7 +219,7 @@ pub async fn handle( if request_content_length > MAX_REQUEST_SIZE { return Err(anyhow::anyhow!( - "request is too large (max {MAX_REQUEST_SIZE} bytes)" + "request is too large (max is {MAX_REQUEST_SIZE} bytes)" )); } @@ -233,7 +238,7 @@ pub async fn handle( Payload::Single(query) => query_to_json(&client, query, raw_output, array_mode) .await .map(|x| (x, HashMap::default())), - Payload::Batch(queries) => { + Payload::Batch(batch_query) => { let mut results = Vec::new(); let mut builder = client.build_transaction(); if let Some(isolation_level) = txn_isolation_level { @@ -243,7 +248,7 @@ pub async fn handle( builder = builder.read_only(true); } let transaction = builder.start().await?; - for query in queries { + for query in batch_query.queries { let result = query_to_json(&transaction, query, raw_output, array_mode).await; match result { Ok(r) => results.push(r), @@ -292,13 +297,15 @@ async fn query_to_json( // big. 
pin_mut!(row_stream); let mut rows: Vec = Vec::new(); - let mut curret_size = 0; + let mut current_size = 0; while let Some(row) = row_stream.next().await { let row = row?; - curret_size += row.body_len(); + current_size += row.body_len(); rows.push(row); - if curret_size > MAX_RESPONSE_SIZE { - return Err(anyhow::anyhow!("response too large")); + if current_size > MAX_RESPONSE_SIZE { + return Err(anyhow::anyhow!( + "response is too large (max is {MAX_RESPONSE_SIZE} bytes)" + )); } } diff --git a/proxy/src/http/websocket.rs b/proxy/src/http/websocket.rs index fec76c74f4943..794da1792940f 100644 --- a/proxy/src/http/websocket.rs +++ b/proxy/src/http/websocket.rs @@ -187,12 +187,16 @@ async fn ws_handler( let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None) .map_err(|e| ApiError::BadRequest(e.into()))?; - tokio::spawn(async move { - if let Err(e) = serve_websocket(websocket, config, &cancel_map, session_id, host).await - { - error!(session_id = ?session_id, "error in websocket connection: {e:?}"); + tokio::spawn( + async move { + if let Err(e) = + serve_websocket(websocket, config, &cancel_map, session_id, host).await + { + error!(session_id = ?session_id, "error in websocket connection: {e:#}"); + } } - }); + .in_current_span(), + ); // Return the response so the spawned future can continue. Ok(response) @@ -217,6 +221,10 @@ async fn ws_handler( }, None => Value::Null, }; + error!( + ?code, + "sql-over-http per-client task finished with an error: {e:#}" + ); ( json!({ "message": message, "code": code }), HashMap::default(), diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index f11075a736cab..136d62b321026 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -2,8 +2,8 @@ //! protocol commands. use anyhow::Context; -use std::str; use std::str::FromStr; +use std::str::{self}; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, info_span, Instrument}; @@ -11,6 +11,7 @@ use crate::auth::check_permission; use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage}; use crate::metrics::{TrafficMetrics, PG_QUERIES_FINISHED, PG_QUERIES_RECEIVED}; +use crate::safekeeper::Term; use crate::timeline::TimelineError; use crate::wal_service::ConnectionId; use crate::{GlobalTimelines, SafeKeeperConf}; @@ -44,7 +45,7 @@ pub struct SafekeeperPostgresHandler { /// Parsed Postgres command. enum SafekeeperPostgresCommand { StartWalPush, - StartReplication { start_lsn: Lsn }, + StartReplication { start_lsn: Lsn, term: Option<Term> }, IdentifySystem, TimelineStatus, JSONCtrl { cmd: AppendLogicalMessage }, @@ -55,15 +56,21 @@ fn parse_cmd(cmd: &str) -> anyhow::Result<SafekeeperPostgresCommand> { Ok(SafekeeperPostgresCommand::StartWalPush) } else if cmd.starts_with("START_REPLICATION") { let re = Regex::new( - r"START_REPLICATION(?: SLOT [^ ]+)?(?: PHYSICAL)? ([[:xdigit:]]+/[[:xdigit:]]+)", + // We follow the postgres START_REPLICATION LOGICAL options format to pass the term. + r"START_REPLICATION(?: SLOT [^ ]+)?(?: PHYSICAL)? 
([[:xdigit:]]+/[[:xdigit:]]+)(?: \(term='(\d+)'\))?", ) .unwrap(); - let mut caps = re.captures_iter(cmd); - let start_lsn = caps - .next() - .map(|cap| Lsn::from_str(&cap[1])) - .context("parse start LSN from START_REPLICATION command")??; - Ok(SafekeeperPostgresCommand::StartReplication { start_lsn }) + let caps = re + .captures(cmd) + .context(format!("failed to parse START_REPLICATION command {}", cmd))?; + let start_lsn = + Lsn::from_str(&caps[1]).context("parse start LSN from START_REPLICATION command")?; + let term = if let Some(m) = caps.get(2) { + Some(m.as_str().parse::<Term>().context("invalid term")?) + } else { + None + }; + Ok(SafekeeperPostgresCommand::StartReplication { start_lsn, term }) } else if cmd.starts_with("IDENTIFY_SYSTEM") { Ok(SafekeeperPostgresCommand::IdentifySystem) } else if cmd.starts_with("TIMELINE_STATUS") { @@ -218,8 +225,8 @@ impl postgres_backend::Handler .instrument(info_span!("WAL receiver", ttid = %span_ttid)) .await } - SafekeeperPostgresCommand::StartReplication { start_lsn } => { - self.handle_start_replication(pgb, start_lsn) + SafekeeperPostgresCommand::StartReplication { start_lsn, term } => { + self.handle_start_replication(pgb, start_lsn, term) .instrument(info_span!("WAL sender", ttid = %span_ttid)) .await } diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index a0d4bb35d709a..808177d2c439a 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -2,6 +2,7 @@ //! with the "START_REPLICATION" message, and registry of walsenders. use crate::handler::SafekeeperPostgresHandler; +use crate::safekeeper::Term; use crate::timeline::Timeline; use crate::wal_service::ConnectionId; use crate::wal_storage::WalReader; @@ -359,8 +360,12 @@ impl SafekeeperPostgresHandler { &mut self, pgb: &mut PostgresBackend, start_pos: Lsn, + term: Option<Term>, ) -> Result<(), QueryError> { - if let Err(end) = self.handle_start_replication_guts(pgb, start_pos).await { + if let Err(end) = self + .handle_start_replication_guts(pgb, start_pos, term) + .await + { // Log the result and probably send it to the client, closing the stream. pgb.handle_copy_stream_end(end).await; } @@ -371,6 +376,7 @@ impl SafekeeperPostgresHandler { &mut self, pgb: &mut PostgresBackend, start_pos: Lsn, + term: Option<Term>, ) -> Result<(), CopyStreamHandlerEnd> { let appname = self.appname.clone(); let tli = @@ -440,6 +446,7 @@ impl SafekeeperPostgresHandler { start_pos, end_pos, stop_pos, + term, commit_lsn_watch_rx, ws_guard: ws_guard.clone(), wal_reader, @@ -476,6 +483,10 @@ struct WalSender<'a, IO> { // If present, terminate after reaching this position; used by walproposer // in recovery. stop_pos: Option<Lsn>, + /// When streaming the uncommitted part, the term in which the client acts + /// as the leader. Streaming is stopped if the local term changes to a + /// different (higher) value. + term: Option<Term>, commit_lsn_watch_rx: Receiver<Lsn>, ws_guard: Arc, wal_reader: WalReader, @@ -518,8 +529,18 @@ impl WalSender<'_, IO> { .0 as usize; send_size = min(send_size, self.send_buf.len()); let send_buf = &mut self.send_buf[..send_size]; - // read wal into buffer - send_size = self.wal_reader.read(send_buf).await?; + let send_size: usize; + { + // If the uncommitted part is being pulled, check that the term is + // still the expected one. + let _term_guard = if let Some(t) = self.term { + Some(self.tli.acquire_term(t).await?) + } else { + None + }; + // read wal into buffer + send_size = self.wal_reader.read(send_buf).await? 
+ }; let send_buf = &send_buf[..send_size]; // and send it diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 9ec89b86d4375..df51c45affbf5 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -499,6 +499,19 @@ impl Timeline { false } + /// Ensure that the current term is t, erroring otherwise, and lock the state. + pub async fn acquire_term(&self, t: Term) -> Result> { + let ss = self.write_shared_state().await; + if ss.sk.state.acceptor_state.term != t { + bail!( + "failed to acquire term {}, current term {}", + t, + ss.sk.state.acceptor_state.term + ); + } + Ok(ss) + } + /// Returns whether s3 offloading is required and sets current status as /// matching it. pub async fn wal_backup_attend(&self) -> bool { diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 7ba979745b922..7c7ca941639d2 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -32,6 +32,7 @@ from _pytest.config import Config from _pytest.config.argparsing import Parser from _pytest.fixtures import FixtureRequest +from mypy_boto3_s3 import S3Client # Type-related stuff from psycopg2.extensions import connection as PgConnection @@ -432,7 +433,7 @@ def __init__( self.port_distributor = port_distributor self.remote_storage = remote_storage self.ext_remote_storage: Optional[S3Storage] = None - self.remote_storage_client: Optional[Any] = None + self.remote_storage_client: Optional[S3Client] = None self.remote_storage_users = remote_storage_users self.broker = broker self.run_id = run_id @@ -875,7 +876,14 @@ def get_safekeeper_connstrs(self) -> str: def timeline_dir(self, tenant_id: TenantId, timeline_id: TimelineId) -> Path: """Get a timeline directory's path based on the repo directory of the test environment""" - return self.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id) + return self.tenant_dir(tenant_id) / "timelines" / str(timeline_id) + + def tenant_dir( + self, + tenant_id: TenantId, + ) -> Path: + """Get a tenant directory's path based on the repo directory of the test environment""" + return self.repo_dir / "tenants" / str(tenant_id) def get_pageserver_version(self) -> str: bin_pageserver = str(self.neon_binpath / "pageserver") @@ -1518,6 +1526,8 @@ def __init__(self, env: NeonEnv, port: PageserverPort, config_override: Optional # Pageserver timeline deletion should be polled until it gets 404, so ignore it globally ".*Error processing HTTP request: NotFound: Timeline .* was not found", ".*took more than expected to complete.*", + # these can happen during shutdown, but they should not be a reason to fail a test + ".*completed, took longer than expected.*", ] def start( @@ -2817,8 +2827,15 @@ def check_restored_datadir_content( endpoint: Endpoint, ): # Get the timeline ID. 
We need it for the 'basebackup' command - timeline = TimelineId(endpoint.safe_psql("SHOW neon.timeline_id")[0][0]) + timeline_id = TimelineId(endpoint.safe_psql("SHOW neon.timeline_id")[0][0]) + + # many tests already checkpoint, but do it just in case + with closing(endpoint.connect()) as conn: + with conn.cursor() as cur: + cur.execute("CHECKPOINT") + + # wait for pageserver to catch up + wait_for_last_flush_lsn(env, endpoint, endpoint.tenant_id, timeline_id) # stop postgres to ensure that files won't change endpoint.stop() @@ -2833,7 +2850,7 @@ def check_restored_datadir_content( {psql_path} \ --no-psqlrc \ postgres://localhost:{env.pageserver.service_port.pg} \ - -c 'basebackup {endpoint.tenant_id} {timeline}' \ + -c 'basebackup {endpoint.tenant_id} {timeline_id}' \ | tar -x -C {restored_dir_path} """ diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 8c053c8073abc..a179ebdd09aeb 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -210,6 +210,10 @@ def tenant_detach(self, tenant_id: TenantId, detach_ignored=False): res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach", params=params) self.verbose_error(res) + def tenant_delete(self, tenant_id: TenantId): + res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}") + self.verbose_error(res) + def tenant_load(self, tenant_id: TenantId): res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/load") self.verbose_error(res) diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index 119c99bb96846..6032ff5b6810f 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -1,9 +1,11 @@ import time -from typing import Any, Dict, Optional +from typing import TYPE_CHECKING, Any, Dict, Optional from fixtures.log_helper import log from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient +from fixtures.remote_storage import RemoteStorageKind, S3Storage from fixtures.types import Lsn, TenantId, TimelineId +from fixtures.utils import wait_until def assert_tenant_state( @@ -17,15 +19,6 @@ def assert_tenant_state( assert tenant_status["state"]["slug"] == expected_state, message or tenant_status -def tenant_exists(pageserver_http: PageserverHttpClient, tenant_id: TenantId): - tenants = pageserver_http.tenant_list() - matching = [t for t in tenants if TenantId(t["id"]) == tenant_id] - assert len(matching) < 2 - if len(matching) == 0: - return None - return matching[0] - - def remote_consistent_lsn( pageserver_http: PageserverHttpClient, tenant: TenantId, timeline: TimelineId ) -> Lsn: @@ -199,20 +192,19 @@ def wait_timeline_detail_404( timeline_id: TimelineId, iterations: int, ): - last_exc = None - for _ in range(iterations): - time.sleep(0.250) + def timeline_is_missing(): + data = {} try: data = pageserver_http.timeline_detail(tenant_id, timeline_id) - log.info(f"detail {data}") + log.info(f"timeline detail {data}") except PageserverApiException as e: log.debug(e) if e.status_code == 404: return - last_exc = e + raise RuntimeError(f"Timeline still exists, state: {data.get('state')}") - raise last_exc or RuntimeError(f"Timeline wasnt deleted in time, state: {data['state']}") + wait_until(iterations, interval=0.250, func=timeline_is_missing) def timeline_delete_wait_completed( @@ -224,3 +216,72 @@ def timeline_delete_wait_completed( ): pageserver_http.timeline_delete(tenant_id=tenant_id, 
timeline_id=timeline_id, **delete_args) wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, iterations) + + +if TYPE_CHECKING: + # TODO: avoid this by combining the remote-storage-related stuff in a single type + # and just passing in that type instead of the whole builder + from fixtures.neon_fixtures import NeonEnvBuilder + + +def assert_prefix_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None): + # For local_fs we need to properly handle empty directories, which we currently don't, so for simplicity stick to the s3 api. + assert neon_env_builder.remote_storage_kind in ( + RemoteStorageKind.MOCK_S3, + RemoteStorageKind.REAL_S3, + ) + # For mypy + assert isinstance(neon_env_builder.remote_storage, S3Storage) + assert neon_env_builder.remote_storage_client is not None + + # Note that this doesn't use pagination, so the list is not guaranteed to be exhaustive. + response = neon_env_builder.remote_storage_client.list_objects_v2( + Bucket=neon_env_builder.remote_storage.bucket_name, + Prefix=prefix or neon_env_builder.remote_storage.prefix_in_bucket or "", + ) + objects = response.get("Contents") + assert ( + response["KeyCount"] == 0 + ), f"remote dir with prefix {prefix} is not empty after deletion: {objects}" + + +def wait_tenant_status_404( + pageserver_http: PageserverHttpClient, + tenant_id: TenantId, + iterations: int, + interval: float = 0.250, +): + def tenant_is_missing(): + data = {} + try: + data = pageserver_http.tenant_status(tenant_id) + log.info(f"tenant status {data}") + except PageserverApiException as e: + log.debug(e) + if e.status_code == 404: + return + + raise RuntimeError(f"Tenant still exists, state: {data.get('state')}") + + wait_until(iterations, interval=interval, func=tenant_is_missing) + + +def tenant_delete_wait_completed( + pageserver_http: PageserverHttpClient, + tenant_id: TenantId, + iterations: int, +): + pageserver_http.tenant_delete(tenant_id=tenant_id) + wait_tenant_status_404(pageserver_http, tenant_id=tenant_id, iterations=iterations) + + +MANY_SMALL_LAYERS_TENANT_CONFIG = { + "gc_period": "0s", + "compaction_period": "0s", + "checkpoint_distance": f"{1024**2}", + "image_creation_threshold": "100", +} + + +def poll_for_remote_storage_iterations(remote_storage_kind: RemoteStorageKind) -> int: + return 20 if remote_storage_kind is RemoteStorageKind.REAL_S3 else 8 diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 2c8b4f4303ae0..d03d2e7595f4c 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -6,13 +6,16 @@ import tarfile import time from pathlib import Path -from typing import Any, Callable, Dict, List, Tuple, TypeVar +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Tuple, TypeVar from urllib.parse import urlencode import allure from psycopg2.extensions import cursor from fixtures.log_helper import log + +if TYPE_CHECKING: + from fixtures.neon_fixtures import PgBin from fixtures.types import TimelineId Fn = TypeVar("Fn", bound=Callable[..., Any]) @@ -300,17 +303,13 @@ def wait_until(number_of_iterations: int, interval: float, func: Fn): raise Exception("timed out while waiting for %s" % func) from last_exception -def wait_while(number_of_iterations: int, interval: float, func): +def run_pg_bench_small(pg_bin: "PgBin", connstr: str): """ - Wait until 'func' returns false, or throws an exception. + Fast way to populate data. 
+ For more layers, consider combining with these tenant settings: + { + "checkpoint_distance": 1024 ** 2, + "image_creation_threshold": 100, + } """ - for i in range(number_of_iterations): - try: - if not func(): - return - log.info("waiting for %s iteration %s failed", func, i + 1) - time.sleep(interval) - continue - except Exception: - return - raise Exception("timed out while waiting for %s" % func) + pg_bin.run(["pgbench", "-i", "-I dtGvp", "-s1", connstr]) diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py index c4bee1a1ea9db..b208616345e90 100644 --- a/test_runner/regress/test_download_extensions.py +++ b/test_runner/regress/test_download_extensions.py @@ -82,6 +82,7 @@ def upload_files(env): # Test downloading remote extension. @pytest.mark.parametrize("remote_storage_kind", available_s3_storages()) +@pytest.mark.skip(reason="https://github.com/neondatabase/neon/issues/4949") def test_remote_extensions( neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind, @@ -148,6 +149,7 @@ def test_remote_extensions( # Test downloading remote library. @pytest.mark.parametrize("remote_storage_kind", available_s3_storages()) +@pytest.mark.skip(reason="https://github.com/neondatabase/neon/issues/4949") def test_remote_library( neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind, @@ -205,10 +207,11 @@ def test_remote_library( # Here we test a complex extension # which has multiple extensions in one archive # using postgis as an example -@pytest.mark.skipif( - RemoteStorageKind.REAL_S3 not in available_s3_storages(), - reason="skipping test because real s3 not enabled", -) +# @pytest.mark.skipif( +# RemoteStorageKind.REAL_S3 not in available_s3_storages(), +# reason="skipping test because real s3 not enabled", +# ) +@pytest.mark.skip(reason="https://github.com/neondatabase/neon/issues/4949") def test_multiple_extensions_one_archive( neon_env_builder: NeonEnvBuilder, pg_version: PgVersion, @@ -253,6 +256,7 @@ def test_multiple_extensions_one_archive( # Run the test with multiple simultaneous connections to an endpoint. 
# +@pytest.mark.skip(reason="https://github.com/neondatabase/neon/issues/4949") def test_extension_download_after_restart( neon_env_builder: NeonEnvBuilder, pg_version: PgVersion, diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index ee600db2c60f7..f26d04e2f3021 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -56,10 +56,6 @@ def test_pg_regress( with capsys.disabled(): pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath) - # checkpoint one more time to ensure that the lsn we get is the latest one - endpoint.safe_psql("CHECKPOINT") - - # Check that we restore the content of the datadir correctly check_restored_datadir_content(test_output_dir, env, endpoint) @@ -166,9 +162,4 @@ def test_sql_regress( with capsys.disabled(): pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath) - # checkpoint one more time to ensure that the lsn we get is the latest one - endpoint.safe_psql("CHECKPOINT") - endpoint.safe_psql("select pg_current_wal_insert_lsn()")[0][0] - - # Check that we restore the content of the datadir correctly check_restored_datadir_content(test_output_dir, env, endpoint) diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index 35334ec7b245b..fabec6b5bc3e0 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -269,7 +269,9 @@ def qq(queries: List[Tuple[str, Optional[List[Any]]]], read_only: bool = False) connstr = f"postgresql://http:http@{static_proxy.domain}:{static_proxy.proxy_port}/postgres" response = requests.post( f"https://{static_proxy.domain}:{static_proxy.external_http_port}/sql", - data=json.dumps(list(map(lambda x: {"query": x[0], "params": x[1] or []}, queries))), + data=json.dumps( + {"queries": list(map(lambda x: {"query": x[0], "params": x[1] or []}, queries))} + ), headers={ "Content-Type": "application/sql", "Neon-Connection-String": connstr, diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 68116985d3f01..d642a6d19093c 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -97,6 +97,11 @@ def test_remote_storage_backup_and_restore( tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0]) timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0]) + # Thats because of UnreliableWrapper's injected failures + env.pageserver.allowed_errors.append( + f".*failed to fetch tenant deletion mark at tenants/({tenant_id}|{env.initial_tenant})/deleted attempt 1.*" + ) + checkpoint_numbers = range(1, 3) for checkpoint_number in checkpoint_numbers: @@ -168,9 +173,7 @@ def test_remote_storage_backup_and_restore( # # The initiated attach operation should survive the restart, and continue from where it was. 
env.pageserver.stop() - layer_download_failed_regex = ( - r"download.*[0-9A-F]+-[0-9A-F]+.*open a download stream for layer.*simulated failure" - ) + layer_download_failed_regex = r"Failed to download a remote file: simulated failure of remote operation Download.*[0-9A-F]+-[0-9A-F]+" assert not env.pageserver.log_contains( layer_download_failed_regex ), "we shouldn't have tried any layer downloads yet since list remote timelines has a failpoint" @@ -203,7 +206,7 @@ def test_remote_storage_backup_and_restore( == f"{data}|{checkpoint_number}" ) - log.info("ensure that we neede to retry downloads due to test_remote_failures=1") + log.info("ensure that we needed to retry downloads due to test_remote_failures=1") assert env.pageserver.log_contains(layer_download_failed_regex) diff --git a/test_runner/regress/test_subxacts.py b/test_runner/regress/test_subxacts.py index 494820ef8ef90..eb96a8faa47d4 100644 --- a/test_runner/regress/test_subxacts.py +++ b/test_runner/regress/test_subxacts.py @@ -33,8 +33,4 @@ def test_subxacts(neon_simple_env: NeonEnv, test_output_dir): cur.execute(f"insert into t1 values ({i}, {j})") cur.execute("commit") - # force wal flush - cur.execute("checkpoint") - - # Check that we can restore the content of the datadir correctly check_restored_datadir_content(test_output_dir, env, endpoint) diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py new file mode 100644 index 0000000000000..f0f6cc743c180 --- /dev/null +++ b/test_runner/regress/test_tenant_delete.py @@ -0,0 +1,262 @@ +import enum +import os + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + PgBin, + last_flush_lsn_upload, + wait_for_last_flush_lsn, +) +from fixtures.pageserver.http import PageserverApiException +from fixtures.pageserver.utils import ( + MANY_SMALL_LAYERS_TENANT_CONFIG, + assert_prefix_empty, + poll_for_remote_storage_iterations, + tenant_delete_wait_completed, + wait_tenant_status_404, + wait_until_tenant_active, + wait_until_tenant_state, +) +from fixtures.remote_storage import RemoteStorageKind, available_remote_storages +from fixtures.types import TenantId +from fixtures.utils import run_pg_bench_small + + +@pytest.mark.parametrize( + "remote_storage_kind", [RemoteStorageKind.NOOP, *available_remote_storages()] +) +def test_tenant_delete_smoke( + neon_env_builder: NeonEnvBuilder, + remote_storage_kind: RemoteStorageKind, + pg_bin: PgBin, +): + neon_env_builder.pageserver_config_override = "test_remote_failures=1" + + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_tenant_delete_smoke", + ) + + env = neon_env_builder.init_start() + + ps_http = env.pageserver.http_client() + + # first try to delete a non-existent tenant + tenant_id = TenantId.generate() + env.pageserver.allowed_errors.append(f".*NotFound: tenant {tenant_id}.*") + with pytest.raises(PageserverApiException, match=f"NotFound: tenant {tenant_id}"): + ps_http.tenant_delete(tenant_id=tenant_id) + + env.neon_cli.create_tenant( + tenant_id=tenant_id, + conf=MANY_SMALL_LAYERS_TENANT_CONFIG, + ) + + # create two timelines, one being the parent of the other + parent = None + for timeline in ["first", "second"]: + timeline_id = env.neon_cli.create_branch( + timeline, tenant_id=tenant_id, ancestor_branch_name=parent + ) + with env.endpoints.create_start(timeline, tenant_id=tenant_id) as endpoint: + run_pg_bench_small(pg_bin, endpoint.connstr()) + wait_for_last_flush_lsn(env, endpoint, 
tenant=tenant_id, timeline=timeline_id) + + parent = timeline + + iterations = poll_for_remote_storage_iterations(remote_storage_kind) + + tenant_delete_wait_completed(ps_http, tenant_id, iterations) + + tenant_path = env.tenant_dir(tenant_id=tenant_id) + assert not tenant_path.exists() + + if remote_storage_kind in [RemoteStorageKind.MOCK_S3, RemoteStorageKind.REAL_S3]: + assert_prefix_empty( + neon_env_builder, + prefix="/".join( + ( + "tenants", + str(tenant_id), + ) + ), + ) + + +class Check(enum.Enum): + RETRY_WITHOUT_RESTART = enum.auto() + RETRY_WITH_RESTART = enum.auto() + + +FAILPOINTS = [ + "tenant-delete-before-shutdown", + "tenant-delete-before-create-remote-mark", + "tenant-delete-before-create-local-mark", + "tenant-delete-before-background", + "tenant-delete-before-polling-ongoing-deletions", + "tenant-delete-before-cleanup-remaining-fs-traces", + "tenant-delete-before-remove-timelines-dir", + "tenant-delete-before-remove-deleted-mark", + "tenant-delete-before-remove-tenant-dir", + # Some failpoints from timeline deletion + "timeline-delete-before-index-deleted-at", + "timeline-delete-before-rm", + "timeline-delete-before-index-delete", + "timeline-delete-after-rm-dir", +] + +FAILPOINTS_BEFORE_BACKGROUND = [ + "timeline-delete-before-schedule", + "tenant-delete-before-shutdown", + "tenant-delete-before-create-remote-mark", + "tenant-delete-before-create-local-mark", + "tenant-delete-before-background", +] + + +def combinations(): + result = [] + + remotes = [RemoteStorageKind.NOOP, RemoteStorageKind.MOCK_S3] + if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE"): + remotes.append(RemoteStorageKind.REAL_S3) + + for remote_storage_kind in remotes: + for delete_failpoint in FAILPOINTS: + if remote_storage_kind is RemoteStorageKind.NOOP and delete_failpoint in ( + "timeline-delete-before-index-delete", + ): + # the above failpoints are not relevant for configs without remote storage + continue + + # Simulate failures for only one type of remote storage + # to avoid log pollution and make tests run faster + if remote_storage_kind is RemoteStorageKind.MOCK_S3: + simulate_failures = True + else: + simulate_failures = False + result.append((remote_storage_kind, delete_failpoint, simulate_failures)) return result + + +@pytest.mark.parametrize("remote_storage_kind, failpoint, simulate_failures", combinations()) +@pytest.mark.parametrize("check", list(Check)) +def test_delete_tenant_exercise_crash_safety_failpoints( + neon_env_builder: NeonEnvBuilder, + remote_storage_kind: RemoteStorageKind, + failpoint: str, + simulate_failures: bool, + check: Check, + pg_bin: PgBin, +): + if simulate_failures: + neon_env_builder.pageserver_config_override = "test_remote_failures=1" + + neon_env_builder.enable_remote_storage( + remote_storage_kind, "test_delete_tenant_exercise_crash_safety_failpoints" + ) + + env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG) + + tenant_id = env.initial_tenant + + env.pageserver.allowed_errors.extend( + [ + # From deletion polling + f".*NotFound: tenant {env.initial_tenant}.*", + # allow errors caused by failpoints + f".*failpoint: {failpoint}", + # This appears when the flush loop was stopped during a deletion attempt and the pageserver is then stopped + ".*freeze_and_flush_on_shutdown.*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + # We may leave some upload tasks in the queue. They're likely deletes. + # For uploads we explicitly wait with `last_flush_lsn_upload` below. 
+ # So by ignoring these instead of waiting for an empty upload queue, + # we execute more distinct code paths. + '.*stopping left-over name="remote upload".*', + ] + ) + + ps_http = env.pageserver.http_client() + + timeline_id = env.neon_cli.create_timeline("delete", tenant_id=tenant_id) + with env.endpoints.create_start("delete", tenant_id=tenant_id) as endpoint: + # generate enough layers + run_pg_bench_small(pg_bin, endpoint.connstr()) + if remote_storage_kind is RemoteStorageKind.NOOP: + wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) + else: + last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id) + + ps_http.configure_failpoints((failpoint, "return")) + + iterations = poll_for_remote_storage_iterations(remote_storage_kind) + + # These failpoints fire before the background task is spawned, + # so they result in an API request failure. + if failpoint in FAILPOINTS_BEFORE_BACKGROUND: + with pytest.raises(PageserverApiException, match=failpoint): + ps_http.tenant_delete(tenant_id) + + else: + ps_http.tenant_delete(tenant_id) + tenant_info = wait_until_tenant_state( + pageserver_http=ps_http, + tenant_id=tenant_id, + expected_state="Broken", + iterations=iterations, + ) + + reason = tenant_info["state"]["data"]["reason"] + log.info(f"tenant broken: {reason}") + + # failpoint may not be the only error in the stack + assert reason.endswith(f"failpoint: {failpoint}"), reason + + if check is Check.RETRY_WITH_RESTART: + env.pageserver.stop() + env.pageserver.start() + + if ( + remote_storage_kind is RemoteStorageKind.NOOP + and failpoint == "tenant-delete-before-create-local-mark" + ): + tenant_delete_wait_completed(ps_http, tenant_id, iterations=iterations) + elif failpoint in ( + "tenant-delete-before-shutdown", + "tenant-delete-before-create-remote-mark", + ): + wait_until_tenant_active( + ps_http, tenant_id=tenant_id, iterations=iterations, period=0.25 + ) + tenant_delete_wait_completed(ps_http, tenant_id, iterations=iterations) + else: + # Pageserver should've resumed deletion after restart. + wait_tenant_status_404(ps_http, tenant_id, iterations=iterations + 10) + elif check is Check.RETRY_WITHOUT_RESTART: + # this should succeed + # this also checks that delete can be retried even when tenant is in Broken state + ps_http.configure_failpoints((failpoint, "off")) + + tenant_delete_wait_completed(ps_http, tenant_id, iterations=iterations) + + # Check remote is empty + if remote_storage_kind is RemoteStorageKind.MOCK_S3: + assert_prefix_empty( + neon_env_builder, + prefix="/".join( + ( + "tenants", + str(tenant_id), + ) + ), + ) + + tenant_dir = env.tenant_dir(tenant_id) + # Check local is empty + assert not tenant_dir.exists() + + +# TODO test concurrent deletions with "hang" failpoint +# TODO test tenant delete continues after attach diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 932d8219543e0..b189510a9e412 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -66,6 +66,10 @@ def test_tenant_reattach( env.pageserver.allowed_errors.append( f".*Tenant {tenant_id} will not become active\\. 
Current state: Stopping.*" ) + # That's because of UnreliableWrapper's injected failures + env.pageserver.allowed_errors.append( + f".*failed to fetch tenant deletion mark at tenants/({tenant_id}|{env.initial_tenant})/deleted attempt 1.*" + ) with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: with endpoint.cursor() as cur: diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index eb020c101f4cd..32ad5381b45be 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -17,9 +17,9 @@ from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import ( assert_tenant_state, - tenant_exists, wait_for_last_record_lsn, wait_for_upload, + wait_tenant_status_404, ) from fixtures.port_distributor import PortDistributor from fixtures.remote_storage import RemoteStorageKind, available_remote_storages @@ -29,7 +29,6 @@ start_in_background, subprocess_capture, wait_until, - wait_while, ) @@ -269,11 +268,16 @@ def test_tenant_relocation( env = neon_env_builder.init_start() + tenant_id = TenantId("74ee8b079a0e437eb0afea7d26a07209") + # FIXME: Is this expected? env.pageserver.allowed_errors.append( ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*" ) + # Needed for detach polling. + env.pageserver.allowed_errors.append(f".*NotFound: tenant {tenant_id}.*") + # create folder for remote storage mock remote_storage_mock_path = env.repo_dir / "local_fs_remote_storage" @@ -283,9 +287,7 @@ def test_tenant_relocation( pageserver_http = env.pageserver.http_client() - tenant_id, initial_timeline_id = env.neon_cli.create_tenant( - TenantId("74ee8b079a0e437eb0afea7d26a07209") - ) + _, initial_timeline_id = env.neon_cli.create_tenant(tenant_id) log.info("tenant to relocate %s initial_timeline_id %s", tenant_id, initial_timeline_id) env.neon_cli.create_branch("test_tenant_relocation_main", tenant_id=tenant_id) @@ -469,11 +471,8 @@ def test_tenant_relocation( pageserver_http.tenant_detach(tenant_id) # Wait a little, so that the detach operation has time to finish. 
- wait_while( - number_of_iterations=100, - interval=1, - func=lambda: tenant_exists(pageserver_http, tenant_id), - ) + wait_tenant_status_404(pageserver_http, tenant_id, iterations=100, interval=1) + post_migration_check(ep_main, 500500, old_local_path_main) post_migration_check(ep_second, 1001000, old_local_path_second) diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index 79a3b353d4586..fee95e5420c9d 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -146,6 +146,11 @@ def test_tenants_attached_after_download( tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0]) timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0]) + # That's because of UnreliableWrapper's injected failures + env.pageserver.allowed_errors.append( + f".*failed to fetch tenant deletion mark at tenants/({tenant_id}|{env.initial_tenant})/deleted attempt 1.*" + ) + for checkpoint_number in range(1, 3): with endpoint.cursor() as cur: cur.execute( diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 1e8dd3620618e..26caeb8ffb689 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -4,7 +4,6 @@ import shutil import threading from pathlib import Path -from typing import Optional import pytest import requests @@ -18,6 +17,8 @@ ) from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import ( + assert_prefix_empty, + poll_for_remote_storage_iterations, timeline_delete_wait_completed, wait_for_last_record_lsn, wait_for_upload, @@ -27,7 +28,6 @@ ) from fixtures.remote_storage import ( RemoteStorageKind, - S3Storage, available_remote_storages, ) from fixtures.types import Lsn, TenantId, TimelineId @@ -187,10 +187,9 @@ def test_delete_timeline_exercise_crash_safety_failpoints( 8. Retry or restart without the failpoint and check the result. """ - if remote_storage_kind is not None: - neon_env_builder.enable_remote_storage( - remote_storage_kind, "test_delete_timeline_exercise_crash_safety_failpoints" - ) + neon_env_builder.enable_remote_storage( + remote_storage_kind, "test_delete_timeline_exercise_crash_safety_failpoints" + ) env = neon_env_builder.init_start( initial_tenant_conf={ @@ -231,7 +230,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints( ps_http.configure_failpoints((failpoint, "return")) - iterations = 20 if remote_storage_kind is RemoteStorageKind.REAL_S3 else 4 + iterations = poll_for_remote_storage_iterations(remote_storage_kind) # These failpoints are earlier than background task is spawned. # so they result in api request failure. 
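[Editor's note] The tenant and timeline deletion tests in this patch all drive the same failpoint cycle: arm a failpoint with "return", observe the failure (or the tenant going Broken), then disarm it or restart the pageserver and verify the operation completes. A condensed sketch of that flow using the helpers this patch adds to fixtures/pageserver/utils.py (assumes ps_http, tenant_id, and iterations are set up as in the tests above):

    # Arm the failpoint so the next delete fails at a known step.
    ps_http.configure_failpoints(("tenant-delete-before-background", "return"))
    with pytest.raises(PageserverApiException, match="tenant-delete-before-background"):
        ps_http.tenant_delete(tenant_id)

    # Disarm and retry: deletion is expected to be retryable from any failure point.
    ps_http.configure_failpoints(("tenant-delete-before-background", "off"))
    tenant_delete_wait_completed(ps_http, tenant_id, iterations)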
@@ -280,14 +279,14 @@ def test_delete_timeline_exercise_crash_safety_failpoints( "remote_storage_s3_request_seconds_count", filter={"request_type": "get_object", "result": "err"}, ).value - == 1 + == 2 # One is the missing tenant deletion mark, the second is the missing index part ) assert ( m.query_one( "remote_storage_s3_request_seconds_count", filter={"request_type": "get_object", "result": "ok"}, ).value - == 1 + == 1 # index part for initial timeline ) elif check is Check.RETRY_WITHOUT_RESTART: # this should succeed @@ -413,27 +412,6 @@ def test_timeline_resurrection_on_attach( assert all([tl["state"] == "Active" for tl in timelines]) -def assert_prefix_empty(neon_env_builder: NeonEnvBuilder, prefix: Optional[str] = None): - # For local_fs we need to properly handle empty directories, which we currently dont, so for simplicity stick to s3 api. - assert neon_env_builder.remote_storage_kind in ( - RemoteStorageKind.MOCK_S3, - RemoteStorageKind.REAL_S3, - ) - # For mypy - assert isinstance(neon_env_builder.remote_storage, S3Storage) - - # Note that this doesnt use pagination, so list is not guaranteed to be exhaustive. - assert neon_env_builder.remote_storage_client is not None - response = neon_env_builder.remote_storage_client.list_objects_v2( - Bucket=neon_env_builder.remote_storage.bucket_name, - Prefix=prefix or neon_env_builder.remote_storage.prefix_in_bucket or "", - ) - objects = response.get("Contents") - assert ( - response["KeyCount"] == 0 - ), f"remote dir with prefix {prefix} is not empty after deletion: {objects}" - - def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuilder): """ When deleting a timeline, if we succeed in setting the deleted flag remotely diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 87f076d236927..9132efe79f276 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -869,6 +869,49 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): assert debug_dump_1["config"]["id"] == env.safekeepers[0].id + +class DummyConsumer(object): + def __call__(self, msg): + pass + + +def test_start_replication_term(neon_env_builder: NeonEnvBuilder): + """ + Test START_REPLICATION of the uncommitted part specifying the leader term. It must + error if the safekeeper switched to a different term. 
+ """ + + env = neon_env_builder.init_start() + + env.neon_cli.create_branch("test_start_replication_term") + endpoint = env.endpoints.create_start("test_start_replication_term") + + endpoint.safe_psql("CREATE TABLE t(key int primary key, value text)") + + # learn neon timeline from compute + tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0]) + + sk = env.safekeepers[0] + sk_http_cli = sk.http_client() + tli_status = sk_http_cli.timeline_status(tenant_id, timeline_id) + timeline_start_lsn = tli_status.timeline_start_lsn + + conn_opts = { + "host": "127.0.0.1", + "options": f"-c timeline_id={timeline_id} tenant_id={tenant_id}", + "port": sk.port.pg, + "connection_factory": psycopg2.extras.PhysicalReplicationConnection, + } + sk_pg_conn = psycopg2.connect(**conn_opts) # type: ignore + with sk_pg_conn.cursor() as cur: + # should fail, as first start has term 2 + cur.start_replication_expert(f"START_REPLICATION {timeline_start_lsn} (term='3')") + dummy_consumer = DummyConsumer() + with pytest.raises(psycopg2.errors.InternalError_) as excinfo: + cur.consume_stream(dummy_consumer) + assert "failed to acquire term 3" in str(excinfo.value) + + # Test auth on WAL service (postgres protocol) ports. def test_sk_auth(neon_env_builder: NeonEnvBuilder): neon_env_builder.auth_enabled = True diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 28bf5ccfa2fda..5d5cfee12783f 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 28bf5ccfa2fda9677566a25abd450e714d9ed055 +Subproject commit 5d5cfee12783f0989a9c9fe13bb40b5585812568 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 553f2d3618a6d..026d6b093d49e 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 553f2d3618a6d4893bde67f1c065926ee8a3a118 +Subproject commit 026d6b093d49e25cec44dd04598152329ceac027 diff --git a/vendor/revisions.json b/vendor/revisions.json index 80d161938effe..63b72cf5063f0 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,4 +1,4 @@ { - "postgres-v15": "553f2d3618a6d4893bde67f1c065926ee8a3a118", - "postgres-v14": "28bf5ccfa2fda9677566a25abd450e714d9ed055" + "postgres-v15": "026d6b093d49e25cec44dd04598152329ceac027", + "postgres-v14": "5d5cfee12783f0989a9c9fe13bb40b5585812568" }