diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 69b5e2262008e..853f5f291910e 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -814,7 +814,7 @@ jobs: run: shell: sh -eu {0} env: - VM_BUILDER_VERSION: v0.15.0-alpha1 + VM_BUILDER_VERSION: v0.15.4 steps: - name: Checkout diff --git a/Dockerfile b/Dockerfile index 9467e41ae40f1..1c447b2db99a3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -51,6 +51,7 @@ RUN set -e \ --bin safekeeper \ --bin storage_broker \ --bin proxy \ + --bin neon_local \ --locked --release \ && cachepot -s @@ -76,6 +77,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin +COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/ COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/ diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs index da5d519ab51fd..1b5e4cf974f88 100644 --- a/compute_tools/src/extension_server.rs +++ b/compute_tools/src/extension_server.rs @@ -156,7 +156,7 @@ pub async fn get_available_extensions( let ext_index_full = serde_json::from_slice::(&ext_idx_buffer)?; let mut enabled_extensions = ext_index_full.public_extensions; enabled_extensions.extend_from_slice(custom_extensions); - let library_index = ext_index_full.library_index; + let mut library_index = ext_index_full.library_index; let all_extension_data = ext_index_full.extension_data; info!("library_index: {:?}", library_index); @@ -179,6 +179,8 @@ pub async fn get_available_extensions( file_create_tasks.push(tokio::fs::write(control_path, control_contents)); } else { warn!("control file {:?} exists both locally and remotely. 
ignoring the remote version.", control_file); + // also delete this from library index + library_index.retain(|_, value| value != extension_name); } } } diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index eff7c93b461af..74e7796be78d6 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -270,7 +270,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> { } RoleAction::Create => { let mut query: String = format!( - "CREATE ROLE {} CREATEROLE CREATEDB IN ROLE neon_superuser", + "CREATE ROLE {} CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser", name.pg_quote() ); info!("role create query: '{}'", &query); diff --git a/libs/utils/src/backoff.rs b/libs/utils/src/backoff.rs new file mode 100644 index 0000000000000..4507efe64701c --- /dev/null +++ b/libs/utils/src/backoff.rs @@ -0,0 +1,188 @@ +use std::fmt::{Debug, Display}; + +use futures::Future; + +pub const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1; +pub const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0; + +pub async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) { + let backoff_duration_seconds = + exponential_backoff_duration_seconds(n, base_increment, max_seconds); + if backoff_duration_seconds > 0.0 { + tracing::info!( + "Backoff: waiting {backoff_duration_seconds} seconds before proceeding with the task", + ); + tokio::time::sleep(std::time::Duration::from_secs_f64(backoff_duration_seconds)).await; + } +} + +pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 { + if n == 0 { + 0.0 + } else { + (1.0 + base_increment).powf(f64::from(n)).min(max_seconds) + } +} + +/// Retries the passed operation until one of the following conditions is met: +/// the encountered error is considered permanent (non-retryable), +/// or retries have been exhausted. +/// The `is_permanent` closure should be used to distinguish permanent errors from transient ones. +/// When attempts cross `warn_threshold`, the function starts to emit log warnings. +/// The `description` argument is added to log messages; its value should identify what the `op` is doing. +pub async fn retry<T, O, F, E>( + mut op: O, + is_permanent: impl Fn(&E) -> bool, + warn_threshold: u32, + max_retries: u32, + description: &str, +) -> Result<T, E> +where + // Not std::error::Error because anyhow::Error doesn't implement it. + // For context see https://github.com/dtolnay/anyhow/issues/63 + E: Display + Debug, + O: FnMut() -> F, + F: Future<Output = Result<T, E>>, +{ + let mut attempts = 0; + loop { + let result = op().await; + match result { + Ok(_) => { + if attempts > 0 { + tracing::info!("{description} succeeded after {attempts} retries"); + } + return result; + } + + // These are "permanent" errors that should not be retried. + Err(ref e) if is_permanent(e) => { + return result; + } + // Assume that any other failure might be transient, and the operation might + // succeed if we just keep trying. + Err(err) if attempts < warn_threshold => { + tracing::info!("{description} failed, will retry (attempt {attempts}): {err:#}"); + } + Err(err) if attempts < max_retries => { + tracing::warn!("{description} failed, will retry (attempt {attempts}): {err:#}"); + } + Err(ref err) => { + // Operation failed `max_retries` times. Time to give up.
+ tracing::warn!( + "{description} still failed after {attempts} retries, giving up: {err:?}" + ); + return result; + } + } + // sleep and retry + exponential_backoff( + attempts, + DEFAULT_BASE_BACKOFF_SECONDS, + DEFAULT_MAX_BACKOFF_SECONDS, + ) + .await; + attempts += 1; + } +} + +#[cfg(test)] +mod tests { + use std::io; + + use tokio::sync::Mutex; + + use super::*; + + #[test] + fn backoff_defaults_produce_growing_backoff_sequence() { + let mut current_backoff_value = None; + + for i in 0..10_000 { + let new_backoff_value = exponential_backoff_duration_seconds( + i, + DEFAULT_BASE_BACKOFF_SECONDS, + DEFAULT_MAX_BACKOFF_SECONDS, + ); + + if let Some(old_backoff_value) = current_backoff_value.replace(new_backoff_value) { + assert!( + old_backoff_value <= new_backoff_value, + "{i}th backoff value {new_backoff_value} is smaller than the previous one {old_backoff_value}" + ) + } + } + + assert_eq!( + current_backoff_value.expect("Should have produced backoff values to compare"), + DEFAULT_MAX_BACKOFF_SECONDS, + "Given big enough of retries, backoff should reach its allowed max value" + ); + } + + #[tokio::test(start_paused = true)] + async fn retry_always_error() { + let count = Mutex::new(0); + let err_result = retry( + || async { + *count.lock().await += 1; + Result::<(), io::Error>::Err(io::Error::from(io::ErrorKind::Other)) + }, + |_e| false, + 1, + 1, + "work", + ) + .await; + + assert!(err_result.is_err()); + + assert_eq!(*count.lock().await, 2); + } + + #[tokio::test(start_paused = true)] + async fn retry_ok_after_err() { + let count = Mutex::new(0); + retry( + || async { + let mut locked = count.lock().await; + if *locked > 1 { + Ok(()) + } else { + *locked += 1; + Err(io::Error::from(io::ErrorKind::Other)) + } + }, + |_e| false, + 2, + 2, + "work", + ) + .await + .unwrap(); + } + + #[tokio::test(start_paused = true)] + async fn dont_retry_permanent_errors() { + let count = Mutex::new(0); + let _ = retry( + || async { + let mut locked = count.lock().await; + if *locked > 1 { + Ok(()) + } else { + *locked += 1; + Err(io::Error::from(io::ErrorKind::Other)) + } + }, + |_e| true, + 2, + 2, + "work", + ) + .await + .unwrap_err(); + + assert_eq!(*count.lock().await, 1); + } +} diff --git a/libs/utils/src/fs_ext.rs b/libs/utils/src/fs_ext.rs index 090912d2765da..dfb7d5abbfa48 100644 --- a/libs/utils/src/fs_ext.rs +++ b/libs/utils/src/fs_ext.rs @@ -24,6 +24,20 @@ pub async fn is_directory_empty(path: impl AsRef<Path>) -> anyhow::Result<bool> Ok(dir.next_entry().await?.is_none()) } +pub async fn list_dir(path: impl AsRef<Path>) -> anyhow::Result<Vec<String>> { + let mut dir = tokio::fs::read_dir(&path) + .await + .context(format!("read_dir({})", path.as_ref().display()))?; + + let mut content = vec![]; + while let Some(next) = dir.next_entry().await?
{ + let file_name = next.file_name(); + content.push(file_name.to_string_lossy().to_string()); + } + + Ok(content) +} + pub fn ignore_not_found(e: io::Error) -> io::Result<()> { if e.kind() == io::ErrorKind::NotFound { Ok(()) @@ -43,7 +57,7 @@ where mod test { use std::path::PathBuf; - use crate::fs_ext::is_directory_empty; + use crate::fs_ext::{is_directory_empty, list_dir}; use super::ignore_absent_files; @@ -109,4 +123,25 @@ mod test { assert!(!file_path.exists()); } + + #[tokio::test] + async fn list_dir_works() { + let dir = tempfile::tempdir().unwrap(); + let dir_path = dir.path(); + + assert!(list_dir(dir_path).await.unwrap().is_empty()); + + let file_path: PathBuf = dir_path.join("testfile"); + let _ = std::fs::File::create(&file_path).unwrap(); + + assert_eq!(&list_dir(dir_path).await.unwrap(), &["testfile"]); + + let another_dir_path: PathBuf = dir_path.join("testdir"); + std::fs::create_dir(another_dir_path).unwrap(); + + let expected = &["testdir", "testfile"]; + let mut actual = list_dir(dir_path).await.unwrap(); + actual.sort(); + assert_eq!(actual, expected); + } } diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index b591cc611aa52..5c8e01f41a513 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -1,6 +1,8 @@ //! `utils` is intended to be a place to put code that is shared //! between other crates in this repository. +pub mod backoff; + /// `Lsn` type implements common tasks on Log Sequence Numbers pub mod lsn; /// SeqWait allows waiting for a future sequence number to arrive diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs index e3ddefc661978..2d178a42bd2d5 100644 --- a/pageserver/ctl/src/layers.rs +++ b/pageserver/ctl/src/layers.rs @@ -72,7 +72,7 @@ async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> { .await?; let cursor = BlockCursor::new(&file); for (k, v) in all { - let value = cursor.read_blob(v.pos())?; + let value = cursor.read_blob(v.pos()).await?; println!("key:{} value_len:{}", k, value.len()); } // TODO(chi): special handling for last key?
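
The `retry`/`exponential_backoff` pair moved into `libs/utils` above is now the shared retry primitive for the repository. With the default constants, the computed delay is 0s for attempt 0 and (1.0 + 0.1)^n seconds afterwards, capped at 3.0s, so the sequence is 0, 1.1, 1.21, 1.33, ..., 3.0. Below is a minimal sketch of how a caller might use it; `flaky_fetch` and the thresholds are hypothetical, not part of this diff:

use utils::backoff;

// Hypothetical flaky operation; stands in for a real network call.
async fn flaky_fetch() -> Result<Vec<u8>, std::io::Error> {
    Ok(vec![])
}

async fn fetch_with_retries() -> Result<Vec<u8>, std::io::Error> {
    backoff::retry(
        flaky_fetch, // op: re-invoked for every attempt (FnMut() -> Future)
        |e| e.kind() == std::io::ErrorKind::NotFound, // treat NotFound as permanent: fail immediately
        3,  // warn_threshold: attempts past this log at WARN instead of INFO
        10, // max_retries: give up and return the last error after this
        "fetch remote object",
    )
    .await
}
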
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index a86993d7f3907..e0c969279ee7d 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -373,7 +373,7 @@ fn start_pageserver( let order = pageserver::InitializationOrder { initial_tenant_load: Some(init_done_tx), initial_logical_size_can_start: init_done_rx.clone(), - initial_logical_size_attempt: init_logical_size_done_tx, + initial_logical_size_attempt: Some(init_logical_size_done_tx), background_jobs_can_start: background_jobs_barrier.clone(), }; diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index be806c77ec05e..f2aa2f365eb4c 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -31,7 +31,9 @@ use utils::{ use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig; use crate::tenant::config::TenantConf; use crate::tenant::config::TenantConfOpt; -use crate::tenant::{TENANT_ATTACHING_MARKER_FILENAME, TIMELINES_SEGMENT_NAME}; +use crate::tenant::{ + TENANT_ATTACHING_MARKER_FILENAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME, +}; use crate::{ IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX, @@ -613,6 +615,11 @@ impl PageServerConf { ) } + pub fn tenant_deleted_mark_file_path(&self, tenant_id: &TenantId) -> PathBuf { + self.tenant_path(tenant_id) + .join(TENANT_DELETED_MARKER_FILE_NAME) + } + pub fn traces_path(&self) -> PathBuf { self.workdir.join("traces") } diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs index a1a5c30ae9aeb..2953208d1e9f2 100644 --- a/pageserver/src/context.rs +++ b/pageserver/src/context.rs @@ -85,6 +85,7 @@ //! The solution is that all code paths are infected with precisely one //! [`RequestContext`] argument. Functions in the middle of the call chain //! only need to pass it on. + use crate::task_mgr::TaskKind; // The main structure of this module, see module-level comment. @@ -92,6 +93,7 @@ use crate::task_mgr::TaskKind; pub struct RequestContext { task_kind: TaskKind, download_behavior: DownloadBehavior, + access_stats_behavior: AccessStatsBehavior, } /// Desired behavior if the operation requires an on-demand download @@ -109,6 +111,67 @@ pub enum DownloadBehavior { Error, } +/// Whether this request should update access times used in LRU eviction +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +pub(crate) enum AccessStatsBehavior { + /// Update access times: this request's access to data should be taken + /// as a hint that the accessed layer is likely to be accessed again + Update, + + /// Do not update access times: this request is accessing the layer + /// but does not want to indicate that the layer should be retained in cache, + /// perhaps because the requestor is a compaction routine that will soon cover + /// this layer with another. + Skip, +} + +pub struct RequestContextBuilder { + inner: RequestContext, +} + +impl RequestContextBuilder { + /// A new builder with default settings + pub fn new(task_kind: TaskKind) -> Self { + Self { + inner: RequestContext { + task_kind, + download_behavior: DownloadBehavior::Download, + access_stats_behavior: AccessStatsBehavior::Update, + }, + } + } + + pub fn extend(original: &RequestContext) -> Self { + Self { + // This is like a Copy, but avoid implementing Copy because ordinary users of + // RequestContext should always move or ref it. 
+ inner: RequestContext { + task_kind: original.task_kind, + download_behavior: original.download_behavior, + access_stats_behavior: original.access_stats_behavior, + }, + } + } + + /// Configure the DownloadBehavior of the context: whether to + /// download missing layers, and/or warn on the download. + pub fn download_behavior(mut self, b: DownloadBehavior) -> Self { + self.inner.download_behavior = b; + self + } + + /// Configure the AccessStatsBehavior of the context: whether layer + /// accesses should update the access time of the layer. + pub(crate) fn access_stats_behavior(mut self, b: AccessStatsBehavior) -> Self { + self.inner.access_stats_behavior = b; + self + } + + pub fn build(self) -> RequestContext { + self.inner + } +} + impl RequestContext { /// Create a new RequestContext that has no parent. /// @@ -123,10 +186,9 @@ impl RequestContext { /// because someone explicitly canceled it. /// It has no parent, so it cannot inherit cancellation from there. pub fn new(task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self { - RequestContext { - task_kind, - download_behavior, - } + RequestContextBuilder::new(task_kind) + .download_behavior(download_behavior) + .build() } /// Create a detached child context for a task that may outlive `self`. @@ -187,10 +249,7 @@ impl RequestContext { } fn child_impl(&self, task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self { - RequestContext { - task_kind, - download_behavior, - } + Self::new(task_kind, download_behavior) } pub fn task_kind(&self) -> TaskKind { @@ -200,4 +259,8 @@ impl RequestContext { pub fn download_behavior(&self) -> DownloadBehavior { self.download_behavior } + + pub(crate) fn access_stats_behavior(&self) -> AccessStatsBehavior { + self.access_stats_behavior + } } diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 290492203a2b1..38e07f172d3d7 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -93,6 +93,47 @@ paths: application/json: schema: $ref: "#/components/schemas/Error" + delete: + description: | + Attempts to delete specified tenant. 500 and 409 errors should be retried until 404 is retrieved. 
+ 404 means that deletion finished successfully. + responses: + "400": + description: Error when no tenant id found in path + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + "401": + description: Unauthorized Error + content: + application/json: + schema: + $ref: "#/components/schemas/UnauthorizedError" + "403": + description: Forbidden Error + content: + application/json: + schema: + $ref: "#/components/schemas/ForbiddenError" + "404": + description: Tenant not found + content: + application/json: + schema: + $ref: "#/components/schemas/NotFoundError" + "409": + description: Deletion is already in progress, continue polling + content: + application/json: + schema: + $ref: "#/components/schemas/ConflictError" + "500": + description: Generic operation error + content: + application/json: + schema: + $ref: "#/components/schemas/Error" /v1/tenant/{tenant_id}/timeline: parameters: @@ -820,6 +861,7 @@ paths: application/json: schema: $ref: "#/components/schemas/Error" + /v1/tenant/config: put: description: | diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index c10223beedefa..1e8dada85eb10 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -187,7 +187,7 @@ impl From<DeleteTimelineError> for ApiError { format!("Cannot delete timeline which has child timelines: {children:?}") .into_boxed_str(), ), - a @ AlreadyInProgress => ApiError::Conflict(a.to_string()), + a @ AlreadyInProgress(_) => ApiError::Conflict(a.to_string()), Other(e) => ApiError::InternalServerError(e), } } @@ -208,6 +208,19 @@ impl From for ApiError { } } +impl From<crate::tenant::delete::DeleteTenantError> for ApiError { + fn from(value: crate::tenant::delete::DeleteTenantError) -> Self { + use crate::tenant::delete::DeleteTenantError::*; + match value { + Get(g) => ApiError::from(g), + e @ AlreadyInProgress => ApiError::Conflict(e.to_string()), + Timeline(t) => ApiError::from(t), + Other(o) => ApiError::InternalServerError(o), + e @ InvalidState(_) => ApiError::PreconditionFailed(e.to_string().into_boxed_str()), + } + } +} + // Helper function to construct a TimelineInfo struct for a timeline async fn build_timeline_info( timeline: &Arc<Timeline>, @@ -617,6 +630,23 @@ async fn tenant_status( json_response(StatusCode::OK, tenant_info) } +async fn tenant_delete_handler( + request: Request<Body>, + _cancel: CancellationToken, +) -> Result<Response<Body>, ApiError> { + // TODO openapi spec + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + check_permission(&request, Some(tenant_id))?; + + let state = get_state(&request); + + mgr::delete_tenant(state.conf, state.remote_storage.clone(), tenant_id) + .instrument(info_span!("tenant_delete_handler", %tenant_id)) + .await?; + + json_response(StatusCode::ACCEPTED, ()) +} + /// HTTP endpoint to query the current tenant_size of a tenant.
/// /// This is not used by consumption metrics under [`crate::consumption_metrics`], but can be used @@ -1345,6 +1375,9 @@ pub fn make_router( .get("/v1/tenant", |r| api_handler(r, tenant_list_handler)) .post("/v1/tenant", |r| api_handler(r, tenant_create_handler)) .get("/v1/tenant/:tenant_id", |r| api_handler(r, tenant_status)) + .delete("/v1/tenant/:tenant_id", |r| { + api_handler(r, tenant_delete_handler) + }) .get("/v1/tenant/:tenant_id/synthetic_size", |r| { api_handler(r, tenant_size_handler) }) diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index a3a9497d57f33..4ff8ffcc4fc8b 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -95,28 +95,6 @@ pub async fn shutdown_pageserver(exit_code: i32) { std::process::exit(exit_code); } -const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1; -const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0; - -async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) { - let backoff_duration_seconds = - exponential_backoff_duration_seconds(n, base_increment, max_seconds); - if backoff_duration_seconds > 0.0 { - info!( - "Backoff: waiting {backoff_duration_seconds} seconds before processing with the task", - ); - tokio::time::sleep(std::time::Duration::from_secs_f64(backoff_duration_seconds)).await; - } -} - -pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 { - if n == 0 { - 0.0 - } else { - (1.0 + base_increment).powf(f64::from(n)).min(max_seconds) - } -} - /// The name of the metadata file pageserver creates per timeline. /// Full path: `tenants/<tenant_id>/timelines/<timeline_id>/metadata`. pub const METADATA_FILE_NAME: &str = "metadata"; @@ -190,7 +168,7 @@ pub struct InitializationOrder { /// Each timeline owns a clone of this to be consumed on the initial logical size calculation /// attempt. It is important to drop this once the attempt has completed. - pub initial_logical_size_attempt: utils::completion::Completion, + pub initial_logical_size_attempt: Option<utils::completion::Completion>, /// Barrier for when we can start any background jobs.
/// @@ -226,6 +204,7 @@ async fn timed( let ret = fut.await; + // this has a global allowed_errors tracing::warn!( task = name, elapsed_ms = started.elapsed().as_millis(), @@ -237,37 +216,6 @@ async fn timed( } } -#[cfg(test)] -mod backoff_defaults_tests { - use super::*; - - #[test] - fn backoff_defaults_produce_growing_backoff_sequence() { - let mut current_backoff_value = None; - - for i in 0..10_000 { - let new_backoff_value = exponential_backoff_duration_seconds( - i, - DEFAULT_BASE_BACKOFF_SECONDS, - DEFAULT_MAX_BACKOFF_SECONDS, - ); - - if let Some(old_backoff_value) = current_backoff_value.replace(new_backoff_value) { - assert!( - old_backoff_value <= new_backoff_value, - "{i}th backoff value {new_backoff_value} is smaller than the previous one {old_backoff_value}" - ) - } - } - - assert_eq!( - current_backoff_value.expect("Should have produced backoff values to compare"), - DEFAULT_MAX_BACKOFF_SECONDS, - "Given big enough of retries, backoff should reach its allowed max value" - ); - } -} - #[cfg(test)] mod timed_tests { use super::timed; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index a9885a5f700c1..524e3a60a1c07 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -28,6 +28,7 @@ use std::cmp::min; use std::collections::hash_map::Entry; use std::collections::BTreeSet; use std::collections::HashMap; +use std::fmt::Debug; use std::fs; use std::fs::File; use std::fs::OpenOptions; @@ -46,8 +47,10 @@ use std::sync::{Mutex, RwLock}; use std::time::{Duration, Instant}; use self::config::TenantConf; +use self::delete::DeleteTenantFlow; use self::metadata::LoadMetadataError; use self::metadata::TimelineMetadata; +use self::mgr::TenantsMap; use self::remote_timeline_client::RemoteTimelineClient; use self::timeline::uninit::TimelineUninitMark; use self::timeline::uninit::UninitializedTimeline; @@ -106,6 +109,7 @@ macro_rules! pausable_failpoint { pub mod blob_io; pub mod block_io; + pub mod disk_btree; pub(crate) mod ephemeral_file; pub mod layer_map; @@ -118,6 +122,7 @@ mod remote_timeline_client; pub mod storage_layer; pub mod config; +pub mod delete; pub mod mgr; pub mod tasks; pub mod upload_queue; @@ -145,6 +150,8 @@ pub const TIMELINES_SEGMENT_NAME: &str = "timelines"; pub const TENANT_ATTACHING_MARKER_FILENAME: &str = "attaching"; +pub const TENANT_DELETED_MARKER_FILE_NAME: &str = "deleted"; + /// /// Tenant consists of multiple timelines. Keep them in a hash table. /// @@ -183,6 +190,8 @@ pub struct Tenant { cached_synthetic_tenant_size: Arc<AtomicU64>, eviction_task_tenant_state: tokio::sync::Mutex<EvictionTaskTenantState>, + + pub(crate) delete_progress: Arc<tokio::sync::Mutex<DeleteTenantFlow>>, } // We should not blindly overwrite local metadata with remote one.
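
The `delete_progress` field added to `Tenant` above is the synchronization point for the whole deletion flow: `DeleteTenantFlow` takes an owned lock on it, so at most one deletion can run per tenant, and a concurrent request observes "already in progress" instead of queueing. A standalone sketch of that guard pattern, with illustrative names rather than the real pageserver types:

use std::sync::Arc;
use tokio::sync::{Mutex, OwnedMutexGuard};

#[derive(Default)]
enum Flow {
    #[default]
    NotStarted,
    InProgress,
    Finished,
}

// First caller gets the owned guard and drives the state machine; later
// callers see the lock held and report "already in progress" immediately.
fn try_begin(progress: &Arc<Mutex<Flow>>) -> Result<OwnedMutexGuard<Flow>, &'static str> {
    Arc::clone(progress)
        .try_lock_owned()
        .map_err(|_| "deletion already in progress")
}

#[tokio::main]
async fn main() {
    let progress = Arc::new(Mutex::new(Flow::default()));
    let mut guard = try_begin(&progress).expect("first caller wins");
    *guard = Flow::InProgress;
    assert!(try_begin(&progress).is_err()); // concurrent caller is rejected
    *guard = Flow::Finished;
}

Because the guard is an *owned* mutex guard, it can be moved into the spawned background task and released only when deletion finishes or fails, which is exactly what makes the 409/retry behavior of the HTTP handler observable.
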
@@ -274,7 +283,7 @@ pub enum LoadLocalTimelineError { ResumeDeletion(#[source] anyhow::Error), } -#[derive(Debug, thiserror::Error)] +#[derive(thiserror::Error)] pub enum DeleteTimelineError { #[error("NotFound")] NotFound, HasChildren(Vec<TimelineId>), #[error("Timeline deletion is already in progress")] - AlreadyInProgress, + AlreadyInProgress(Arc<tokio::sync::Mutex<DeleteTimelineFlow>>), #[error(transparent)] Other(#[from] anyhow::Error), } +impl Debug for DeleteTimelineError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::NotFound => write!(f, "NotFound"), + Self::HasChildren(c) => f.debug_tuple("HasChildren").field(c).finish(), + Self::AlreadyInProgress(_) => f.debug_tuple("AlreadyInProgress").finish(), + Self::Other(e) => f.debug_tuple("Other").field(e).finish(), + } + } +} + pub enum SetStoppingError { AlreadyStopping(completion::Barrier), Broken, } +impl Debug for SetStoppingError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::AlreadyStopping(_) => f.debug_tuple("AlreadyStopping").finish(), + Self::Broken => write!(f, "Broken"), + } + } +} + struct RemoteStartupData { index_part: IndexPart, remote_metadata: TimelineMetadata, @@ -616,7 +645,7 @@ impl Tenant { // For every timeline, download the metadata file, scan the local directory, // and build a layer map that contains an entry for each remote and local // layer file. - let sorted_timelines = tree_sort_timelines(timeline_ancestors)?; + let sorted_timelines = tree_sort_timelines(timeline_ancestors, |m| m.ancestor_timeline())?; for (timeline_id, remote_metadata) in sorted_timelines { let (index_part, remote_client) = remote_index_and_client .remove(&timeline_id) @@ -740,12 +769,13 @@ impl Tenant { /// If the loading fails for some reason, the Tenant will go into Broken /// state. #[instrument(skip_all, fields(tenant_id=%tenant_id))] - pub fn spawn_load( + pub(crate) fn spawn_load( conf: &'static PageServerConf, tenant_id: TenantId, broker_client: storage_broker::BrokerClientChannel, remote_storage: Option<GenericRemoteStorage>, init_order: Option<InitializationOrder>, + tenants: &'static tokio::sync::RwLock<TenantsMap>, ctx: &RequestContext, ) -> Arc<Tenant> { span::debug_assert_current_span_has_tenant_id(); @@ -765,7 +795,7 @@ impl Tenant { tenant_conf, wal_redo_manager, tenant_id, - remote_storage, + remote_storage.clone(), ); let tenant = Arc::new(tenant); @@ -781,27 +811,83 @@ impl Tenant { "initial tenant load", false, async move { + let make_broken = |t: &Tenant, err: anyhow::Error| { + error!("load failed, setting tenant state to Broken: {err:?}"); + t.state.send_modify(|state| { + assert!( + matches!(*state, TenantState::Loading | TenantState::Stopping { .. }), + "the loading task owns the tenant state until activation is complete" + ); + *state = TenantState::broken_from_reason(err.to_string()); + }); + }; + let mut init_order = init_order; // take the completion because initial tenant loading will complete when all of // these tasks complete.
- let _completion = init_order.as_mut().and_then(|x| x.initial_tenant_load.take()); + let _completion = init_order + .as_mut() + .and_then(|x| x.initial_tenant_load.take()); + + // Don't block pageserver startup on figuring out deletion status + let pending_deletion = { + match DeleteTenantFlow::should_resume_deletion( + conf, + remote_storage.as_ref(), + &tenant_clone, + ) + .await + { + Ok(should_resume_deletion) => should_resume_deletion, + Err(err) => { + make_broken(&tenant_clone, anyhow::anyhow!(err)); + return Ok(()); + } + } + }; + + info!("pending deletion {}", pending_deletion.is_some()); + + if let Some(deletion) = pending_deletion { + // as we are no longer loading, signal completion by dropping + // the completion while we resume deletion + drop(_completion); + // do not hold on to initial_logical_size_attempt, as it would prevent loading from proceeding without a timeout + let _ = init_order + .as_mut() + .and_then(|x| x.initial_logical_size_attempt.take()); + + match DeleteTenantFlow::resume( + deletion, + &tenant_clone, + init_order.as_ref(), + tenants, + &ctx, + ) + .await + { + Err(err) => { + make_broken(&tenant_clone, anyhow::anyhow!(err)); + return Ok(()); + } + Ok(()) => return Ok(()), + } + } + + let background_jobs_can_start = + init_order.as_ref().map(|x| &x.background_jobs_can_start); match tenant_clone.load(init_order.as_ref(), &ctx).await { Ok(()) => { - debug!("load finished, activating"); - let background_jobs_can_start = init_order.as_ref().map(|x| &x.background_jobs_can_start); + debug!("load finished"); + tenant_clone.activate(broker_client, background_jobs_can_start, &ctx); } - Err(err) => { - error!("load failed, setting tenant state to Broken: {err:?}"); - tenant_clone.state.send_modify(|state| { - assert_eq!(*state, TenantState::Loading, "the loading task owns the tenant state until activation is complete"); - *state = TenantState::broken_from_reason(err.to_string()); - }); - } + Err(err) => make_broken(&tenant_clone, err), } - Ok(()) + + Ok(()) } .instrument({ let span = tracing::info_span!(parent: None, "load", tenant_id=%tenant_id); @@ -877,6 +963,8 @@ impl Tenant { ) })?; + info!("Found deletion mark for timeline {}", timeline_id); + match load_metadata(self.conf, &self.tenant_id, &timeline_id) { Ok(metadata) => { timelines_to_resume_deletion.push((timeline_id, Some(metadata))) @@ -966,9 +1054,11 @@ impl Tenant { // Sort the array of timeline IDs into tree-order, so that parent comes before // all its children. - tree_sort_timelines(timelines_to_load).map(|sorted_timelines| TenantDirectoryScan { - sorted_timelines_to_load: sorted_timelines, - timelines_to_resume_deletion, + tree_sort_timelines(timelines_to_load, |m| m.ancestor_timeline()).map(|sorted_timelines| { + TenantDirectoryScan { + sorted_timelines_to_load: sorted_timelines, + timelines_to_resume_deletion, + } }) } @@ -1682,7 +1772,7 @@ impl Tenant { // It's messed up. // we just ignore the failure to stop - match self.set_stopping(shutdown_progress).await { + match self.set_stopping(shutdown_progress, false).await { Ok(()) => {} Err(SetStoppingError::Broken) => { // assume that this is acceptable @@ -1722,18 +1812,25 @@ impl Tenant { /// This function waits for the tenant to become active if it isn't already, before transitioning it into Stopping state. /// /// This function is not cancel-safe! - async fn set_stopping(&self, progress: completion::Barrier) -> Result<(), SetStoppingError> { + /// + /// `allow_transition_from_loading` is needed for the special case of the loading task deleting the tenant.
+ async fn set_stopping( + &self, + progress: completion::Barrier, + allow_transition_from_loading: bool, + ) -> Result<(), SetStoppingError> { let mut rx = self.state.subscribe(); // cannot stop before we're done activating, so wait out until we're done activating rx.wait_for(|state| match state { - TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => { + TenantState::Activating(_) | TenantState::Attaching => { info!( "waiting for {} to turn Active|Broken|Stopping", <&'static str>::from(state) ); false } + TenantState::Loading => allow_transition_from_loading, TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping { .. } => true, }) .await @@ -1742,9 +1839,16 @@ impl Tenant { // we now know we're done activating, let's see whether this task is the winner to transition into Stopping let mut err = None; let stopping = self.state.send_if_modified(|current_state| match current_state { - TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => { + TenantState::Activating(_) | TenantState::Attaching => { unreachable!("we ensured above that we're done with activation, and, there is no re-activation") } + TenantState::Loading => { + if !allow_transition_from_loading { + unreachable!("we ensured above that we're done with activation, and, there is no re-activation") + }; + *current_state = TenantState::Stopping { progress }; + true + } TenantState::Active => { // FIXME: due to time-of-check vs time-of-use issues, it can happen that new timelines // are created after the transition to Stopping. That's harmless, as the Timelines @@ -1813,6 +1917,10 @@ impl Tenant { .expect("cannot drop self.state while on a &self method"); // we now know we're done activating, let's see whether this task is the winner to transition into Broken + self.set_broken_no_wait(reason) + } + + pub(crate) fn set_broken_no_wait(&self, reason: String) { self.state.send_modify(|current_state| { match *current_state { TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => { @@ -1878,22 +1986,28 @@ impl Tenant { /// Given a Vec of timelines and their ancestors (timeline_id, ancestor_id), /// perform a topological sort, so that the parent of each timeline comes /// before the children. -fn tree_sort_timelines( - timelines: HashMap<TimelineId, TimelineMetadata>, -) -> anyhow::Result<Vec<(TimelineId, TimelineMetadata)>> { +/// `E` extracts the ancestor from `T`. +/// This allows `T` to vary: it can be `TimelineMetadata`, `Timeline` itself, etc.
+fn tree_sort_timelines<T, E>( + timelines: HashMap<TimelineId, T>, + extractor: E, +) -> anyhow::Result<Vec<(TimelineId, T)>> +where + E: Fn(&T) -> Option<TimelineId>, +{ let mut result = Vec::with_capacity(timelines.len()); let mut now = Vec::with_capacity(timelines.len()); // (ancestor, children) - let mut later: HashMap<TimelineId, Vec<(TimelineId, TimelineMetadata)>> = + let mut later: HashMap<TimelineId, Vec<(TimelineId, T)>> = HashMap::with_capacity(timelines.len()); - for (timeline_id, metadata) in timelines { - if let Some(ancestor_id) = metadata.ancestor_timeline() { + for (timeline_id, value) in timelines { + if let Some(ancestor_id) = extractor(&value) { let children = later.entry(ancestor_id).or_default(); - children.push((timeline_id, metadata)); + children.push((timeline_id, value)); } else { - now.push((timeline_id, metadata)); + now.push((timeline_id, value)); } } @@ -2062,7 +2176,7 @@ impl Tenant { remote_client, pg_version, initial_logical_size_can_start.cloned(), - initial_logical_size_attempt.cloned(), + initial_logical_size_attempt.cloned().flatten(), state, ); @@ -2146,6 +2260,7 @@ impl Tenant { cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()), cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)), eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()), + delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTenantFlow::default())), } } @@ -2162,6 +2277,7 @@ impl Tenant { // FIXME If the config file is not found, assume that we're attaching // a detached tenant and config is passed via attach command. // https://github.com/neondatabase/neon/issues/1555 + // OR: we're loading after incomplete deletion that managed to remove config. if !target_config_path.exists() { info!("tenant config not found in {target_config_display}"); return Ok(TenantConfOpt::default()); diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index 7dd53407e73f9..fd211e8276606 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -21,14 +21,14 @@ where R: BlockReader, { /// Read a blob into a new buffer. - pub fn read_blob(&self, offset: u64) -> Result<Vec<u8>, std::io::Error> { + pub async fn read_blob(&self, offset: u64) -> Result<Vec<u8>, std::io::Error> { let mut buf = Vec::new(); - self.read_blob_into_buf(offset, &mut buf)?; + self.read_blob_into_buf(offset, &mut buf).await?; Ok(buf) } /// Read blob into the given buffer. Any previous contents in the buffer /// are overwritten.
- pub fn read_blob_into_buf( + pub async fn read_blob_into_buf( &self, offset: u64, dstbuf: &mut Vec<u8>, diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs new file mode 100644 index 0000000000000..38fc31f69c1cc --- /dev/null +++ b/pageserver/src/tenant/delete.rs @@ -0,0 +1,559 @@ +use std::{ + path::{Path, PathBuf}, + sync::Arc, +}; + +use anyhow::Context; +use pageserver_api::models::TenantState; +use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath}; +use tokio::sync::OwnedMutexGuard; +use tracing::{error, info, instrument, warn, Instrument, Span}; + +use utils::{ + backoff, completion, crashsafe, fs_ext, + id::{TenantId, TimelineId}, +}; + +use crate::{ + config::PageServerConf, + context::RequestContext, + task_mgr::{self, TaskKind}, + InitializationOrder, +}; + +use super::{ + mgr::{GetTenantError, TenantsMap}, + remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD}, + span, + timeline::delete::DeleteTimelineFlow, + tree_sort_timelines, DeleteTimelineError, Tenant, +}; + +const SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS: u32 = 3; + +#[derive(Debug, thiserror::Error)] +pub enum DeleteTenantError { + #[error("GetTenant {0}")] + Get(#[from] GetTenantError), + + #[error("Invalid state {0}. Expected Active or Broken")] + InvalidState(TenantState), + + #[error("Tenant deletion is already in progress")] + AlreadyInProgress, + + #[error("Timeline {0}")] + Timeline(#[from] DeleteTimelineError), + + #[error(transparent)] + Other(#[from] anyhow::Error), +} + +type DeletionGuard = tokio::sync::OwnedMutexGuard<DeleteTenantFlow>; + +fn remote_tenant_delete_mark_path( + conf: &PageServerConf, + tenant_id: &TenantId, +) -> anyhow::Result<RemotePath> { + let tenant_remote_path = conf + .tenant_path(tenant_id) + .strip_prefix(&conf.workdir) + .context("Failed to strip workdir prefix") + .and_then(RemotePath::new) + .context("tenant path")?; + Ok(tenant_remote_path.join(Path::new("deleted"))) +} + +async fn create_remote_delete_mark( + conf: &PageServerConf, + remote_storage: &GenericRemoteStorage, + tenant_id: &TenantId, +) -> Result<(), DeleteTenantError> { + let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_id)?; + + let data: &[u8] = &[]; + backoff::retry( + || async { + remote_storage + .upload(data, 0, &remote_mark_path, None) + .await + }, + |_e| false, + FAILED_UPLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, + "mark_upload", + ) + .await + .context("mark_upload")?; + + Ok(()) +} + +async fn create_local_delete_mark( + conf: &PageServerConf, + tenant_id: &TenantId, +) -> Result<(), DeleteTenantError> { + let marker_path = conf.tenant_deleted_mark_file_path(tenant_id); + + // Note: we're ok to replace existing file. + let _ = std::fs::OpenOptions::new() + .write(true) + .create(true) + .open(&marker_path) + .with_context(|| format!("could not create delete marker file {marker_path:?}"))?; + + crashsafe::fsync_file_and_parent(&marker_path).context("sync_mark")?; + + Ok(()) +} + +async fn schedule_ordered_timeline_deletions( + tenant: &Arc<Tenant>, +) -> Result<Vec<(Arc<tokio::sync::Mutex<DeleteTimelineFlow>>, TimelineId)>, DeleteTenantError> { + // Tenant is stopping at this point. We know it will be deleted. + // No new timelines should be created. + // Tree sort timelines to delete from leaves to the root. + // NOTE: by calling clone we release the mutex which creates a possibility for a race: pending deletion + // can complete and remove timeline from the map in between our call to clone + // and `DeleteTimelineFlow::run`, so `run` won't find the timeline in the `timelines` map.
+ // timelines.lock is currently synchronous so we can't hold it across an await point. + // So just ignore NotFound error if we get it from `run`. + // Beware: in case it becomes async and we try to hold it here, `run` also locks it, which can create a deadlock. + let timelines = tenant.timelines.lock().unwrap().clone(); + let sorted = + tree_sort_timelines(timelines, |t| t.get_ancestor_timeline_id()).context("tree sort")?; + + let mut already_running_deletions = vec![]; + + for (timeline_id, _) in sorted.into_iter().rev() { + if let Err(e) = DeleteTimelineFlow::run(tenant, timeline_id, true).await { + match e { + DeleteTimelineError::NotFound => { + // Timeline deletion finished after call to clone above but before call + // to `DeleteTimelineFlow::run` and removed timeline from the map. + continue; + } + DeleteTimelineError::AlreadyInProgress(guard) => { + already_running_deletions.push((guard, timeline_id)); + continue; + } + e => return Err(DeleteTenantError::Timeline(e)), + } + } + } + + Ok(already_running_deletions) +} + +async fn ensure_timelines_dir_empty(timelines_path: &Path) -> Result<(), DeleteTenantError> { + // Assert timelines dir is empty. + if !fs_ext::is_directory_empty(timelines_path).await? { + // Display up to the first 10 items in the directory; a fixed [..10] slice would panic when fewer entries remain + let list: Vec<String> = fs_ext::list_dir(timelines_path) + .await + .context("list_dir")? + .into_iter() + .take(10) + .collect(); + return Err(DeleteTenantError::Other(anyhow::anyhow!( + "Timelines directory is not empty after all timelines deletion: {list:?}" + ))); + } + + Ok(()) +} + +async fn remove_tenant_remote_delete_mark( + conf: &PageServerConf, + remote_storage: Option<&GenericRemoteStorage>, + tenant_id: &TenantId, +) -> Result<(), DeleteTenantError> { + if let Some(remote_storage) = remote_storage { + let path = remote_tenant_delete_mark_path(conf, tenant_id)?; + backoff::retry( + || async { remote_storage.delete(&path).await }, + |_e| false, + FAILED_UPLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, + "remove_tenant_remote_delete_mark", + ) + .await + .context("remove_tenant_remote_delete_mark")?; + } + Ok(()) +} + +// Cleanup fs traces: tenant config, timelines dir local delete mark, tenant dir +async fn cleanup_remaining_fs_traces( + conf: &PageServerConf, + tenant_id: &TenantId, +) -> Result<(), DeleteTenantError> { + let rm = |p: PathBuf, is_dir: bool| async move { + if is_dir { + tokio::fs::remove_dir(&p).await + } else { + tokio::fs::remove_file(&p).await + } + .or_else(fs_ext::ignore_not_found) + .with_context(|| { + let to_display = p.display(); + format!("failed to delete {to_display}") + }) + }; + + rm(conf.tenant_config_path(tenant_id), false).await?; + + fail::fail_point!("tenant-delete-before-remove-timelines-dir", |_| { + Err(anyhow::anyhow!( + "failpoint: tenant-delete-before-remove-timelines-dir" + ))? + }); + + rm(conf.timelines_path(tenant_id), true).await?; + + fail::fail_point!("tenant-delete-before-remove-deleted-mark", |_| { + Err(anyhow::anyhow!( + "failpoint: tenant-delete-before-remove-deleted-mark" + ))? + }); + + rm(conf.tenant_deleted_mark_file_path(tenant_id), false).await?; + + fail::fail_point!("tenant-delete-before-remove-tenant-dir", |_| { + Err(anyhow::anyhow!( + "failpoint: tenant-delete-before-remove-tenant-dir" + ))? + }); + + rm(conf.tenant_path(tenant_id), true).await?; + + Ok(()) +} + +/// Orchestrates tenant shutdown of all tasks, removes its in-memory structures, +/// and deletes its data from both disk and S3. +/// The sequence of steps: +/// 1. Upload remote deletion mark. +/// 2. Create local mark file. +/// 3. Shutdown tasks +/// 4.
Run ordered timeline deletions +/// 5. Wait for timeline deletion operations that were scheduled before tenant deletion was requested +/// 6. Remove remote mark +/// 7. Cleanup remaining fs traces, tenant dir, config, timelines dir, local delete mark +/// It is resumable from any step in case a crash/restart occurs. +/// There are two entrypoints to the process: +/// 1. [`DeleteTenantFlow::run`] is the main one, called by the management api handler. +/// 2. [`DeleteTenantFlow::resume`] is called during restarts when local or remote deletion marks are still there. +/// Note the only other place that messes around with the tenant delete mark is the `Tenant::spawn_load` function. +#[derive(Default)] +pub enum DeleteTenantFlow { + #[default] + NotStarted, + InProgress, + Finished, +} + +impl DeleteTenantFlow { + // These steps are run in the context of the management api request handler. + // Long-running steps continue to run in the background. + // NB: If this fails half-way through, and is retried, the retry will go through + // all the same steps again. Make sure the code here is idempotent, and don't + // error out if some of the shutdown tasks have already been completed! + // NOTE: static needed for background part. + // We assume that calling code sets up the span with tenant_id. + #[instrument(skip_all)] + pub(crate) async fn run( + conf: &'static PageServerConf, + remote_storage: Option<GenericRemoteStorage>, + tenants: &'static tokio::sync::RwLock<TenantsMap>, + tenant_id: TenantId, + ) -> Result<(), DeleteTenantError> { + span::debug_assert_current_span_has_tenant_id(); + + let (tenant, mut guard) = Self::prepare(tenants, tenant_id).await?; + + if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await { + tenant.set_broken(format!("{e:#}")).await; + return Err(e); + } + + Self::schedule_background(guard, conf, remote_storage, tenants, tenant); + + Ok(()) + } + + // Helper function needed to be able to match once on the returned error and transition the tenant into the broken state. + // This is needed because tenant.shutdown is not idempotent. If tenant state is set to stopping, another call to tenant.shutdown + // will result in an error, but here we need to be able to retry shutdown when tenant deletion is retried. + // So the solution is to set tenant state to broken. + async fn run_inner( + guard: &mut OwnedMutexGuard<DeleteTenantFlow>, + conf: &'static PageServerConf, + remote_storage: Option<&GenericRemoteStorage>, + tenant: &Tenant, + ) -> Result<(), DeleteTenantError> { + guard.mark_in_progress()?; + + fail::fail_point!("tenant-delete-before-create-remote-mark", |_| { + Err(anyhow::anyhow!( + "failpoint: tenant-delete-before-create-remote-mark" + ))? + }); + + // IDEA: implement detach as delete without remote storage. Then they would use the same lock (deletion_progress) so they won't contend. + // Though sounds scary, different mark name? + // Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state. + if let Some(remote_storage) = &remote_storage { + create_remote_delete_mark(conf, remote_storage, &tenant.tenant_id) + .await + .context("remote_mark")? + } + + fail::fail_point!("tenant-delete-before-create-local-mark", |_| { + Err(anyhow::anyhow!( + "failpoint: tenant-delete-before-create-local-mark" + ))? + }); + + create_local_delete_mark(conf, &tenant.tenant_id) + .await + .context("local delete mark")?; + + fail::fail_point!("tenant-delete-before-background", |_| { + Err(anyhow::anyhow!( + "failpoint: tenant-delete-before-background" + ))?
+ }); + + Ok(()) + } + + fn mark_in_progress(&mut self) -> anyhow::Result<()> { + match self { + Self::Finished => anyhow::bail!("Bug. Is in finished state"), + Self::InProgress { .. } => { /* We're in a retry */ } + Self::NotStarted => { /* Fresh start */ } + } + + *self = Self::InProgress; + + Ok(()) + } + + pub async fn should_resume_deletion( + conf: &'static PageServerConf, + remote_storage: Option<&GenericRemoteStorage>, + tenant: &Tenant, + ) -> Result<Option<DeletionGuard>, DeleteTenantError> { + let acquire = |t: &Tenant| { + Some( + Arc::clone(&t.delete_progress) + .try_lock_owned() + .expect("we're the only owner during init"), + ) + }; + + let tenant_id = tenant.tenant_id; + // Check the local mark first; if it's there, there is no need to go to S3 to check whether the remote one exists. + if conf.tenant_deleted_mark_file_path(&tenant_id).exists() { + return Ok(acquire(tenant)); + } + + let remote_storage = match remote_storage { + Some(remote_storage) => remote_storage, + None => return Ok(None), + }; + + // If remote storage is there we rely on it + let remote_mark_path = remote_tenant_delete_mark_path(conf, &tenant_id)?; + + let result = backoff::retry( + || async { remote_storage.download(&remote_mark_path).await }, + |e| matches!(e, DownloadError::NotFound), + SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS, + SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS, + "fetch_tenant_deletion_mark", + ) + .await; + + match result { + Ok(_) => Ok(acquire(tenant)), + Err(DownloadError::NotFound) => Ok(None), + Err(e) => Err(anyhow::anyhow!(e)).context("should_resume_deletion")?, + } + } + + pub(crate) async fn resume( + guard: DeletionGuard, + tenant: &Arc<Tenant>, + init_order: Option<&InitializationOrder>, + tenants: &'static tokio::sync::RwLock<TenantsMap>, + ctx: &RequestContext, + ) -> Result<(), DeleteTenantError> { + let (_, progress) = completion::channel(); + + tenant + .set_stopping(progress, true) + .await + .expect("can't be stopping or broken"); + + // Do not consume valuable resources during the load phase, continue deletion once init phase is complete. + let background_jobs_can_start = init_order.as_ref().map(|x| &x.background_jobs_can_start); + if let Some(background) = background_jobs_can_start { + info!("waiting for background jobs barrier"); + background.clone().wait().await; + info!("ready for background jobs barrier"); + } + + // Tenant may not be loadable if we fail late in cleanup_remaining_fs_traces (e.g. removing the timelines dir) + let timelines_path = tenant.conf.timelines_path(&tenant.tenant_id); + if timelines_path.exists() { + tenant.load(init_order, ctx).await.context("load")?; + } + + Self::background( + guard, + tenant.conf, + tenant.remote_storage.clone(), + tenants, + tenant, + ) + .await + } + + async fn prepare( + tenants: &tokio::sync::RwLock<TenantsMap>, + tenant_id: TenantId, + ) -> Result<(Arc<Tenant>, tokio::sync::OwnedMutexGuard<DeleteTenantFlow>), DeleteTenantError> { + let m = tenants.read().await; + + let tenant = m + .get(&tenant_id) + .ok_or(GetTenantError::NotFound(tenant_id))?; + + // FIXME: unsure about active only. Our init jobs may not be cancellable properly, + // so at least for now allow deletions only for active tenants. TODO recheck + // Broken and Stopping are needed for retries. + if !matches!( + tenant.current_state(), + TenantState::Active | TenantState::Broken { ..
} + ) { + return Err(DeleteTenantError::InvalidState(tenant.current_state())); + } + + let guard = Arc::clone(&tenant.delete_progress) + .try_lock_owned() + .map_err(|_| DeleteTenantError::AlreadyInProgress)?; + + fail::fail_point!("tenant-delete-before-shutdown", |_| { + Err(anyhow::anyhow!("failpoint: tenant-delete-before-shutdown"))? + }); + + // make pageserver shutdown not wait for our completion + let (_, progress) = completion::channel(); + + // It would be good to only set stopping here and continue shutdown in the background, but shutdown is not idempotent. + // i.e. it is an error to do: + // tenant.set_stopping + // tenant.shutdown + // It's also bad that we're holding tenants.read here. + // TODO relax set_stopping to be idempotent? + if tenant.shutdown(progress, false).await.is_err() { + return Err(DeleteTenantError::Other(anyhow::anyhow!( + "tenant shutdown is already in progress" + ))); + } + + Ok((Arc::clone(tenant), guard)) + } + + fn schedule_background( + guard: OwnedMutexGuard<DeleteTenantFlow>, + conf: &'static PageServerConf, + remote_storage: Option<GenericRemoteStorage>, + tenants: &'static tokio::sync::RwLock<TenantsMap>, + tenant: Arc<Tenant>, + ) { + let tenant_id = tenant.tenant_id; + + task_mgr::spawn( + task_mgr::BACKGROUND_RUNTIME.handle(), + TaskKind::TimelineDeletionWorker, + Some(tenant_id), + None, + "tenant_delete", + false, + async move { + if let Err(err) = + Self::background(guard, conf, remote_storage, tenants, &tenant).await + { + error!("Error: {err:#}"); + tenant.set_broken(format!("{err:#}")).await; + }; + Ok(()) + } + .instrument({ + let span = tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_id); + span.follows_from(Span::current()); + span + }), + ); + } + + async fn background( + mut guard: OwnedMutexGuard<DeleteTenantFlow>, + conf: &PageServerConf, + remote_storage: Option<GenericRemoteStorage>, + tenants: &'static tokio::sync::RwLock<TenantsMap>, + tenant: &Arc<Tenant>, + ) -> Result<(), DeleteTenantError> { + // Tree sort timelines, schedule delete for them. Mention retries from the console side. + // Note that if deletion fails we don't mark timelines as broken; + // the whole tenant will become broken by the `Self::schedule_background` logic + let already_running_timeline_deletions = schedule_ordered_timeline_deletions(tenant) + .await + .context("schedule_ordered_timeline_deletions")?; + + fail::fail_point!("tenant-delete-before-polling-ongoing-deletions", |_| { + Err(anyhow::anyhow!( + "failpoint: tenant-delete-before-polling-ongoing-deletions" + ))? + }); + + // Wait for deletions that were already running at the moment when tenant deletion was requested. + // When we can lock the deletion guard, it means the corresponding timeline deletion has finished. + for (guard, timeline_id) in already_running_timeline_deletions { + let flow = guard.lock().await; + if !flow.is_finished() { + return Err(DeleteTenantError::Other(anyhow::anyhow!( + "already running timeline deletion failed: {timeline_id}" + ))); + } + } + + let timelines_path = conf.timelines_path(&tenant.tenant_id); + // May not exist if we fail in cleanup_remaining_fs_traces after removing it + if timelines_path.exists() { + // sanity check to guard against layout changes + ensure_timelines_dir_empty(&timelines_path) + .await + .context("timelines dir not empty")?; + } + + remove_tenant_remote_delete_mark(conf, remote_storage.as_ref(), &tenant.tenant_id).await?; + + fail::fail_point!("tenant-delete-before-cleanup-remaining-fs-traces", |_| { + Err(anyhow::anyhow!( + "failpoint: tenant-delete-before-cleanup-remaining-fs-traces" + ))?
+ }); + + cleanup_remaining_fs_traces(conf, &tenant.tenant_id) + .await + .context("cleanup_remaining_fs_traces")?; + + let mut locked = tenants.write().await; + if locked.remove(&tenant.tenant_id).is_none() { + warn!("Tenant got removed from tenants map during deletion"); + }; + + *guard = Self::Finished; + + Ok(()) + } +} diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs index b163d27b421e5..d0ebd3eb4e69b 100644 --- a/pageserver/src/tenant/disk_btree.rs +++ b/pageserver/src/tenant/disk_btree.rs @@ -20,6 +20,7 @@ //! use byteorder::{ReadBytesExt, BE}; use bytes::{BufMut, Bytes, BytesMut}; +use either::Either; use hex; use std::{cmp::Ordering, io, result}; use thiserror::Error; @@ -256,103 +257,77 @@ where where V: FnMut(&[u8], u64) -> bool, { - self.search_recurse(self.root_blk, search_key, dir, &mut visitor) - } - - fn search_recurse( - &self, - node_blknum: u32, - search_key: &[u8; L], - dir: VisitDirection, - visitor: &mut V, - ) -> Result - where - V: FnMut(&[u8], u64) -> bool, - { - // Locate the node. - let node_buf = self.reader.read_blk(self.start_blk + node_blknum)?; - - let node = OnDiskNode::deparse(node_buf.as_ref())?; - let prefix_len = node.prefix_len as usize; - let suffix_len = node.suffix_len as usize; - - assert!(node.num_children > 0); - - let mut keybuf = Vec::new(); - keybuf.extend(node.prefix); - keybuf.resize(prefix_len + suffix_len, 0); - - if dir == VisitDirection::Forwards { - // Locate the first match - let mut idx = match node.binary_search(search_key, keybuf.as_mut_slice()) { - Ok(idx) => idx, - Err(idx) => { - if node.level == 0 { - // Imagine that the node contains the following keys: - // - // 1 - // 3 <-- idx - // 5 - // - // If the search key is '2' and there is exact match, - // the binary search would return the index of key - // '3'. That's cool, '3' is the first key to return. - idx - } else { - // This is an internal page, so each key represents a lower - // bound for what's in the child page. If there is no exact - // match, we have to return the *previous* entry. - // - // 1 <-- return this - // 3 <-- idx - // 5 - idx.saturating_sub(1) + let mut stack = Vec::new(); + stack.push((self.root_blk, None)); + while let Some((node_blknum, opt_iter)) = stack.pop() { + // Locate the node. + let node_buf = self.reader.read_blk(self.start_blk + node_blknum)?; + + let node = OnDiskNode::deparse(node_buf.as_ref())?; + let prefix_len = node.prefix_len as usize; + let suffix_len = node.suffix_len as usize; + + assert!(node.num_children > 0); + + let mut keybuf = Vec::new(); + keybuf.extend(node.prefix); + keybuf.resize(prefix_len + suffix_len, 0); + + let mut iter = if let Some(iter) = opt_iter { + iter + } else if dir == VisitDirection::Forwards { + // Locate the first match + let idx = match node.binary_search(search_key, keybuf.as_mut_slice()) { + Ok(idx) => idx, + Err(idx) => { + if node.level == 0 { + // Imagine that the node contains the following keys: + // + // 1 + // 3 <-- idx + // 5 + // + // If the search key is '2' and there is exact match, + // the binary search would return the index of key + // '3'. That's cool, '3' is the first key to return. + idx + } else { + // This is an internal page, so each key represents a lower + // bound for what's in the child page. If there is no exact + // match, we have to return the *previous* entry. + // + // 1 <-- return this + // 3 <-- idx + // 5 + idx.saturating_sub(1) + } } - } - }; - // idx points to the first match now. 
Keep going from there - let mut key_off = idx * suffix_len; - while idx < node.num_children as usize { - let suffix = &node.keys[key_off..key_off + suffix_len]; - keybuf[prefix_len..].copy_from_slice(suffix); - let value = node.value(idx); - #[allow(clippy::collapsible_if)] - if node.level == 0 { - // leaf - if !visitor(&keybuf, value.to_u64()) { - return Ok(false); + }; + Either::Left(idx..node.num_children.into()) + } else { + let idx = match node.binary_search(search_key, keybuf.as_mut_slice()) { + Ok(idx) => { + // Exact match. That's the first entry to return, and walk + // backwards from there. + idx } - } else { - #[allow(clippy::collapsible_if)] - if !self.search_recurse(value.to_blknum(), search_key, dir, visitor)? { - return Ok(false); + Err(idx) => { + // No exact match. The binary search returned the index of the + // first key that's > search_key. Back off by one, and walk + // backwards from there. + if let Some(idx) = idx.checked_sub(1) { + idx + } else { + return Ok(false); + } } - } - idx += 1; - key_off += suffix_len; - } - } else { - let mut idx = match node.binary_search(search_key, keybuf.as_mut_slice()) { - Ok(idx) => { - // Exact match. That's the first entry to return, and walk - // backwards from there. (The loop below starts from 'idx - - // 1', so add one here to compensate.) - idx + 1 - } - Err(idx) => { - // No exact match. The binary search returned the index of the - // first key that's > search_key. Back off by one, and walk - // backwards from there. (The loop below starts from idx - 1, - // so we don't need to subtract one here) - idx - } + }; + Either::Right((0..=idx).rev()) }; - // idx points to the first match + 1 now. Keep going from there. - let mut key_off = idx * suffix_len; - while idx > 0 { - idx -= 1; - key_off -= suffix_len; + // idx points to the first match now. Keep going from there + while let Some(idx) = iter.next() { + let key_off = idx * suffix_len; let suffix = &node.keys[key_off..key_off + suffix_len]; keybuf[prefix_len..].copy_from_slice(suffix); let value = node.value(idx); @@ -363,12 +338,8 @@ where return Ok(false); } } else { - #[allow(clippy::collapsible_if)] - if !self.search_recurse(value.to_blknum(), search_key, dir, visitor)? 
{ - return Ok(false); - } - } - if idx == 0 { + stack.push((node_blknum, Some(iter))); + stack.push((value.to_blknum(), None)); break; } } diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index b088a3b6028ee..38a491f931c6a 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -401,17 +401,26 @@ mod tests { Ok(()) } - #[test] - fn test_ephemeral_blobs() -> Result<(), io::Error> { + #[tokio::test] + async fn test_ephemeral_blobs() -> Result<(), io::Error> { let (conf, tenant_id, timeline_id) = harness("ephemeral_blobs")?; let mut file = EphemeralFile::create(conf, tenant_id, timeline_id)?; let pos_foo = file.write_blob(b"foo")?; - assert_eq!(b"foo", file.block_cursor().read_blob(pos_foo)?.as_slice()); + assert_eq!( + b"foo", + file.block_cursor().read_blob(pos_foo).await?.as_slice() + ); let pos_bar = file.write_blob(b"bar")?; - assert_eq!(b"foo", file.block_cursor().read_blob(pos_foo)?.as_slice()); - assert_eq!(b"bar", file.block_cursor().read_blob(pos_bar)?.as_slice()); + assert_eq!( + b"foo", + file.block_cursor().read_blob(pos_foo).await?.as_slice() + ); + assert_eq!( + b"bar", + file.block_cursor().read_blob(pos_bar).await?.as_slice() + ); let mut blobs = Vec::new(); for i in 0..10000 { @@ -428,7 +437,7 @@ mod tests { let cursor = BlockCursor::new(&file); for (pos, expected) in blobs { - let actual = cursor.read_blob(pos)?; + let actual = cursor.read_blob(pos).await?; assert_eq!(actual, expected); } @@ -437,7 +446,7 @@ mod tests { large_data.resize(20000, 0); thread_rng().fill_bytes(&mut large_data); let pos_large = file.write_blob(&large_data)?; - let result = file.block_cursor().read_blob(pos_large)?; + let result = file.block_cursor().read_blob(pos_large).await?; assert_eq!(result, large_data); Ok(()) diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 2635953e6dabb..ae6d237066c42 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -20,17 +20,19 @@ use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::{self, TaskKind}; use crate::tenant::config::TenantConfOpt; +use crate::tenant::delete::DeleteTenantFlow; use crate::tenant::{create_tenant_files, CreateTenantFilesMode, Tenant, TenantState}; use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME}; use utils::fs_ext::PathExt; use utils::id::{TenantId, TimelineId}; +use super::delete::DeleteTenantError; use super::timeline::delete::DeleteTimelineFlow; /// The tenants known to the pageserver. /// The enum variants are used to distinguish the different states that the pageserver can be in. -enum TenantsMap { +pub(crate) enum TenantsMap { /// [`init_tenant_mgr`] is not done yet. Initializing, /// [`init_tenant_mgr`] is done, all on-disk tenants have been loaded. 
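For orientation before the accessor hunk below: a minimal sketch of the map shape this file works with, using toy stand-ins for the real `Tenant` and `TenantId` types. The point to notice is that both map-backed states answer lookups through a single or-pattern arm, so tenants stay addressable while shutdown is in progress:

```rust
use std::collections::HashMap;
use std::sync::Arc;

// Toy stand-ins; the real types live in the pageserver crates.
struct Tenant;
type TenantId = u64;

enum TenantsMap {
    Initializing,
    Open(HashMap<TenantId, Arc<Tenant>>),
    ShuttingDown(HashMap<TenantId, Arc<Tenant>>),
}

impl TenantsMap {
    fn get(&self, tenant_id: &TenantId) -> Option<&Arc<Tenant>> {
        match self {
            // No lookups are possible before init_tenant_mgr finishes.
            TenantsMap::Initializing => None,
            // Or-pattern: both map-backed states share one lookup path.
            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m.get(tenant_id),
        }
    }
}
```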
@@ -42,13 +44,13 @@ enum TenantsMap { } impl TenantsMap { - fn get(&self, tenant_id: &TenantId) -> Option<&Arc> { + pub(crate) fn get(&self, tenant_id: &TenantId) -> Option<&Arc> { match self { TenantsMap::Initializing => None, TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m.get(tenant_id), } } - fn remove(&mut self, tenant_id: &TenantId) -> Option> { + pub(crate) fn remove(&mut self, tenant_id: &TenantId) -> Option> { match self { TenantsMap::Initializing => None, TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m.remove(tenant_id), @@ -97,7 +99,9 @@ pub async fn init_tenant_mgr( ); } } else { - // This case happens if we crash during attach before creating the attach marker file + // This case happens if we: + // * crash during attach before creating the attach marker file + // * crash during tenant delete before removing tenant directory let is_empty = tenant_dir_path.is_empty_dir().with_context(|| { format!("Failed to check whether {tenant_dir_path:?} is an empty dir") })?; @@ -124,6 +128,7 @@ pub async fn init_tenant_mgr( broker_client.clone(), remote_storage.clone(), Some(init_order.clone()), + &TENANTS, &ctx, ) { Ok(tenant) => { @@ -154,12 +159,13 @@ pub async fn init_tenant_mgr( Ok(()) } -pub fn schedule_local_tenant_processing( +pub(crate) fn schedule_local_tenant_processing( conf: &'static PageServerConf, tenant_path: &Path, broker_client: storage_broker::BrokerClientChannel, remote_storage: Option, init_order: Option, + tenants: &'static tokio::sync::RwLock, ctx: &RequestContext, ) -> anyhow::Result> { anyhow::ensure!( @@ -219,6 +225,7 @@ pub fn schedule_local_tenant_processing( broker_client, remote_storage, init_order, + tenants, ctx, ) }; @@ -356,7 +363,7 @@ pub async fn create_tenant( // See https://github.com/neondatabase/neon/issues/4233 let created_tenant = - schedule_local_tenant_processing(conf, &tenant_directory, broker_client, remote_storage, None, ctx)?; + schedule_local_tenant_processing(conf, &tenant_directory, broker_client, remote_storage, None, &TENANTS, ctx)?; // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here. 
// See https://github.com/neondatabase/neon/issues/4233 @@ -417,6 +424,14 @@ pub async fn get_tenant( } } +pub async fn delete_tenant( + conf: &'static PageServerConf, + remote_storage: Option, + tenant_id: TenantId, +) -> Result<(), DeleteTenantError> { + DeleteTenantFlow::run(conf, remote_storage, &TENANTS, tenant_id).await +} + #[derive(Debug, thiserror::Error)] pub enum DeleteTimelineError { #[error("Tenant {0}")] @@ -432,7 +447,7 @@ pub async fn delete_timeline( _ctx: &RequestContext, ) -> Result<(), DeleteTimelineError> { let tenant = get_tenant(tenant_id, true).await?; - DeleteTimelineFlow::run(&tenant, timeline_id).await?; + DeleteTimelineFlow::run(&tenant, timeline_id, false).await?; Ok(()) } @@ -507,7 +522,7 @@ pub async fn load_tenant( .with_context(|| format!("Failed to remove tenant ignore mark {tenant_ignore_mark:?} during tenant loading"))?; } - let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, broker_client, remote_storage, None, ctx) + let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, broker_client, remote_storage, None, &TENANTS, ctx) .with_context(|| { format!("Failed to schedule tenant processing in path {tenant_path:?}") })?; @@ -588,7 +603,7 @@ pub async fn attach_tenant( .context("check for attach marker file existence")?; anyhow::ensure!(marker_file_exists, "create_tenant_files should have created the attach marker file"); - let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, broker_client, Some(remote_storage), None, ctx)?; + let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, broker_client, Some(remote_storage), None, &TENANTS, ctx)?; // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here. // See https://github.com/neondatabase/neon/issues/4233 diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 8d002a8570617..1d04d748396b1 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -211,6 +211,9 @@ use chrono::{NaiveDateTime, Utc}; // re-export these pub use download::{is_temp_download_file, list_remote_timelines}; use scopeguard::ScopeGuard; +use utils::backoff::{ + self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, +}; use std::collections::{HashMap, VecDeque}; use std::path::Path; @@ -241,7 +244,6 @@ use crate::{ tenant::upload_queue::{ UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, UploadTask, }, - {exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}, }; use utils::id::{TenantId, TimelineId}; @@ -256,12 +258,12 @@ use super::upload_queue::SetDeletedFlagProgress; // But after FAILED_DOWNLOAD_WARN_THRESHOLD retries, we start to log it at WARN // level instead, as repeated failures can mean a more serious problem. If it // fails more than FAILED_DOWNLOAD_RETRIES times, we give up -const FAILED_DOWNLOAD_WARN_THRESHOLD: u32 = 3; -const FAILED_DOWNLOAD_RETRIES: u32 = 10; +pub(crate) const FAILED_DOWNLOAD_WARN_THRESHOLD: u32 = 3; +pub(crate) const FAILED_REMOTE_OP_RETRIES: u32 = 10; // Similarly log failed uploads and deletions at WARN level, after this many // retries. Uploads and deletions are retried forever, though. 
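The hunks that follow route one-shot remote operations through the shared `utils::backoff::retry` helper instead of open-coded loops. As a quick illustration of the calling convention, here is a toy sketch; the operation, its error handling, and the local constants are made up, and it assumes the `utils` crate from this repository is available:

```rust
use utils::backoff;

// Illustrative mirrors of the thresholds defined above.
const WARN_THRESHOLD: u32 = 3;
const MAX_RETRIES: u32 = 10;

// A stand-in for a flaky remote call.
async fn flaky_op() -> Result<(), std::io::Error> {
    Err(std::io::Error::new(std::io::ErrorKind::Other, "transient"))
}

async fn demo() -> Result<(), std::io::Error> {
    backoff::retry(
        // FnMut closure: produces a fresh future for every attempt.
        || async { flaky_op().await },
        // Nothing is treated as permanent here, so every failure retries.
        |_e| false,
        WARN_THRESHOLD,
        MAX_RETRIES,
        "flaky_op",
    )
    .await
}
```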
-const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
+pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
 
 pub enum MaybeDeletedIndexPart {
     IndexPart(IndexPart),
@@ -752,12 +754,24 @@ impl RemoteTimelineClient {
 
         pausable_failpoint!("persist_deleted_index_part");
 
-        upload::upload_index_part(
-            self.conf,
-            &self.storage_impl,
-            &self.tenant_id,
-            &self.timeline_id,
-            &index_part_with_deleted_at,
+        backoff::retry(
+            || async {
+                upload::upload_index_part(
+                    self.conf,
+                    &self.storage_impl,
+                    &self.tenant_id,
+                    &self.timeline_id,
+                    &index_part_with_deleted_at,
+                )
+                .await
+            },
+            |_e| false,
+            1,
+            // Have just a couple of attempts: when executed as part of timeline
+            // deletion, this runs in the context of an API call; as part of
+            // tenant deletion, it runs in the background.
+            2,
+            "persist_index_part_with_deleted_flag",
         )
         .await?;
 
@@ -834,10 +848,19 @@ impl RemoteTimelineClient {
         let timeline_path = self.conf.timeline_path(&self.tenant_id, &self.timeline_id);
         let timeline_storage_path = self.conf.remote_path(&timeline_path)?;
 
-        let remaining = self
-            .storage_impl
-            .list_prefixes(Some(&timeline_storage_path))
-            .await?;
+        let remaining = backoff::retry(
+            || async {
+                self.storage_impl
+                    .list_prefixes(Some(&timeline_storage_path))
+                    .await
+            },
+            |_e| false,
+            FAILED_DOWNLOAD_WARN_THRESHOLD,
+            FAILED_REMOTE_OP_RETRIES,
+            "list_prefixes",
+        )
+        .await
+        .context("list prefixes")?;
 
         let remaining: Vec<RemotePath> = remaining
             .into_iter()
@@ -852,7 +875,15 @@ impl RemoteTimelineClient {
             .collect();
 
         if !remaining.is_empty() {
-            self.storage_impl.delete_objects(&remaining).await?;
+            backoff::retry(
+                || async { self.storage_impl.delete_objects(&remaining).await },
+                |_e| false,
+                FAILED_UPLOAD_WARN_THRESHOLD,
+                FAILED_REMOTE_OP_RETRIES,
+                "delete_objects",
+            )
+            .await
+            .context("delete_objects")?;
         }
 
         fail::fail_point!("timeline-delete-before-index-delete", |_| {
@@ -864,7 +895,16 @@ impl RemoteTimelineClient {
         let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME));
 
         debug!("deleting index part");
-        self.storage_impl.delete(&index_file_path).await?;
+
+        backoff::retry(
+            || async { self.storage_impl.delete(&index_file_path).await },
+            |_e| false,
+            FAILED_UPLOAD_WARN_THRESHOLD,
+            FAILED_REMOTE_OP_RETRIES,
+            "delete_index",
+        )
+        .await
+        .context("delete_index")?;
 
         fail::fail_point!("timeline-delete-after-index-delete", |_| {
             Err(anyhow::anyhow!(
diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs
index 64f4a0a113675..0a6fd03887405 100644
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -11,19 +11,17 @@ use std::time::Duration;
 use anyhow::{anyhow, Context};
 use tokio::fs;
 use tokio::io::AsyncWriteExt;
-
-use tracing::{info, warn};
+use utils::backoff;
 
 use crate::config::PageServerConf;
 use crate::tenant::storage_layer::LayerFileName;
 use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
-use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS};
 use remote_storage::{DownloadError, GenericRemoteStorage};
 use utils::crashsafe::path_with_suffix_extension;
 use utils::id::{TenantId, TimelineId};
 
 use super::index::{IndexPart, LayerFileMetadata};
-use super::{FAILED_DOWNLOAD_RETRIES, FAILED_DOWNLOAD_WARN_THRESHOLD};
+use super::{FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES};
 
 async fn fsync_path(path: impl AsRef<Path>) -> Result<(), std::io::Error> {
fs::File::open(path).await?.sync_all().await @@ -268,7 +266,6 @@ pub(super) async fn download_index_part( Ok(index_part) } -/// /// Helper function to handle retries for a download operation. /// /// Remote operations can fail due to rate limits (IAM, S3), spurious network @@ -276,47 +273,17 @@ pub(super) async fn download_index_part( /// with backoff. /// /// (See similar logic for uploads in `perform_upload_task`) -async fn download_retry(mut op: O, description: &str) -> Result +async fn download_retry(op: O, description: &str) -> Result where O: FnMut() -> F, F: Future>, { - let mut attempts = 0; - loop { - let result = op().await; - match result { - Ok(_) => { - if attempts > 0 { - info!("{description} succeeded after {attempts} retries"); - } - return result; - } - - // These are "permanent" errors that should not be retried. - Err(DownloadError::BadInput(_)) | Err(DownloadError::NotFound) => { - return result; - } - // Assume that any other failure might be transient, and the operation might - // succeed if we just keep trying. - Err(DownloadError::Other(err)) if attempts < FAILED_DOWNLOAD_WARN_THRESHOLD => { - info!("{description} failed, will retry (attempt {attempts}): {err:#}"); - } - Err(DownloadError::Other(err)) if attempts < FAILED_DOWNLOAD_RETRIES => { - warn!("{description} failed, will retry (attempt {attempts}): {err:#}"); - } - Err(DownloadError::Other(ref err)) => { - // Operation failed FAILED_DOWNLOAD_RETRIES times. Time to give up. - warn!("{description} still failed after {attempts} retries, giving up: {err:?}"); - return result; - } - } - // sleep and retry - exponential_backoff( - attempts, - DEFAULT_BASE_BACKOFF_SECONDS, - DEFAULT_MAX_BACKOFF_SECONDS, - ) - .await; - attempts += 1; - } + backoff::retry( + op, + |e| matches!(e, DownloadError::BadInput(_) | DownloadError::NotFound), + FAILED_DOWNLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, + description, + ) + .await } diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 951fa38d8d49c..4107161aa1839 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -8,7 +8,7 @@ mod layer_desc; mod remote_layer; use crate::config::PageServerConf; -use crate::context::RequestContext; +use crate::context::{AccessStatsBehavior, RequestContext}; use crate::repository::Key; use crate::task_mgr::TaskKind; use crate::walrecord::NeonWalRecord; @@ -241,10 +241,14 @@ impl LayerAccessStats { }); } - fn record_access(&self, access_kind: LayerAccessKind, task_kind: TaskKind) { + fn record_access(&self, access_kind: LayerAccessKind, ctx: &RequestContext) { + if ctx.access_stats_behavior() == AccessStatsBehavior::Skip { + return; + } + let this_access = LayerAccessStatFullDetails { when: SystemTime::now(), - task_kind, + task_kind: ctx.task_kind(), access_kind, }; @@ -252,7 +256,7 @@ impl LayerAccessStats { locked.iter_mut().for_each(|inner| { inner.first_access.get_or_insert(this_access); inner.count_by_access_kind[access_kind] += 1; - inner.task_kind_flag |= task_kind; + inner.task_kind_flag |= ctx.task_kind(); inner.last_accesses.write(this_access); }) } diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 037f56663d116..725ce7072ca5b 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -51,6 +51,7 @@ use std::ops::Range; use std::os::unix::fs::FileExt; use std::path::{Path, PathBuf}; use std::sync::Arc; 
+use tokio::runtime::Handle; use tokio::sync::OnceCell; use tracing::*; @@ -280,7 +281,8 @@ impl Layer for DeltaLayer { // A subroutine to dump a single blob let dump_blob = |blob_ref: BlobRef| -> anyhow::Result { - let buf = cursor.read_blob(blob_ref.pos())?; + // TODO this is not ideal, but on the other hand we are in dumping code... + let buf = Handle::current().block_on(cursor.read_blob(blob_ref.pos()))?; let val = Value::des(&buf)?; let desc = match val { Value::Image(img) => { @@ -335,7 +337,6 @@ impl Layer for DeltaLayer { let inner = self .load(LayerAccessKind::GetValueReconstructData, ctx) .await?; - inner .get_value_reconstruct_data(key, lsn_range, reconstruct_state) .await @@ -452,8 +453,7 @@ impl DeltaLayer { access_kind: LayerAccessKind, ctx: &RequestContext, ) -> Result<&Arc> { - self.access_stats - .record_access(access_kind, ctx.task_kind()); + self.access_stats.record_access(access_kind, ctx); // Quick exit if already loaded self.inner .get_or_try_init(|| self.load_inner()) @@ -909,12 +909,15 @@ impl DeltaLayerInner { let cursor = file.block_cursor(); let mut buf = Vec::new(); for (entry_lsn, pos) in offsets { - cursor.read_blob_into_buf(pos, &mut buf).with_context(|| { - format!( - "Failed to read blob from virtual file {}", - file.file.path.display() - ) - })?; + cursor + .read_blob_into_buf(pos, &mut buf) + .await + .with_context(|| { + format!( + "Failed to read blob from virtual file {}", + file.file.path.display() + ) + })?; let val = Value::des(&buf).with_context(|| { format!( "Failed to deserialize file blob from virtual file {}", @@ -1014,9 +1017,9 @@ pub struct ValueRef> { impl> ValueRef { /// Loads the value from disk - pub fn load(&self) -> Result { + pub async fn load(&self) -> Result { // theoretically we *could* record an access time for each, but it does not really matter - let buf = self.reader.read_blob(self.blob_ref.pos())?; + let buf = self.reader.read_blob(self.blob_ref.pos()).await?; let val = Value::des(&buf)?; Ok(val) } diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 2824abba754b4..3fe600e5fffae 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -323,8 +323,7 @@ impl ImageLayer { access_kind: LayerAccessKind, ctx: &RequestContext, ) -> Result<&ImageLayerInner> { - self.access_stats - .record_access(access_kind, ctx.task_kind()); + self.access_stats.record_access(access_kind, ctx); self.inner .get_or_try_init(|| self.load_inner()) .await @@ -471,6 +470,7 @@ impl ImageLayerInner { let blob = file .block_cursor() .read_blob(offset) + .await .with_context(|| format!("failed to read value from offset {}", offset))?; let value = Bytes::from(blob); diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 3d222fcb1ea32..aa9d0884e000c 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -16,6 +16,7 @@ use anyhow::{ensure, Result}; use pageserver_api::models::InMemoryLayerInfo; use std::cell::RefCell; use std::collections::HashMap; +use std::sync::OnceLock; use tracing::*; use utils::{ bin_ser::BeSer, @@ -27,7 +28,7 @@ use utils::{ // while being able to use std::fmt::Write's methods use std::fmt::Write as _; use std::ops::Range; -use std::sync::RwLock; +use tokio::sync::RwLock; use super::{DeltaLayer, DeltaLayerWriter, Layer}; @@ -42,14 +43,16 @@ pub struct 
InMemoryLayer { tenant_id: TenantId, timeline_id: TimelineId, - /// /// This layer contains all the changes from 'start_lsn'. The /// start is inclusive. - /// start_lsn: Lsn, - /// The above fields never change. The parts that do change are in 'inner', - /// and protected by mutex. + /// Frozen layers have an exclusive end LSN. + /// Writes are only allowed when this is `None`. + end_lsn: OnceLock, + + /// The above fields never change, except for `end_lsn`, which is only set once. + /// All other changing parts are in `inner`, and protected by a mutex. inner: RwLock, } @@ -57,21 +60,16 @@ impl std::fmt::Debug for InMemoryLayer { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("InMemoryLayer") .field("start_lsn", &self.start_lsn) + .field("end_lsn", &self.end_lsn) .field("inner", &self.inner) .finish() } } pub struct InMemoryLayerInner { - /// Frozen layers have an exclusive end LSN. - /// Writes are only allowed when this is None - end_lsn: Option, - - /// /// All versions of all pages in the layer are kept here. Indexed /// by block number and LSN. The value is an offset into the /// ephemeral file where the page version is stored. - /// index: HashMap>, /// The values are stored in a serialized format in this file. @@ -82,15 +80,7 @@ pub struct InMemoryLayerInner { impl std::fmt::Debug for InMemoryLayerInner { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("InMemoryLayerInner") - .field("end_lsn", &self.end_lsn) - .finish() - } -} - -impl InMemoryLayerInner { - fn assert_writeable(&self) { - assert!(self.end_lsn.is_none()); + f.debug_struct("InMemoryLayerInner").finish() } } @@ -101,13 +91,21 @@ impl InMemoryLayer { pub fn info(&self) -> InMemoryLayerInfo { let lsn_start = self.start_lsn; - let lsn_end = self.inner.read().unwrap().end_lsn; - match lsn_end { - Some(lsn_end) => InMemoryLayerInfo::Frozen { lsn_start, lsn_end }, - None => InMemoryLayerInfo::Open { lsn_start }, + if let Some(&lsn_end) = self.end_lsn.get() { + InMemoryLayerInfo::Frozen { lsn_start, lsn_end } + } else { + InMemoryLayerInfo::Open { lsn_start } } } + + fn assert_writable(&self) { + assert!(self.end_lsn.get().is_none()); + } + + fn end_lsn_or_max(&self) -> Lsn { + self.end_lsn.get().copied().unwrap_or(Lsn::MAX) + } } #[async_trait::async_trait] @@ -117,14 +115,7 @@ impl Layer for InMemoryLayer { } fn get_lsn_range(&self) -> Range { - let inner = self.inner.read().unwrap(); - - let end_lsn = if let Some(end_lsn) = inner.end_lsn { - end_lsn - } else { - Lsn(u64::MAX) - }; - self.start_lsn..end_lsn + self.start_lsn..self.end_lsn_or_max() } fn is_incremental(&self) -> bool { @@ -134,13 +125,9 @@ impl Layer for InMemoryLayer { /// debugging function to print out the contents of the layer async fn dump(&self, verbose: bool, _ctx: &RequestContext) -> Result<()> { - let inner = self.inner.read().unwrap(); + let inner = self.inner.read().await; - let end_str = inner - .end_lsn - .as_ref() - .map(Lsn::to_string) - .unwrap_or_default(); + let end_str = self.end_lsn_or_max(); println!( "----- in-memory layer for tli {} LSNs {}-{} ----", @@ -156,7 +143,7 @@ impl Layer for InMemoryLayer { for (key, vec_map) in inner.index.iter() { for (lsn, pos) in vec_map.as_slice() { let mut desc = String::new(); - cursor.read_blob_into_buf(*pos, &mut buf)?; + cursor.read_blob_into_buf(*pos, &mut buf).await?; let val = Value::des(&buf); match val { Ok(Value::Image(img)) => { @@ -194,7 +181,7 @@ impl Layer for InMemoryLayer { ensure!(lsn_range.start >= 
self.start_lsn); let mut need_image = true; - let inner = self.inner.read().unwrap(); + let inner = self.inner.read().await; let reader = inner.file.block_cursor(); @@ -202,7 +189,7 @@ impl Layer for InMemoryLayer { if let Some(vec_map) = inner.index.get(&key) { let slice = vec_map.slice_range(lsn_range); for (entry_lsn, pos) in slice.iter().rev() { - let buf = reader.read_blob(*pos)?; + let buf = reader.read_blob(*pos).await?; let value = Value::des(&buf)?; match value { Value::Image(img) => { @@ -236,9 +223,7 @@ impl Layer for InMemoryLayer { impl std::fmt::Display for InMemoryLayer { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let inner = self.inner.read().unwrap(); - - let end_lsn = inner.end_lsn.unwrap_or(Lsn(u64::MAX)); + let end_lsn = self.end_lsn_or_max(); write!(f, "inmem-{:016X}-{:016X}", self.start_lsn.0, end_lsn.0) } } @@ -247,8 +232,8 @@ impl InMemoryLayer { /// /// Get layer size on the disk /// - pub fn size(&self) -> Result { - let inner = self.inner.read().unwrap(); + pub async fn size(&self) -> Result { + let inner = self.inner.read().await; Ok(inner.file.size) } @@ -270,8 +255,8 @@ impl InMemoryLayer { timeline_id, tenant_id, start_lsn, + end_lsn: OnceLock::new(), inner: RwLock::new(InMemoryLayerInner { - end_lsn: None, index: HashMap::new(), file, }), @@ -282,10 +267,10 @@ impl InMemoryLayer { /// Common subroutine of the public put_wal_record() and put_page_image() functions. /// Adds the page version to the in-memory tree - pub fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> { + pub async fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> { trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn); - let mut inner = self.inner.write().unwrap(); - inner.assert_writeable(); + let mut inner = self.inner.write().await; + self.assert_writable(); let off = { SER_BUFFER.with(|x| -> Result<_> { @@ -316,11 +301,11 @@ impl InMemoryLayer { /// Make the layer non-writeable. Only call once. /// Records the end_lsn for non-dropped layers. /// `end_lsn` is exclusive - pub fn freeze(&self, end_lsn: Lsn) { - let mut inner = self.inner.write().unwrap(); + pub async fn freeze(&self, end_lsn: Lsn) { + let inner = self.inner.write().await; assert!(self.start_lsn < end_lsn); - inner.end_lsn = Some(end_lsn); + self.end_lsn.set(end_lsn).expect("end_lsn set only once"); for vec_map in inner.index.values() { for (lsn, _pos) in vec_map.as_slice() { @@ -332,7 +317,7 @@ impl InMemoryLayer { /// Write this frozen in-memory layer to disk. /// /// Returns a new delta layer with all the same data as this in-memory layer - pub fn write_to_disk(&self) -> Result { + pub async fn write_to_disk(&self) -> Result { // Grab the lock in read-mode. We hold it over the I/O, but because this // layer is not writeable anymore, no one should be trying to acquire the // write lock on it, so we shouldn't block anyone. There's one exception @@ -342,14 +327,16 @@ impl InMemoryLayer { // lock, it will see that it's not writeable anymore and retry, but it // would have to wait until we release it. That race condition is very // rare though, so we just accept the potential latency hit for now. 
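The structural change in this file: `end_lsn` leaves the lock-protected `inner` and becomes a `OnceLock` on the layer itself, so freeze-state questions need no lock at all. A standalone sketch of that set-once pattern, with a toy `FrozenMarker` type and plain `u64` standing in for `Lsn`:

```rust
use std::sync::OnceLock;

struct FrozenMarker {
    start: u64,
    // Empty while the layer is open; set exactly once on freeze.
    end: OnceLock<u64>,
}

impl FrozenMarker {
    fn freeze(&self, end: u64) {
        // A second call panics, matching the "only call once" contract.
        self.end.set(end).expect("end set only once");
    }

    fn range(&self) -> std::ops::Range<u64> {
        // Readers never block: an open layer just reports an unbounded end.
        self.start..self.end.get().copied().unwrap_or(u64::MAX)
    }
}
```

An open layer simply reports an unbounded end, which is the same answer `end_lsn_or_max` above gives.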
- let inner = self.inner.read().unwrap(); + let inner = self.inner.read().await; + + let end_lsn = *self.end_lsn.get().unwrap(); let mut delta_layer_writer = DeltaLayerWriter::new( self.conf, self.timeline_id, self.tenant_id, Key::MIN, - self.start_lsn..inner.end_lsn.unwrap(), + self.start_lsn..end_lsn, )?; let mut buf = Vec::new(); @@ -363,7 +350,7 @@ impl InMemoryLayer { let key = **key; // Write all page versions for (lsn, pos) in vec_map.as_slice() { - cursor.read_blob_into_buf(*pos, &mut buf)?; + cursor.read_blob_into_buf(*pos, &mut buf).await?; let will_init = Value::des(&buf)?.will_init(); delta_layer_writer.put_value_bytes(key, *lsn, &buf, will_init)?; } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 0074933902879..9b7402b7c758a 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -35,7 +35,9 @@ use std::sync::atomic::Ordering as AtomicOrdering; use std::sync::{Arc, Mutex, RwLock, Weak}; use std::time::{Duration, Instant, SystemTime}; -use crate::context::{DownloadBehavior, RequestContext}; +use crate::context::{ + AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder, +}; use crate::tenant::remote_timeline_client::{self, index::LayerFileMetadata}; use crate::tenant::storage_layer::delta_layer::DeltaEntry; use crate::tenant::storage_layer::{ @@ -800,10 +802,15 @@ impl Timeline { .await { Ok((partitioning, lsn)) => { + // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them + let image_ctx = RequestContextBuilder::extend(ctx) + .access_stats_behavior(AccessStatsBehavior::Skip) + .build(); + // 2. Create new image layers for partitions that have been modified // "enough". let layer_paths_to_upload = self - .create_image_layers(&partitioning, lsn, false, ctx) + .create_image_layers(&partitioning, lsn, false, &image_ctx) .await .map_err(anyhow::Error::from)?; if let Some(remote_client) = &self.remote_client { @@ -876,7 +883,7 @@ impl Timeline { let Some(open_layer) = layers.open_layer.as_ref() else { return Ok(()); }; - open_layer.size()? + open_layer.size().await? }; let last_freeze_at = self.last_freeze_at.load(); let last_freeze_ts = *(self.last_freeze_ts.read().unwrap()); @@ -920,7 +927,7 @@ impl Timeline { pub fn set_state(&self, new_state: TimelineState) { match (self.current_state(), new_state) { (equal_state_1, equal_state_2) if equal_state_1 == equal_state_2 => { - warn!("Ignoring new state, equal to the existing one: {equal_state_2:?}"); + info!("Ignoring new state, equal to the existing one: {equal_state_2:?}"); } (st, TimelineState::Loading) => { error!("ignoring transition from {st:?} into Loading state"); @@ -2648,7 +2655,7 @@ impl Timeline { async fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> anyhow::Result<()> { //info!("PUT: key {} at {}", key, lsn); let layer = self.get_layer_for_write(lsn).await?; - layer.put_value(key, lsn, val)?; + layer.put_value(key, lsn, val).await?; Ok(()) } @@ -2674,7 +2681,9 @@ impl Timeline { Some(self.write_lock.lock().await) }; let mut guard = self.layers.write().await; - guard.try_freeze_in_memory_layer(self.get_last_record_lsn(), &self.last_freeze_at); + guard + .try_freeze_in_memory_layer(self.get_last_record_lsn(), &self.last_freeze_at) + .await; } /// Layer flusher task's main loop. 
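Because `write_to_disk` is now async while the flush task still runs on a blocking thread, the flush path below drives it through the runtime handle. A hedged sketch of that bridge with placeholder I/O (`do_async_io` and `flush` are made-up names); it assumes a multi-threaded tokio runtime, since `block_on` must never run on an executor thread:

```rust
use tokio::runtime::Handle;

// Placeholder for the real async write-out.
async fn do_async_io() -> anyhow::Result<u64> {
    Ok(42)
}

async fn flush() -> anyhow::Result<u64> {
    // Capture the runtime handle while still on an executor thread.
    let handle = Handle::current();
    // spawn_blocking moves the work to a dedicated blocking thread, where
    // it is safe to block on a future via the captured handle.
    tokio::task::spawn_blocking(move || handle.block_on(do_async_io())).await?
}
```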
@@ -2956,7 +2965,11 @@ impl Timeline {
             let frozen_layer = Arc::clone(frozen_layer);
             move || {
                 // Write it out
-                let new_delta = frozen_layer.write_to_disk()?;
+                // Keep this inside `spawn_blocking` and `Handle::current`
+                // as long as the write path is still sync and the read impl
+                // is still not fully async. Otherwise executor threads would
+                // be blocked.
+                let new_delta = Handle::current().block_on(frozen_layer.write_to_disk())?;
                 let new_delta_path = new_delta.path();
 
                 // Sync it to disk.
@@ -3656,101 +3669,108 @@ impl Timeline {
         let mut key_values_total_size = 0u64;
         let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key
         let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key
-        for &DeltaEntry {
-            key, lsn, ref val, ..
-        } in all_values_iter
-        {
-            let value = val.load()?;
-            let same_key = prev_key.map_or(false, |prev_key| prev_key == key);
-            // We need to check key boundaries once we reach next key or end of layer with the same key
-            if !same_key || lsn == dup_end_lsn {
-                let mut next_key_size = 0u64;
-                let is_dup_layer = dup_end_lsn.is_valid();
-                dup_start_lsn = Lsn::INVALID;
-                if !same_key {
-                    dup_end_lsn = Lsn::INVALID;
-                }
-                // Determine size occupied by this key. We stop at next key or when size becomes larger than target_file_size
-                for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() {
-                    next_key_size = next_size;
-                    if key != next_key {
-                        if dup_end_lsn.is_valid() {
-                            // We are writting segment with duplicates:
-                            // place all remaining values of this key in separate segment
-                            dup_start_lsn = dup_end_lsn; // new segments starts where old stops
-                            dup_end_lsn = lsn_range.end; // there are no more values of this key till end of LSN range
+
+        // TODO remove this block_on wrapper once we fully go async
+        Handle::current().block_on(async {
+            for &DeltaEntry {
+                key, lsn, ref val, ..
+            } in all_values_iter
+            {
+                let value = val.load().await?;
+                let same_key = prev_key.map_or(false, |prev_key| prev_key == key);
+                // We need to check key boundaries once we reach next key or end of layer with the same key
+                if !same_key || lsn == dup_end_lsn {
+                    let mut next_key_size = 0u64;
+                    let is_dup_layer = dup_end_lsn.is_valid();
+                    dup_start_lsn = Lsn::INVALID;
+                    if !same_key {
+                        dup_end_lsn = Lsn::INVALID;
+                    }
+                    // Determine size occupied by this key. We stop at next key or when size becomes larger than target_file_size
+                    for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() {
+                        next_key_size = next_size;
+                        if key != next_key {
+                            if dup_end_lsn.is_valid() {
+                                // We are writing a segment with duplicates:
+                                // place all remaining values of this key in a separate segment
+                                dup_start_lsn = dup_end_lsn; // the new segment starts where the old one stops
+                                dup_end_lsn = lsn_range.end; // there are no more values of this key till end of LSN range
+                            }
+                            break;
+                        }
+                        key_values_total_size += next_size;
+                        // Check if it is time to split segment: if total keys size is larger than target file size.
+                        // We need to avoid generation of empty segments if next_size > target_file_size.
+ if key_values_total_size > target_file_size && lsn != next_lsn { + // Split key between multiple layers: such layer can contain only single key + dup_start_lsn = if dup_end_lsn.is_valid() { + dup_end_lsn // new segment with duplicates starts where old one stops + } else { + lsn // start with the first LSN for this key + }; + dup_end_lsn = next_lsn; // upper LSN boundary is exclusive + break; } - break; } - key_values_total_size += next_size; - // Check if it is time to split segment: if total keys size is larger than target file size. - // We need to avoid generation of empty segments if next_size > target_file_size. - if key_values_total_size > target_file_size && lsn != next_lsn { - // Split key between multiple layers: such layer can contain only single key - dup_start_lsn = if dup_end_lsn.is_valid() { - dup_end_lsn // new segment with duplicates starts where old one stops - } else { - lsn // start with the first LSN for this key - }; - dup_end_lsn = next_lsn; // upper LSN boundary is exclusive - break; + // handle case when loop reaches last key: in this case dup_end is non-zero but dup_start is not set. + if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() { + dup_start_lsn = dup_end_lsn; + dup_end_lsn = lsn_range.end; } - } - // handle case when loop reaches last key: in this case dup_end is non-zero but dup_start is not set. - if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() { - dup_start_lsn = dup_end_lsn; - dup_end_lsn = lsn_range.end; - } - if writer.is_some() { - let written_size = writer.as_mut().unwrap().size(); - let contains_hole = - next_hole < holes.len() && key >= holes[next_hole].key_range.end; - // check if key cause layer overflow or contains hole... - if is_dup_layer - || dup_end_lsn.is_valid() - || written_size + key_values_total_size > target_file_size - || contains_hole - { - // ... if so, flush previous layer and prepare to write new one - new_layers.push(Arc::new( - writer.take().unwrap().finish(prev_key.unwrap().next())?, - )); - writer = None; - - if contains_hole { - // skip hole - next_hole += 1; + if writer.is_some() { + let written_size = writer.as_mut().unwrap().size(); + let contains_hole = + next_hole < holes.len() && key >= holes[next_hole].key_range.end; + // check if key cause layer overflow or contains hole... + if is_dup_layer + || dup_end_lsn.is_valid() + || written_size + key_values_total_size > target_file_size + || contains_hole + { + // ... 
if so, flush previous layer and prepare to write new one
+                        new_layers.push(Arc::new(
+                            writer.take().unwrap().finish(prev_key.unwrap().next())?,
+                        ));
+                        writer = None;
+
+                        if contains_hole {
+                            // skip hole
+                            next_hole += 1;
+                        }
                     }
                 }
+                // Remember size of key value because at next iteration we will access next item
+                key_values_total_size = next_key_size;
+            }
+            if writer.is_none() {
+                // Create writer if not initialized yet
+                writer = Some(DeltaLayerWriter::new(
+                    self.conf,
+                    self.timeline_id,
+                    self.tenant_id,
+                    key,
+                    if dup_end_lsn.is_valid() {
+                        // this is a layer containing a slice of values of the same key
+                        debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn);
+                        dup_start_lsn..dup_end_lsn
+                    } else {
+                        debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
+                        lsn_range.clone()
+                    },
+                )?);
            }
-            // Remember size of key value because at next iteration we will access next item
-            key_values_total_size = next_key_size;
-        }
-        if writer.is_none() {
-            // Create writer if not initiaized yet
-            writer = Some(DeltaLayerWriter::new(
-                self.conf,
-                self.timeline_id,
-                self.tenant_id,
-                key,
-                if dup_end_lsn.is_valid() {
-                    // this is a layer containing slice of values of the same key
-                    debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn);
-                    dup_start_lsn..dup_end_lsn
-                } else {
-                    debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
-                    lsn_range.clone()
-                },
-            )?);
-        }
-        fail_point!("delta-layer-writer-fail-before-finish", |_| {
-            Err(anyhow::anyhow!("failpoint delta-layer-writer-fail-before-finish").into())
-        });
+            fail_point!("delta-layer-writer-fail-before-finish", |_| {
+                Result::<_>::Err(anyhow::anyhow!(
+                    "failpoint delta-layer-writer-fail-before-finish"
+                ))
+            });
-            writer.as_mut().unwrap().put_value(key, lsn, value)?;
-            prev_key = Some(key);
-        }
+            writer.as_mut().unwrap().put_value(key, lsn, value)?;
+            prev_key = Some(key);
+            }
+            Ok(())
+        })?;
     if let Some(writer) = writer {
         new_layers.push(Arc::new(writer.finish(prev_key.unwrap().next())?));
     }
diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs
index d99ca2d2927c2..dba6475c27587 100644
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -219,27 +219,13 @@ async fn delete_local_layer_files(
             }
         };
 
-        let r = if metadata.is_dir() {
-            // There shouldnt be any directories inside timeline dir as of current layout.
+        if metadata.is_dir() {
+            warn!(path=%entry.path().display(), "unexpected directory under timeline dir");
             tokio::fs::remove_dir(entry.path()).await
         } else {
             tokio::fs::remove_file(entry.path()).await
-        };
-
-        if let Err(e) = r {
-            if e.kind() == std::io::ErrorKind::NotFound {
-                warn!(
-                    timeline_dir=?local_timeline_directory,
-                    path=?entry.path().display(),
-                    "got not found err while removing timeline dir, proceeding anyway"
-                );
-                continue;
-            }
-            anyhow::bail!(anyhow::anyhow!(
-                "Failed to remove: {}. Error: {e}",
-                entry.path().display()
-            ));
         }
+        .with_context(|| format!("Failed to remove: {}", entry.path().display()))?;
     }
 
     info!("finished deleting layer files, releasing layer_removal_cs.lock()");
@@ -359,10 +345,11 @@ impl DeleteTimelineFlow {
     // NB: If this fails half-way through, and is retried, the retry will go through
     // all the same steps again. Make sure the code here is idempotent, and don't
     // error out if some of the shutdown tasks have already been completed!
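The NB comment above is the contract every deletion step has to meet: when the whole flow is retried, a step that already ran must succeed again. A small sketch of one retry-safe step, using a hypothetical `remove_if_exists` helper:

```rust
use std::io;
use std::path::Path;

// Removing a file is retry-safe if "already gone" counts as success.
async fn remove_if_exists(path: &Path) -> io::Result<()> {
    match tokio::fs::remove_file(path).await {
        Ok(()) => Ok(()),
        // A previous, interrupted attempt may have removed it already.
        Err(e) if e.kind() == io::ErrorKind::NotFound => Ok(()),
        Err(e) => Err(e),
    }
}
```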
-    #[instrument(skip_all, fields(tenant_id=%tenant.tenant_id, %timeline_id))]
+    #[instrument(skip(tenant), fields(tenant_id=%tenant.tenant_id))]
     pub async fn run(
         tenant: &Arc<Tenant>,
         timeline_id: TimelineId,
+        inplace: bool,
     ) -> Result<(), DeleteTimelineError> {
         let (timeline, mut guard) = Self::prepare(tenant, timeline_id)?;
 
@@ -380,7 +367,11 @@ impl DeleteTimelineFlow {
             ))?
         });
 
-        Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);
+        if inplace {
+            Self::background(guard, tenant.conf, tenant, &timeline).await?
+        } else {
+            Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);
+        }
 
         Ok(())
     }
@@ -398,6 +389,8 @@ impl DeleteTimelineFlow {
     }
 
     /// Shortcut to create Timeline in stopping state and spawn deletion task.
+    /// See corresponding parts of [`crate::tenant::delete::DeleteTenantFlow`]
+    #[instrument(skip_all, fields(%timeline_id))]
     pub async fn resume_deletion(
         tenant: Arc<Tenant>,
         timeline_id: TimelineId,
@@ -444,11 +437,15 @@ impl DeleteTimelineFlow {
         Ok(())
     }
 
+    #[instrument(skip_all, fields(%timeline_id))]
     pub async fn cleanup_remaining_timeline_fs_traces(
         tenant: &Tenant,
         timeline_id: TimelineId,
     ) -> anyhow::Result<()> {
-        cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_id, timeline_id).await
+        let r =
+            cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_id, timeline_id).await;
+        info!("Done");
+        r
     }
 
     fn prepare(
@@ -494,11 +491,17 @@ impl DeleteTimelineFlow {
         // At the end of the operation we're holding the guard and need to lock timelines map
         // to remove the timeline from it.
         // Always if you have two locks that are taken in different order this can result in a deadlock.
-        let delete_lock_guard = DeletionGuard(
-            Arc::clone(&timeline.delete_progress)
-                .try_lock_owned()
-                .map_err(|_| DeleteTimelineError::AlreadyInProgress)?,
-        );
+
+        let delete_progress = Arc::clone(&timeline.delete_progress);
+        let delete_lock_guard = match delete_progress.try_lock_owned() {
+            Ok(guard) => DeletionGuard(guard),
+            Err(_) => {
+                // Unfortunately, try_lock_owned consumes the Arc on failure.
+                return Err(DeleteTimelineError::AlreadyInProgress(Arc::clone(
+                    &timeline.delete_progress,
+                )));
+            }
+        };
 
         timeline.set_state(TimelineState::Stopping);
 
@@ -553,10 +556,14 @@ impl DeleteTimelineFlow {
 
         remove_timeline_from_tenant(tenant, timeline.timeline_id, &guard).await?;
 
-        *guard.0 = Self::Finished;
+        *guard = Self::Finished;
 
         Ok(())
     }
+
+    pub(crate) fn is_finished(&self) -> bool {
+        matches!(self, Self::Finished)
+    }
 }
 
 struct DeletionGuard(OwnedMutexGuard<DeleteTimelineFlow>);
diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs
index 824d869bec275..40212265cf2d0 100644
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -163,7 +163,7 @@ impl LayerManager {
     }
 
     /// Called from `freeze_inmem_layer`, returns true if successfully frozen.
-    pub fn try_freeze_in_memory_layer(
+    pub async fn try_freeze_in_memory_layer(
         &mut self,
         Lsn(last_record_lsn): Lsn,
         last_freeze_at: &AtomicLsn,
@@ -173,7 +173,7 @@ impl LayerManager {
         if let Some(open_layer) = &self.layer_map.open_layer {
             let open_layer_rc = Arc::clone(open_layer);
             // Does this layer need freezing?
-            open_layer.freeze(end_lsn);
+            open_layer.freeze(end_lsn).await;
 
             // The layer is no longer open, update the layer map to reflect this.
             // We will replace it with on-disk historics below.
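Both the timeline and tenant deletion flows use the single-flight trick visible in `prepare` above: an `Arc`'d tokio mutex whose owned guard marks a deletion in progress, acquired with `try_lock_owned` so a concurrent caller fails fast instead of queueing. A reduced sketch with a toy progress enum and a string error standing in for the real types:

```rust
use std::sync::Arc;
use tokio::sync::{Mutex, OwnedMutexGuard};

enum Progress {
    NotStarted,
    Finished,
}

struct DeletionGuard(OwnedMutexGuard<Progress>);

fn try_start(progress: &Arc<Mutex<Progress>>) -> Result<DeletionGuard, &'static str> {
    // try_lock_owned takes the Arc by value and consumes it on failure,
    // which is why the flow above clones the Arc before trying.
    Arc::clone(progress)
        .try_lock_owned()
        .map(DeletionGuard)
        .map_err(|_| "delete already in progress")
}
```

Holding the guard across the whole flow means the lock doubles as the progress record: whoever owns it may freely write `Finished` through it at the end.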
diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index e96aa41da0b79..d52b29f1756ef 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -31,8 +31,10 @@ use storage_broker::Streaming; use tokio::select; use tracing::*; -use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}; use postgres_connection::{parse_host_port, PgConnectionConfig}; +use utils::backoff::{ + exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, +}; use utils::{ id::{NodeId, TenantTimelineId}, lsn::Lsn, diff --git a/pgxn/neon/walproposer_utils.c b/pgxn/neon/walproposer_utils.c index e1dcaa081d04f..9e1fc11756a20 100644 --- a/pgxn/neon/walproposer_utils.c +++ b/pgxn/neon/walproposer_utils.c @@ -37,68 +37,14 @@ static XLogSegNo walpropSegNo = 0; /* START cloned file-local variables and functions from walsender.c */ -/* - * xlogreader used for replication. Note that a WAL sender doing physical - * replication does not need xlogreader to read WAL, but it needs one to - * keep a state of its work. - */ -static XLogReaderState *xlogreader = NULL; - -/* - * These variables keep track of the state of the timeline we're currently - * sending. sendTimeLine identifies the timeline. If sendTimeLineIsHistoric, - * the timeline is not the latest timeline on this server, and the server's - * history forked off from that timeline at sendTimeLineValidUpto. - */ -static TimeLineID sendTimeLine = 0; -static TimeLineID sendTimeLineNextTLI = 0; -static bool sendTimeLineIsHistoric = false; -static XLogRecPtr sendTimeLineValidUpto = InvalidXLogRecPtr; - -/* - * Timestamp of last ProcessRepliesIfAny() that saw a reply from the - * standby. Set to 0 if wal_sender_timeout doesn't need to be active. - */ -static TimestampTz last_reply_timestamp = 0; - -/* Have we sent a heartbeat message asking for reply, since last reply? */ -static bool waiting_for_ping_response = false; - -static bool streamingDoneSending; -static bool streamingDoneReceiving; - -/* Are we there yet? */ -static bool WalSndCaughtUp = false; - -/* Flags set by signal handlers for later service in main loop */ -static volatile sig_atomic_t got_STOPPING = false; - /* * How far have we sent WAL already? This is also advertised in * MyWalSnd->sentPtr. (Actually, this is the next WAL location to send.) */ static XLogRecPtr sentPtr = InvalidXLogRecPtr; -/* - * This is set while we are streaming. When not set - * PROCSIG_WALSND_INIT_STOPPING signal will be handled like SIGTERM. When set, - * the main loop is responsible for checking got_STOPPING and terminating when - * it's set (after streaming any remaining WAL). 
- */ -static volatile sig_atomic_t replication_active = false; - -typedef void (*WalSndSendDataCallback) (void); -static void WalSndLoop(WalSndSendDataCallback send_data); -static void XLogSendPhysical(void); -#if PG_VERSION_NUM >= 150000 -static XLogRecPtr GetStandbyFlushRecPtr(TimeLineID *tli); -#else -static XLogRecPtr GetStandbyFlushRecPtr(void); -#endif - -static void WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo, - TimeLineID *tli_p); - +static void WalSndLoop(void); +static void XLogBroadcastWalProposer(void); /* END cloned file-level variables and functions from walsender.c */ int @@ -506,7 +452,7 @@ XLogWalPropClose(XLogRecPtr recptr) /* START of cloned functions from walsender.c */ /* - * Handle START_REPLICATION command. + * Subscribe for new WAL and stream it in the loop to safekeepers. * * At the moment, this never returns, but an ereport(ERROR) will take us back * to the main loop. @@ -524,18 +470,6 @@ StartProposerReplication(StartReplicationCmd *cmd) errmsg("IDENTIFY_SYSTEM has not been run before START_REPLICATION"))); #endif - /* create xlogreader for physical replication */ - xlogreader = - XLogReaderAllocate(wal_segment_size, NULL, - XL_ROUTINE(.segment_open = WalSndSegmentOpen, - .segment_close = wal_segment_close), - NULL); - - if (!xlogreader) - ereport(ERROR, - (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of memory"))); - /* * We assume here that we're logging enough information in the WAL for * log-shipping, since this is checked in PostmasterMain(). @@ -569,341 +503,61 @@ StartProposerReplication(StartReplicationCmd *cmd) * we keep this code around to lighten the load for when we need it. */ #if PG_VERSION_NUM >= 150000 - if (am_cascading_walsender) - { - /* this also updates ThisTimeLineID */ - FlushPtr = GetStandbyFlushRecPtr(&currTLI); - } - else - FlushPtr = GetFlushRecPtr(&currTLI); + FlushPtr = GetFlushRecPtr(&currTLI); #else - if (am_cascading_walsender) - { - /* this also updates ThisTimeLineID */ - FlushPtr = GetStandbyFlushRecPtr(); - } - else - FlushPtr = GetFlushRecPtr(); - + FlushPtr = GetFlushRecPtr(); currTLI = ThisTimeLineID; #endif + /* + * When we first start replication the standby will be behind the + * primary. For some applications, for example synchronous + * replication, it is important to have a clear state for this initial + * catchup mode, so we can trigger actions when we change streaming + * state later. We may stay in this state for a long time, which is + * exactly why we want to be able to monitor whether or not we are + * still here. + */ + WalSndSetState(WALSNDSTATE_CATCHUP); - if (cmd->timeline != 0) - { - XLogRecPtr switchpoint; - - sendTimeLine = cmd->timeline; - if (sendTimeLine == currTLI) - { - sendTimeLineIsHistoric = false; - sendTimeLineValidUpto = InvalidXLogRecPtr; - } - else - { - List *timeLineHistory; - - sendTimeLineIsHistoric = true; - - /* - * Check that the timeline the client requested exists, and the - * requested start location is on that timeline. - */ - timeLineHistory = readTimeLineHistory(currTLI); - switchpoint = tliSwitchPoint(cmd->timeline, timeLineHistory, - &sendTimeLineNextTLI); - list_free_deep(timeLineHistory); - - /* - * Found the requested timeline in the history. Check that - * requested startpoint is on that timeline in our history. - * - * This is quite loose on purpose. We only check that we didn't - * fork off the requested timeline before the switchpoint. We - * don't check that we switched *to* it before the requested - * starting point. 
This is because the client can legitimately - * request to start replication from the beginning of the WAL - * segment that contains switchpoint, but on the new timeline, so - * that it doesn't end up with a partial segment. If you ask for - * too old a starting point, you'll get an error later when we - * fail to find the requested WAL segment in pg_wal. - * - * XXX: we could be more strict here and only allow a startpoint - * that's older than the switchpoint, if it's still in the same - * WAL segment. - */ - if (!XLogRecPtrIsInvalid(switchpoint) && - switchpoint < cmd->startpoint) - { - ereport(ERROR, - (errmsg("requested starting point %X/%X on timeline %u is not in this server's history", - LSN_FORMAT_ARGS(cmd->startpoint), - cmd->timeline), - errdetail("This server's history forked from timeline %u at %X/%X.", - cmd->timeline, - LSN_FORMAT_ARGS(switchpoint)))); - } - sendTimeLineValidUpto = switchpoint; - } - } - else + /* + * Don't allow a request to stream from a future point in WAL that + * hasn't been flushed to disk in this server yet. + */ + if (FlushPtr < cmd->startpoint) { - sendTimeLine = currTLI; - sendTimeLineValidUpto = InvalidXLogRecPtr; - sendTimeLineIsHistoric = false; + ereport(ERROR, + (errmsg("requested starting point %X/%X is ahead of the WAL flush position of this server %X/%X", + LSN_FORMAT_ARGS(cmd->startpoint), + LSN_FORMAT_ARGS(FlushPtr)))); } - streamingDoneSending = streamingDoneReceiving = false; + /* Start streaming from the requested point */ + sentPtr = cmd->startpoint; - /* If there is nothing to stream, don't even enter COPY mode */ - if (!sendTimeLineIsHistoric || cmd->startpoint < sendTimeLineValidUpto) - { - /* - * When we first start replication the standby will be behind the - * primary. For some applications, for example synchronous - * replication, it is important to have a clear state for this initial - * catchup mode, so we can trigger actions when we change streaming - * state later. We may stay in this state for a long time, which is - * exactly why we want to be able to monitor whether or not we are - * still here. - */ - WalSndSetState(WALSNDSTATE_CATCHUP); - - /* - * Don't allow a request to stream from a future point in WAL that - * hasn't been flushed to disk in this server yet. - */ - if (FlushPtr < cmd->startpoint) - { - ereport(ERROR, - (errmsg("requested starting point %X/%X is ahead of the WAL flush position of this server %X/%X", - LSN_FORMAT_ARGS(cmd->startpoint), - LSN_FORMAT_ARGS(FlushPtr)))); - } + /* Initialize shared memory status, too */ + SpinLockAcquire(&MyWalSnd->mutex); + MyWalSnd->sentPtr = sentPtr; + SpinLockRelease(&MyWalSnd->mutex); - /* Start streaming from the requested point */ - sentPtr = cmd->startpoint; + SyncRepInitConfig(); - /* Initialize shared memory status, too */ - SpinLockAcquire(&MyWalSnd->mutex); - MyWalSnd->sentPtr = sentPtr; - SpinLockRelease(&MyWalSnd->mutex); + /* Infinite send loop, never returns */ + WalSndLoop(); - SyncRepInitConfig(); - - /* Main loop of walsender */ - replication_active = true; - - WalSndLoop(XLogSendPhysical); - - replication_active = false; - if (got_STOPPING) - proc_exit(0); - WalSndSetState(WALSNDSTATE_STARTUP); - - Assert(streamingDoneSending && streamingDoneReceiving); - } + WalSndSetState(WALSNDSTATE_STARTUP); if (cmd->slotname) ReplicationSlotRelease(); - - /* - * Copy is finished now. Send a single-row result set indicating the next - * timeline. 
- */ - if (sendTimeLineIsHistoric) - { - char startpos_str[8 + 1 + 8 + 1]; - DestReceiver *dest; - TupOutputState *tstate; - TupleDesc tupdesc; - Datum values[2]; - bool nulls[2]; - - snprintf(startpos_str, sizeof(startpos_str), "%X/%X", - LSN_FORMAT_ARGS(sendTimeLineValidUpto)); - - dest = CreateDestReceiver(DestRemoteSimple); - MemSet(nulls, false, sizeof(nulls)); - - /* - * Need a tuple descriptor representing two columns. int8 may seem - * like a surprising data type for this, but in theory int4 would not - * be wide enough for this, as TimeLineID is unsigned. - */ - tupdesc = CreateTemplateTupleDesc(2); - TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 1, "next_tli", - INT8OID, -1, 0); - TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 2, "next_tli_startpos", - TEXTOID, -1, 0); - - /* prepare for projection of tuple */ - tstate = begin_tup_output_tupdesc(dest, tupdesc, &TTSOpsVirtual); - - values[0] = Int64GetDatum((int64) sendTimeLineNextTLI); - values[1] = CStringGetTextDatum(startpos_str); - - /* send it to dest */ - do_tup_output(tstate, values, nulls); - - end_tup_output(tstate); - } - - /* Send CommandComplete message */ - EndReplicationCommand("START_STREAMING"); } -#if PG_VERSION_NUM >= 150000 -static XLogRecPtr -GetStandbyFlushRecPtr(TimeLineID *tli) -{ - XLogRecPtr replayPtr; - TimeLineID replayTLI; - XLogRecPtr receivePtr; - TimeLineID receiveTLI; - XLogRecPtr result; - - /* - * We can safely send what's already been replayed. Also, if walreceiver - * is streaming WAL from the same timeline, we can send anything that it - * has streamed, but hasn't been replayed yet. - */ - - receivePtr = GetWalRcvFlushRecPtr(NULL, &receiveTLI); - replayPtr = GetXLogReplayRecPtr(&replayTLI); - - *tli = replayTLI; - - result = replayPtr; - if (receiveTLI == replayTLI && receivePtr > replayPtr) - result = receivePtr; - - return result; -} -#else /* - * Returns the latest point in WAL that has been safely flushed to disk, and - * can be sent to the standby. This should only be called when in recovery, - * ie. we're streaming to a cascaded standby. - * - * As a side-effect, ThisTimeLineID is updated to the TLI of the last - * replayed WAL record. + * Main loop that waits for LSN updates and calls the walproposer. + * Synchronous replication sets latch in WalSndWakeup at walsender.c */ -static XLogRecPtr -GetStandbyFlushRecPtr(void) -{ - XLogRecPtr replayPtr; - TimeLineID replayTLI; - XLogRecPtr receivePtr; - TimeLineID receiveTLI; - XLogRecPtr result; - - /* - * We can safely send what's already been replayed. Also, if walreceiver - * is streaming WAL from the same timeline, we can send anything that it - * has streamed, but hasn't been replayed yet. - */ - - receivePtr = GetWalRcvFlushRecPtr(NULL, &receiveTLI); - replayPtr = GetXLogReplayRecPtr(&replayTLI); - - ThisTimeLineID = replayTLI; - - result = replayPtr; - if (receiveTLI == ThisTimeLineID && receivePtr > replayPtr) - result = receivePtr; - - return result; -} -#endif - - - -/* XLogReaderRoutine->segment_open callback */ -static void -WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo, - TimeLineID *tli_p) -{ - char path[MAXPGPATH]; - - /*------- - * When reading from a historic timeline, and there is a timeline switch - * within this segment, read from the WAL segment belonging to the new - * timeline. - * - * For example, imagine that this server is currently on timeline 5, and - * we're streaming timeline 4. The switch from timeline 4 to 5 happened at - * 0/13002088. In pg_wal, we have these files: - * - * ... 
- * 000000040000000000000012 - * 000000040000000000000013 - * 000000050000000000000013 - * 000000050000000000000014 - * ... - * - * In this situation, when requested to send the WAL from segment 0x13, on - * timeline 4, we read the WAL from file 000000050000000000000013. Archive - * recovery prefers files from newer timelines, so if the segment was - * restored from the archive on this server, the file belonging to the old - * timeline, 000000040000000000000013, might not exist. Their contents are - * equal up to the switchpoint, because at a timeline switch, the used - * portion of the old segment is copied to the new file. ------- - */ - *tli_p = sendTimeLine; - if (sendTimeLineIsHistoric) - { - XLogSegNo endSegNo; - - XLByteToSeg(sendTimeLineValidUpto, endSegNo, state->segcxt.ws_segsize); - if (nextSegNo == endSegNo) - *tli_p = sendTimeLineNextTLI; - } - - XLogFilePath(path, *tli_p, nextSegNo, state->segcxt.ws_segsize); - state->seg.ws_file = BasicOpenFile(path, O_RDONLY | PG_BINARY); - if (state->seg.ws_file >= 0) - return; - - /* - * If the file is not found, assume it's because the standby asked for a - * too old WAL segment that has already been removed or recycled. - */ - if (errno == ENOENT) - { - char xlogfname[MAXFNAMELEN]; - int save_errno = errno; - - XLogFileName(xlogfname, *tli_p, nextSegNo, wal_segment_size); - errno = save_errno; - ereport(ERROR, - (errcode_for_file_access(), - errmsg("requested WAL segment %s has already been removed", - xlogfname))); - } - else - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not open file \"%s\": %m", - path))); -} - - -/* Main loop of walsender process that streams the WAL over Copy messages. */ static void -WalSndLoop(WalSndSendDataCallback send_data) +WalSndLoop(void) { - /* - * Initialize the last reply timestamp. That enables timeout processing - * from hereon. - */ - last_reply_timestamp = GetCurrentTimestamp(); - waiting_for_ping_response = false; - - /* - * Loop until we reach the end of this timeline or the client requests to - * stop streaming. - */ for (;;) { /* Clear any already-pending wakeups */ @@ -911,153 +565,41 @@ WalSndLoop(WalSndSendDataCallback send_data) CHECK_FOR_INTERRUPTS(); - /* Process any requests or signals received recently */ - if (ConfigReloadPending) - { - ConfigReloadPending = false; - ProcessConfigFile(PGC_SIGHUP); - SyncRepInitConfig(); - } + XLogBroadcastWalProposer(); - /* always true */ - if (am_wal_proposer) - { - send_data(); - if (WalSndCaughtUp) - { - if (MyWalSnd->state == WALSNDSTATE_CATCHUP) - WalSndSetState(WALSNDSTATE_STREAMING); - WalProposerPoll(); - WalSndCaughtUp = false; - } - continue; - } + if (MyWalSnd->state == WALSNDSTATE_CATCHUP) + WalSndSetState(WALSNDSTATE_STREAMING); + WalProposerPoll(); } } /* - * Send out the WAL in its normal physical/stored form. - * - * Read up to MAX_SEND_SIZE bytes of WAL that's been flushed to disk, - * but not yet sent to the client, and buffer it in the libpq output - * buffer. - * - * If there is no unsent WAL remaining, WalSndCaughtUp is set to true, - * otherwise WalSndCaughtUp is set to false. + * Notify walproposer about the new WAL position. */ static void -XLogSendPhysical(void) +XLogBroadcastWalProposer(void) { - XLogRecPtr SendRqstPtr; XLogRecPtr startptr; XLogRecPtr endptr; - Size nbytes PG_USED_FOR_ASSERTS_ONLY; - TimeLineID currTLI; - - /* If requested switch the WAL sender to the stopping state. 
*/ - if (got_STOPPING) - WalSndSetState(WALSNDSTATE_STOPPING); - - if (streamingDoneSending) - { - WalSndCaughtUp = true; - return; - } - /* Figure out how far we can safely send the WAL. */ - if (sendTimeLineIsHistoric) - { - /* - * Streaming an old timeline that's in this server's history, but is - * not the one we're currently inserting or replaying. It can be - * streamed up to the point where we switched off that timeline. - */ - SendRqstPtr = sendTimeLineValidUpto; - } - else if (am_cascading_walsender) - { - /* - * Streaming the latest timeline on a standby. - * - * Attempt to send all WAL that has already been replayed, so that we - * know it's valid. If we're receiving WAL through streaming - * replication, it's also OK to send any WAL that has been received - * but not replayed. - * - * The timeline we're recovering from can change, or we can be - * promoted. In either case, the current timeline becomes historic. We - * need to detect that so that we don't try to stream past the point - * where we switched to another timeline. We check for promotion or - * timeline switch after calculating FlushPtr, to avoid a race - * condition: if the timeline becomes historic just after we checked - * that it was still current, it's still be OK to stream it up to the - * FlushPtr that was calculated before it became historic. - */ - bool becameHistoric = false; -#if PG_VERSION_NUM >= 150000 - SendRqstPtr = GetStandbyFlushRecPtr(&currTLI); -#else - SendRqstPtr = GetStandbyFlushRecPtr(); - currTLI = ThisTimeLineID; -#endif - if (!RecoveryInProgress()) - { - /* - * We have been promoted. RecoveryInProgress() updated - * ThisTimeLineID to the new current timeline. - */ - am_cascading_walsender = false; - becameHistoric = true; - } - else - { - /* - * Still a cascading standby. But is the timeline we're sending - * still the one recovery is recovering from? currTLI was updated - * by the GetStandbyFlushRecPtr() call above. - */ - if (sendTimeLine != currTLI) - becameHistoric = true; - } - - if (becameHistoric) - { - /* - * The timeline we were sending has become historic. Read the - * timeline history file of the new timeline to see where exactly - * we forked off from the timeline we were sending. - */ - List *history; - - history = readTimeLineHistory(currTLI); - sendTimeLineValidUpto = tliSwitchPoint(sendTimeLine, history, &sendTimeLineNextTLI); - - Assert(sendTimeLine < sendTimeLineNextTLI); - list_free_deep(history); - - sendTimeLineIsHistoric = true; + /* Start from the last sent position */ + startptr = sentPtr; - SendRqstPtr = sendTimeLineValidUpto; - } - } - else - { - /* - * Streaming the current timeline on a primary. - * - * Attempt to send all data that's already been written out and - * fsync'd to disk. We cannot go further than what's been written out - * given the current implementation of WALRead(). And in any case - * it's unsafe to send WAL that is not securely down to disk on the - * primary: if the primary subsequently crashes and restarts, standbys - * must not have applied any WAL that got lost on the primary. - */ + /* + * Streaming the current timeline on a primary. + * + * Attempt to send all data that's already been written out and + * fsync'd to disk. We cannot go further than what's been written out + * given the current implementation of WALRead(). And in any case + * it's unsafe to send WAL that is not securely down to disk on the + * primary: if the primary subsequently crashes and restarts, standbys + * must not have applied any WAL that got lost on the primary. 
+ */ #if PG_VERSION_NUM >= 150000 - SendRqstPtr = GetFlushRecPtr(NULL); + endptr = GetFlushRecPtr(NULL); #else - SendRqstPtr = GetFlushRecPtr(); + endptr = GetFlushRecPtr(); #endif - } /* * Record the current system time as an approximation of the time at which @@ -1083,91 +625,14 @@ XLogSendPhysical(void) * that arbitrary LSN is eventually reported as written, flushed and * applied, so that it can measure the elapsed time. */ - LagTrackerWrite(SendRqstPtr, GetCurrentTimestamp()); - - /* - * If this is a historic timeline and we've reached the point where we - * forked to the next timeline, stop streaming. - * - * Note: We might already have sent WAL > sendTimeLineValidUpto. The - * startup process will normally replay all WAL that has been received - * from the primary, before promoting, but if the WAL streaming is - * terminated at a WAL page boundary, the valid portion of the timeline - * might end in the middle of a WAL record. We might've already sent the - * first half of that partial WAL record to the cascading standby, so that - * sentPtr > sendTimeLineValidUpto. That's OK; the cascading standby can't - * replay the partial WAL record either, so it can still follow our - * timeline switch. - */ - if (sendTimeLineIsHistoric && sendTimeLineValidUpto <= sentPtr) - { - /* close the current file. */ - if (xlogreader->seg.ws_file >= 0) - wal_segment_close(xlogreader); - - /* Send CopyDone */ - pq_putmessage_noblock('c', NULL, 0); - streamingDoneSending = true; - - WalSndCaughtUp = true; - - elog(DEBUG1, "walsender reached end of timeline at %X/%X (sent up to %X/%X)", - LSN_FORMAT_ARGS(sendTimeLineValidUpto), - LSN_FORMAT_ARGS(sentPtr)); - return; - } + LagTrackerWrite(endptr, GetCurrentTimestamp()); /* Do we have any work to do? */ - Assert(sentPtr <= SendRqstPtr); - if (SendRqstPtr <= sentPtr) - { - WalSndCaughtUp = true; + Assert(startptr <= endptr); + if (endptr <= startptr) return; - } - - /* - * Figure out how much to send in one message. If there's no more than - * MAX_SEND_SIZE bytes to send, send everything. Otherwise send - * MAX_SEND_SIZE bytes, but round back to logfile or page boundary. - * - * The rounding is not only for performance reasons. Walreceiver relies on - * the fact that we never split a WAL record across two messages. Since a - * long WAL record is split at page boundary into continuation records, - * page boundary is always a safe cut-off point. We also assume that - * SendRqstPtr never points to the middle of a WAL record. - */ - startptr = sentPtr; - endptr = startptr; - endptr += MAX_SEND_SIZE; - /* if we went beyond SendRqstPtr, back off */ - if (SendRqstPtr <= endptr) - { - endptr = SendRqstPtr; - if (sendTimeLineIsHistoric) - WalSndCaughtUp = false; - else - WalSndCaughtUp = true; - } - else - { - /* round down to page boundary. 
*/ - endptr -= (endptr % XLOG_BLCKSZ); - WalSndCaughtUp = false; - } - - nbytes = endptr - startptr; - Assert(nbytes <= MAX_SEND_SIZE); - - /* always true */ - if (am_wal_proposer) - { - WalProposerBroadcast(startptr, endptr); - } - else - { - /* code removed for brevity */ - } + WalProposerBroadcast(startptr, endptr); sentPtr = endptr; /* Update shared memory status */ diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 22e766b5f13b9..3f4cee6e3483c 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -8,6 +8,7 @@ use super::{ use crate::{auth::ClientCredentials, compute, http, scram}; use async_trait::async_trait; use futures::TryFutureExt; +use tokio::time::Instant; use tokio_postgres::config::SslMode; use tracing::{error, info, info_span, warn, Instrument}; @@ -47,7 +48,9 @@ impl Api { .build()?; info!(url = request.url().as_str(), "sending http request"); + let start = Instant::now(); let response = self.endpoint.execute(request).await?; + info!(duration = ?start.elapsed(), "received http response"); let body = match parse_body::(response).await { Ok(body) => body, // Error 404 is special: it's ok not to have a secret. @@ -88,7 +91,9 @@ impl Api { .build()?; info!(url = request.url().as_str(), "sending http request"); + let start = Instant::now(); let response = self.endpoint.execute(request).await?; + info!(duration = ?start.elapsed(), "received http response"); let body = parse_body::(response).await?; // Unfortunately, ownership won't let us use `Option::ok_or` here. diff --git a/proxy/src/http.rs b/proxy/src/http.rs index e2812403806d5..755137807bc49 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -7,11 +7,14 @@ pub mod server; pub mod sql_over_http; pub mod websocket; -use std::time::Duration; +use std::{sync::Arc, time::Duration}; +use futures::FutureExt; pub use reqwest::{Request, Response, StatusCode}; pub use reqwest_middleware::{ClientWithMiddleware, Error}; pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; +use tokio::time::Instant; +use tracing::trace; use crate::url::ApiUrl; use reqwest_middleware::RequestBuilder; @@ -20,13 +23,21 @@ use reqwest_middleware::RequestBuilder; /// because it takes care of observability (OpenTelemetry). /// We deliberately don't want to replace this with a public static. pub fn new_client() -> ClientWithMiddleware { - reqwest_middleware::ClientBuilder::new(reqwest::Client::new()) + let client = reqwest::ClientBuilder::new() + .dns_resolver(Arc::new(GaiResolver::default())) + .connection_verbose(true) + .build() + .expect("Failed to create http client"); + + reqwest_middleware::ClientBuilder::new(client) .with(reqwest_tracing::TracingMiddleware::default()) .build() } pub fn new_client_with_timeout(default_timout: Duration) -> ClientWithMiddleware { let timeout_client = reqwest::ClientBuilder::new() + .dns_resolver(Arc::new(GaiResolver::default())) + .connection_verbose(true) .timeout(default_timout) .build() .expect("Failed to create http client with timeout"); @@ -39,6 +50,10 @@ pub fn new_client_with_timeout(default_timout: Duration) -> ClientWithMiddleware // As per docs, "This middleware always errors when given requests with streaming bodies". // That's all right because we only use this client to send `serde_json::RawValue`, which // is not a stream. + // + // ex-maintainer note: + // this limitation can be fixed if streaming is necessary. 
+ // retries will still not be performed, but it won't error immediately .with(RetryTransientMiddleware::new_with_policy(retry_policy)) .build() } @@ -81,6 +96,37 @@ impl Endpoint { } } +/// https://docs.rs/reqwest/0.11.18/src/reqwest/dns/gai.rs.html +use hyper::{ + client::connect::dns::{GaiResolver as HyperGaiResolver, Name}, + service::Service, +}; +use reqwest::dns::{Addrs, Resolve, Resolving}; +#[derive(Debug)] +pub struct GaiResolver(HyperGaiResolver); + +impl Default for GaiResolver { + fn default() -> Self { + Self(HyperGaiResolver::new()) + } +} + +impl Resolve for GaiResolver { + fn resolve(&self, name: Name) -> Resolving { + let this = &mut self.0.clone(); + let start = Instant::now(); + Box::pin( + Service::<Name>::call(this, name.clone()).map(move |result| { + let resolve_duration = start.elapsed(); + trace!(duration = ?resolve_duration, addr = %name, "resolve host complete"); + result + .map(|addrs| -> Addrs { Box::new(addrs) }) + .map_err(|err| -> Box<dyn std::error::Error + Send + Sync> { Box::new(err) }) + }), + ) + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/proxy/src/http/sql_over_http.rs b/proxy/src/http/sql_over_http.rs index a4eb13a10ba07..aa06a91bbae9d 100644 --- a/proxy/src/http/sql_over_http.rs +++ b/proxy/src/http/sql_over_http.rs @@ -27,14 +27,19 @@ struct QueryData { params: Vec<serde_json::Value>, } +#[derive(serde::Deserialize)] +struct BatchQueryData { + queries: Vec<QueryData>, +} + #[derive(serde::Deserialize)] #[serde(untagged)] enum Payload { Single(QueryData), - Batch(Vec<QueryData>), + Batch(BatchQueryData), } -pub const MAX_RESPONSE_SIZE: usize = 1024 * 1024; // 1 MB +pub const MAX_RESPONSE_SIZE: usize = 10 * 1024 * 1024; // 10 MB const MAX_REQUEST_SIZE: u64 = 1024 * 1024; // 1 MB static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output"); @@ -214,7 +219,7 @@ pub async fn handle( if request_content_length > MAX_REQUEST_SIZE { return Err(anyhow::anyhow!( - "request is too large (max {MAX_REQUEST_SIZE} bytes)" + "request is too large (max is {MAX_REQUEST_SIZE} bytes)" )); } @@ -233,7 +238,7 @@ pub async fn handle( Payload::Single(query) => query_to_json(&client, query, raw_output, array_mode) .await .map(|x| (x, HashMap::default())), - Payload::Batch(queries) => { + Payload::Batch(batch_query) => { let mut results = Vec::new(); let mut builder = client.build_transaction(); if let Some(isolation_level) = txn_isolation_level { @@ -243,7 +248,7 @@ pub async fn handle( builder = builder.read_only(true); } let transaction = builder.start().await?; - for query in queries { + for query in batch_query.queries { let result = query_to_json(&transaction, query, raw_output, array_mode).await; match result { Ok(r) => results.push(r), @@ -292,13 +297,15 @@ async fn query_to_json( // big. 
pin_mut!(row_stream); let mut rows: Vec = Vec::new(); - let mut curret_size = 0; + let mut current_size = 0; while let Some(row) = row_stream.next().await { let row = row?; - curret_size += row.body_len(); + current_size += row.body_len(); rows.push(row); - if curret_size > MAX_RESPONSE_SIZE { - return Err(anyhow::anyhow!("response too large")); + if current_size > MAX_RESPONSE_SIZE { + return Err(anyhow::anyhow!( + "response is too large (max is {MAX_RESPONSE_SIZE} bytes)" + )); } } diff --git a/proxy/src/http/websocket.rs b/proxy/src/http/websocket.rs index fec76c74f4943..794da1792940f 100644 --- a/proxy/src/http/websocket.rs +++ b/proxy/src/http/websocket.rs @@ -187,12 +187,16 @@ async fn ws_handler( let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None) .map_err(|e| ApiError::BadRequest(e.into()))?; - tokio::spawn(async move { - if let Err(e) = serve_websocket(websocket, config, &cancel_map, session_id, host).await - { - error!(session_id = ?session_id, "error in websocket connection: {e:?}"); + tokio::spawn( + async move { + if let Err(e) = + serve_websocket(websocket, config, &cancel_map, session_id, host).await + { + error!(session_id = ?session_id, "error in websocket connection: {e:#}"); + } } - }); + .in_current_span(), + ); // Return the response so the spawned future can continue. Ok(response) @@ -217,6 +221,10 @@ async fn ws_handler( }, None => Value::Null, }; + error!( + ?code, + "sql-over-http per-client task finished with an error: {e:#}" + ); ( json!({ "message": message, "code": code }), HashMap::default(), diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index f11075a736cab..136d62b321026 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -2,8 +2,8 @@ //! protocol commands. use anyhow::Context; -use std::str; use std::str::FromStr; +use std::str::{self}; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, info_span, Instrument}; @@ -11,6 +11,7 @@ use crate::auth::check_permission; use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage}; use crate::metrics::{TrafficMetrics, PG_QUERIES_FINISHED, PG_QUERIES_RECEIVED}; +use crate::safekeeper::Term; use crate::timeline::TimelineError; use crate::wal_service::ConnectionId; use crate::{GlobalTimelines, SafeKeeperConf}; @@ -44,7 +45,7 @@ pub struct SafekeeperPostgresHandler { /// Parsed Postgres command. enum SafekeeperPostgresCommand { StartWalPush, - StartReplication { start_lsn: Lsn }, + StartReplication { start_lsn: Lsn, term: Option<Term> }, IdentifySystem, TimelineStatus, JSONCtrl { cmd: AppendLogicalMessage }, @@ -55,15 +56,21 @@ fn parse_cmd(cmd: &str) -> anyhow::Result<SafekeeperPostgresCommand> { Ok(SafekeeperPostgresCommand::StartWalPush) } else if cmd.starts_with("START_REPLICATION") { let re = Regex::new( - r"START_REPLICATION(?: SLOT [^ ]+)?(?: PHYSICAL)? ([[:xdigit:]]+/[[:xdigit:]]+)", + // We follow the postgres START_REPLICATION LOGICAL options format to pass the term. + r"START_REPLICATION(?: SLOT [^ ]+)?(?: PHYSICAL)? 
([[:xdigit:]]+/[[:xdigit:]]+)(?: \(term='(\d+)'\))?", ) .unwrap(); - let mut caps = re.captures_iter(cmd); - let start_lsn = caps - .next() - .map(|cap| Lsn::from_str(&cap[1])) - .context("parse start LSN from START_REPLICATION command")??; - Ok(SafekeeperPostgresCommand::StartReplication { start_lsn }) + let caps = re + .captures(cmd) + .context(format!("failed to parse START_REPLICATION command {}", cmd))?; + let start_lsn = + Lsn::from_str(&caps[1]).context("parse start LSN from START_REPLICATION command")?; + let term = if let Some(m) = caps.get(2) { + Some(m.as_str().parse::<Term>().context("invalid term")?) + } else { + None + }; + Ok(SafekeeperPostgresCommand::StartReplication { start_lsn, term }) } else if cmd.starts_with("IDENTIFY_SYSTEM") { Ok(SafekeeperPostgresCommand::IdentifySystem) } else if cmd.starts_with("TIMELINE_STATUS") { @@ -218,8 +225,8 @@ impl postgres_backend::Handler .instrument(info_span!("WAL receiver", ttid = %span_ttid)) .await } - SafekeeperPostgresCommand::StartReplication { start_lsn } => { - self.handle_start_replication(pgb, start_lsn) + SafekeeperPostgresCommand::StartReplication { start_lsn, term } => { + self.handle_start_replication(pgb, start_lsn, term) .instrument(info_span!("WAL sender", ttid = %span_ttid)) .await } diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index a0d4bb35d709a..808177d2c439a 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -2,6 +2,7 @@ //! with the "START_REPLICATION" message, and registry of walsenders. use crate::handler::SafekeeperPostgresHandler; +use crate::safekeeper::Term; use crate::timeline::Timeline; use crate::wal_service::ConnectionId; use crate::wal_storage::WalReader; @@ -359,8 +360,12 @@ impl SafekeeperPostgresHandler { &mut self, pgb: &mut PostgresBackend, start_pos: Lsn, + term: Option<Term>, ) -> Result<(), QueryError> { - if let Err(end) = self.handle_start_replication_guts(pgb, start_pos).await { + if let Err(end) = self + .handle_start_replication_guts(pgb, start_pos, term) + .await + { // Log the result and probably send it to the client, closing the stream. pgb.handle_copy_stream_end(end).await; } @@ -371,6 +376,7 @@ impl SafekeeperPostgresHandler { &mut self, pgb: &mut PostgresBackend, start_pos: Lsn, + term: Option<Term>, ) -> Result<(), CopyStreamHandlerEnd> { let appname = self.appname.clone(); let tli = @@ -440,6 +446,7 @@ impl SafekeeperPostgresHandler { start_pos, end_pos, stop_pos, + term, commit_lsn_watch_rx, ws_guard: ws_guard.clone(), wal_reader, @@ -476,6 +483,10 @@ struct WalSender<'a, IO> { // If present, terminate after reaching this position; used by walproposer // in recovery. stop_pos: Option<Lsn>, + /// When streaming the uncommitted part, the term in which the client acts + /// as the leader. Streaming is stopped if the local term changes to a + /// different (higher) value. + term: Option<Term>, commit_lsn_watch_rx: Receiver<Lsn>, ws_guard: Arc, wal_reader: WalReader, @@ -518,8 +529,18 @@ impl WalSender<'_, IO> { .0 as usize; send_size = min(send_size, self.send_buf.len()); let send_buf = &mut self.send_buf[..send_size]; - // read wal into buffer - send_size = self.wal_reader.read(send_buf).await?; + let send_size: usize; + { + // If the uncommitted part is being pulled, check that the term is + // still the expected one. + let _term_guard = if let Some(t) = self.term { + Some(self.tli.acquire_term(t).await?) + } else { + None + }; + // read wal into buffer + send_size = self.wal_reader.read(send_buf).await? 
+ }; let send_buf = &send_buf[..send_size]; // and send it diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 9ec89b86d4375..df51c45affbf5 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -499,6 +499,19 @@ impl Timeline { false } + /// Ensure that the current term is t, erroring otherwise, and lock the state. + pub async fn acquire_term(&self, t: Term) -> Result> { + let ss = self.write_shared_state().await; + if ss.sk.state.acceptor_state.term != t { + bail!( + "failed to acquire term {}, current term {}", + t, + ss.sk.state.acceptor_state.term + ); + } + Ok(ss) + } + /// Returns whether s3 offloading is required and sets current status as /// matching it. pub async fn wal_backup_attend(&self) -> bool { diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 7ba979745b922..7c7ca941639d2 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -32,6 +32,7 @@ from _pytest.config import Config from _pytest.config.argparsing import Parser from _pytest.fixtures import FixtureRequest +from mypy_boto3_s3 import S3Client # Type-related stuff from psycopg2.extensions import connection as PgConnection @@ -432,7 +433,7 @@ def __init__( self.port_distributor = port_distributor self.remote_storage = remote_storage self.ext_remote_storage: Optional[S3Storage] = None - self.remote_storage_client: Optional[Any] = None + self.remote_storage_client: Optional[S3Client] = None self.remote_storage_users = remote_storage_users self.broker = broker self.run_id = run_id @@ -875,7 +876,14 @@ def get_safekeeper_connstrs(self) -> str: def timeline_dir(self, tenant_id: TenantId, timeline_id: TimelineId) -> Path: """Get a timeline directory's path based on the repo directory of the test environment""" - return self.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id) + return self.tenant_dir(tenant_id) / "timelines" / str(timeline_id) + + def tenant_dir( + self, + tenant_id: TenantId, + ) -> Path: + """Get a tenant directory's path based on the repo directory of the test environment""" + return self.repo_dir / "tenants" / str(tenant_id) def get_pageserver_version(self) -> str: bin_pageserver = str(self.neon_binpath / "pageserver") @@ -1518,6 +1526,8 @@ def __init__(self, env: NeonEnv, port: PageserverPort, config_override: Optional # Pageserver timeline deletion should be polled until it gets 404, so ignore it globally ".*Error processing HTTP request: NotFound: Timeline .* was not found", ".*took more than expected to complete.*", + # these can happen during shutdown, but they should not be a reason to fail a test + ".*completed, took longer than expected.*", ] def start( @@ -2817,8 +2827,15 @@ def check_restored_datadir_content( endpoint: Endpoint, ): # Get the timeline ID. 
We need it for the 'basebackup' command - timeline = TimelineId(endpoint.safe_psql("SHOW neon.timeline_id")[0][0]) + timeline_id = TimelineId(endpoint.safe_psql("SHOW neon.timeline_id")[0][0]) + + # many tests already checkpoint, but do it just in case + with closing(endpoint.connect()) as conn: + with conn.cursor() as cur: + cur.execute("CHECKPOINT") + + # wait for pageserver to catch up + wait_for_last_flush_lsn(env, endpoint, endpoint.tenant_id, timeline_id) # stop postgres to ensure that files won't change endpoint.stop() @@ -2833,7 +2850,7 @@ def check_restored_datadir_content( {psql_path} \ --no-psqlrc \ postgres://localhost:{env.pageserver.service_port.pg} \ - -c 'basebackup {endpoint.tenant_id} {timeline}' \ + -c 'basebackup {endpoint.tenant_id} {timeline_id}' \ | tar -x -C {restored_dir_path} """ diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 8c053c8073abc..a179ebdd09aeb 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -210,6 +210,10 @@ def tenant_detach(self, tenant_id: TenantId, detach_ignored=False): res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach", params=params) self.verbose_error(res) + def tenant_delete(self, tenant_id: TenantId): + res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}") + self.verbose_error(res) + def tenant_load(self, tenant_id: TenantId): res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/load") self.verbose_error(res) diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index 119c99bb96846..6032ff5b6810f 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -1,9 +1,11 @@ import time -from typing import Any, Dict, Optional +from typing import TYPE_CHECKING, Any, Dict, Optional from fixtures.log_helper import log from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient +from fixtures.remote_storage import RemoteStorageKind, S3Storage from fixtures.types import Lsn, TenantId, TimelineId +from fixtures.utils import wait_until def assert_tenant_state( @@ -17,15 +19,6 @@ def assert_tenant_state( assert tenant_status["state"]["slug"] == expected_state, message or tenant_status -def tenant_exists(pageserver_http: PageserverHttpClient, tenant_id: TenantId): - tenants = pageserver_http.tenant_list() - matching = [t for t in tenants if TenantId(t["id"]) == tenant_id] - assert len(matching) < 2 - if len(matching) == 0: - return None - return matching[0] - - def remote_consistent_lsn( pageserver_http: PageserverHttpClient, tenant: TenantId, timeline: TimelineId ) -> Lsn: @@ -199,20 +192,19 @@ def wait_timeline_detail_404( timeline_id: TimelineId, iterations: int, ): - last_exc = None - for _ in range(iterations): - time.sleep(0.250) + def timeline_is_missing(): + data = {} try: data = pageserver_http.timeline_detail(tenant_id, timeline_id) - log.info(f"detail {data}") + log.info(f"timeline detail {data}") except PageserverApiException as e: log.debug(e) if e.status_code == 404: return - last_exc = e + raise RuntimeError(f"Timeline still exists, state: {data.get('state')}") - raise last_exc or RuntimeError(f"Timeline wasnt deleted in time, state: {data['state']}") + wait_until(iterations, interval=0.250, func=timeline_is_missing) def timeline_delete_wait_completed( @@ -224,3 +216,72 @@ def timeline_delete_wait_completed( ): pageserver_http.timeline_delete(tenant_id=tenant_id, 
timeline_id=timeline_id, **delete_args) wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, iterations) + + +if TYPE_CHECKING: + # TODO: avoid this by combining the remote-storage-related stuff in a single type + # and just passing in that type instead of the whole builder + from fixtures.neon_fixtures import NeonEnvBuilder + + +def assert_prefix_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None): + # For local_fs we need to properly handle empty directories, which we currently don't, so for simplicity stick to the s3 api. + assert neon_env_builder.remote_storage_kind in ( + RemoteStorageKind.MOCK_S3, + RemoteStorageKind.REAL_S3, + ) + # For mypy + assert isinstance(neon_env_builder.remote_storage, S3Storage) + assert neon_env_builder.remote_storage_client is not None + + # Note that this doesn't use pagination, so the list is not guaranteed to be exhaustive. + response = neon_env_builder.remote_storage_client.list_objects_v2( + Bucket=neon_env_builder.remote_storage.bucket_name, + Prefix=prefix or neon_env_builder.remote_storage.prefix_in_bucket or "", + ) + objects = response.get("Contents") + assert ( + response["KeyCount"] == 0 + ), f"remote dir with prefix {prefix} is not empty after deletion: {objects}" + + +def wait_tenant_status_404( + pageserver_http: PageserverHttpClient, + tenant_id: TenantId, + iterations: int, + interval: float = 0.250, +): + def tenant_is_missing(): + data = {} + try: + data = pageserver_http.tenant_status(tenant_id) + log.info(f"tenant status {data}") + except PageserverApiException as e: + log.debug(e) + if e.status_code == 404: + return + + raise RuntimeError(f"Tenant still exists, state: {data.get('state')}") + + wait_until(iterations, interval=interval, func=tenant_is_missing) + + +def tenant_delete_wait_completed( + pageserver_http: PageserverHttpClient, + tenant_id: TenantId, + iterations: int, +): + pageserver_http.tenant_delete(tenant_id=tenant_id) + wait_tenant_status_404(pageserver_http, tenant_id=tenant_id, iterations=iterations) + + +MANY_SMALL_LAYERS_TENANT_CONFIG = { + "gc_period": "0s", + "compaction_period": "0s", + "checkpoint_distance": f"{1024**2}", + "image_creation_threshold": "100", +} + + +def poll_for_remote_storage_iterations(remote_storage_kind: RemoteStorageKind) -> int: + return 20 if remote_storage_kind is RemoteStorageKind.REAL_S3 else 8 diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 2c8b4f4303ae0..d03d2e7595f4c 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -6,13 +6,16 @@ import tarfile import time from pathlib import Path -from typing import Any, Callable, Dict, List, Tuple, TypeVar +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Tuple, TypeVar from urllib.parse import urlencode import allure from psycopg2.extensions import cursor from fixtures.log_helper import log + +if TYPE_CHECKING: + from fixtures.neon_fixtures import PgBin from fixtures.types import TimelineId Fn = TypeVar("Fn", bound=Callable[..., Any]) @@ -300,17 +303,13 @@ def wait_until(number_of_iterations: int, interval: float, func: Fn): raise Exception("timed out while waiting for %s" % func) from last_exception -def wait_while(number_of_iterations: int, interval: float, func): +def run_pg_bench_small(pg_bin: "PgBin", connstr: str): """ - Wait until 'func' returns false, or throws an exception. + Fast way to populate data. 
+ For more layers, consider combining with these tenant settings: + { + "checkpoint_distance": 1024 ** 2, + "image_creation_threshold": 100, + } """ - for i in range(number_of_iterations): - try: - if not func(): - return - log.info("waiting for %s iteration %s failed", func, i + 1) - time.sleep(interval) - continue - except Exception: - return - raise Exception("timed out while waiting for %s" % func) + pg_bin.run(["pgbench", "-i", "-I dtGvp", "-s1", connstr]) diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py index c4bee1a1ea9db..b208616345e90 100644 --- a/test_runner/regress/test_download_extensions.py +++ b/test_runner/regress/test_download_extensions.py @@ -82,6 +82,7 @@ def upload_files(env): # Test downloading remote extension. @pytest.mark.parametrize("remote_storage_kind", available_s3_storages()) +@pytest.mark.skip(reason="https://github.com/neondatabase/neon/issues/4949") def test_remote_extensions( neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind, @@ -148,6 +149,7 @@ def test_remote_extensions( # Test downloading remote library. @pytest.mark.parametrize("remote_storage_kind", available_s3_storages()) +@pytest.mark.skip(reason="https://github.com/neondatabase/neon/issues/4949") def test_remote_library( neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind, @@ -205,10 +207,11 @@ def test_remote_library( # Here we test a complex extension # which has multiple extensions in one archive # using postgis as an example -@pytest.mark.skipif( - RemoteStorageKind.REAL_S3 not in available_s3_storages(), - reason="skipping test because real s3 not enabled", -) +# @pytest.mark.skipif( +# RemoteStorageKind.REAL_S3 not in available_s3_storages(), +# reason="skipping test because real s3 not enabled", +# ) +@pytest.mark.skip(reason="https://github.com/neondatabase/neon/issues/4949") def test_multiple_extensions_one_archive( neon_env_builder: NeonEnvBuilder, pg_version: PgVersion, @@ -253,6 +256,7 @@ def test_multiple_extensions_one_archive( # Run the test with multiple simultaneous connections to an endpoint. 
# +@pytest.mark.skip(reason="https://github.com/neondatabase/neon/issues/4949") def test_extension_download_after_restart( neon_env_builder: NeonEnvBuilder, pg_version: PgVersion, diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index ee600db2c60f7..f26d04e2f3021 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -56,10 +56,6 @@ def test_pg_regress( with capsys.disabled(): pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath) - # checkpoint one more time to ensure that the lsn we get is the latest one - endpoint.safe_psql("CHECKPOINT") - - # Check that we restore the content of the datadir correctly check_restored_datadir_content(test_output_dir, env, endpoint) @@ -166,9 +162,4 @@ def test_sql_regress( with capsys.disabled(): pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath) - # checkpoint one more time to ensure that the lsn we get is the latest one - endpoint.safe_psql("CHECKPOINT") - endpoint.safe_psql("select pg_current_wal_insert_lsn()")[0][0] - - # Check that we restore the content of the datadir correctly check_restored_datadir_content(test_output_dir, env, endpoint) diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index 35334ec7b245b..fabec6b5bc3e0 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -269,7 +269,9 @@ def qq(queries: List[Tuple[str, Optional[List[Any]]]], read_only: bool = False) connstr = f"postgresql://http:http@{static_proxy.domain}:{static_proxy.proxy_port}/postgres" response = requests.post( f"https://{static_proxy.domain}:{static_proxy.external_http_port}/sql", - data=json.dumps(list(map(lambda x: {"query": x[0], "params": x[1] or []}, queries))), + data=json.dumps( + {"queries": list(map(lambda x: {"query": x[0], "params": x[1] or []}, queries))} + ), headers={ "Content-Type": "application/sql", "Neon-Connection-String": connstr, diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 68116985d3f01..d642a6d19093c 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -97,6 +97,11 @@ def test_remote_storage_backup_and_restore( tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0]) timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0]) + # Thats because of UnreliableWrapper's injected failures + env.pageserver.allowed_errors.append( + f".*failed to fetch tenant deletion mark at tenants/({tenant_id}|{env.initial_tenant})/deleted attempt 1.*" + ) + checkpoint_numbers = range(1, 3) for checkpoint_number in checkpoint_numbers: @@ -168,9 +173,7 @@ def test_remote_storage_backup_and_restore( # # The initiated attach operation should survive the restart, and continue from where it was. 
env.pageserver.stop() - layer_download_failed_regex = ( - r"download.*[0-9A-F]+-[0-9A-F]+.*open a download stream for layer.*simulated failure" - ) + layer_download_failed_regex = r"Failed to download a remote file: simulated failure of remote operation Download.*[0-9A-F]+-[0-9A-F]+" assert not env.pageserver.log_contains( layer_download_failed_regex ), "we shouldn't have tried any layer downloads yet since list remote timelines has a failpoint" @@ -203,7 +206,7 @@ def test_remote_storage_backup_and_restore( == f"{data}|{checkpoint_number}" ) - log.info("ensure that we neede to retry downloads due to test_remote_failures=1") + log.info("ensure that we needed to retry downloads due to test_remote_failures=1") assert env.pageserver.log_contains(layer_download_failed_regex) diff --git a/test_runner/regress/test_subxacts.py b/test_runner/regress/test_subxacts.py index 494820ef8ef90..eb96a8faa47d4 100644 --- a/test_runner/regress/test_subxacts.py +++ b/test_runner/regress/test_subxacts.py @@ -33,8 +33,4 @@ def test_subxacts(neon_simple_env: NeonEnv, test_output_dir): cur.execute(f"insert into t1 values ({i}, {j})") cur.execute("commit") - # force wal flush - cur.execute("checkpoint") - - # Check that we can restore the content of the datadir correctly check_restored_datadir_content(test_output_dir, env, endpoint) diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py new file mode 100644 index 0000000000000..f0f6cc743c180 --- /dev/null +++ b/test_runner/regress/test_tenant_delete.py @@ -0,0 +1,262 @@ +import enum +import os + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + PgBin, + last_flush_lsn_upload, + wait_for_last_flush_lsn, +) +from fixtures.pageserver.http import PageserverApiException +from fixtures.pageserver.utils import ( + MANY_SMALL_LAYERS_TENANT_CONFIG, + assert_prefix_empty, + poll_for_remote_storage_iterations, + tenant_delete_wait_completed, + wait_tenant_status_404, + wait_until_tenant_active, + wait_until_tenant_state, +) +from fixtures.remote_storage import RemoteStorageKind, available_remote_storages +from fixtures.types import TenantId +from fixtures.utils import run_pg_bench_small + + +@pytest.mark.parametrize( + "remote_storage_kind", [RemoteStorageKind.NOOP, *available_remote_storages()] +) +def test_tenant_delete_smoke( + neon_env_builder: NeonEnvBuilder, + remote_storage_kind: RemoteStorageKind, + pg_bin: PgBin, +): + neon_env_builder.pageserver_config_override = "test_remote_failures=1" + + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_tenant_delete_smoke", + ) + + env = neon_env_builder.init_start() + + ps_http = env.pageserver.http_client() + + # first try to delete a non-existent tenant + tenant_id = TenantId.generate() + env.pageserver.allowed_errors.append(f".*NotFound: tenant {tenant_id}.*") + with pytest.raises(PageserverApiException, match=f"NotFound: tenant {tenant_id}"): + ps_http.tenant_delete(tenant_id=tenant_id) + + env.neon_cli.create_tenant( + tenant_id=tenant_id, + conf=MANY_SMALL_LAYERS_TENANT_CONFIG, + ) + + # create two timelines, one being the parent of the other + parent = None + for timeline in ["first", "second"]: + timeline_id = env.neon_cli.create_branch( + timeline, tenant_id=tenant_id, ancestor_branch_name=parent + ) + with env.endpoints.create_start(timeline, tenant_id=tenant_id) as endpoint: + run_pg_bench_small(pg_bin, endpoint.connstr()) + wait_for_last_flush_lsn(env, endpoint, 
tenant=tenant_id, timeline=timeline_id) + + parent = timeline + + iterations = poll_for_remote_storage_iterations(remote_storage_kind) + + tenant_delete_wait_completed(ps_http, tenant_id, iterations) + + tenant_path = env.tenant_dir(tenant_id=tenant_id) + assert not tenant_path.exists() + + if remote_storage_kind in [RemoteStorageKind.MOCK_S3, RemoteStorageKind.REAL_S3]: + assert_prefix_empty( + neon_env_builder, + prefix="/".join( + ( + "tenants", + str(tenant_id), + ) + ), + ) + + +class Check(enum.Enum): + RETRY_WITHOUT_RESTART = enum.auto() + RETRY_WITH_RESTART = enum.auto() + + +FAILPOINTS = [ + "tenant-delete-before-shutdown", + "tenant-delete-before-create-remote-mark", + "tenant-delete-before-create-local-mark", + "tenant-delete-before-background", + "tenant-delete-before-polling-ongoing-deletions", + "tenant-delete-before-cleanup-remaining-fs-traces", + "tenant-delete-before-remove-timelines-dir", + "tenant-delete-before-remove-deleted-mark", + "tenant-delete-before-remove-tenant-dir", + # Some failpoints from timeline deletion + "timeline-delete-before-index-deleted-at", + "timeline-delete-before-rm", + "timeline-delete-before-index-delete", + "timeline-delete-after-rm-dir", +] + +FAILPOINTS_BEFORE_BACKGROUND = [ + "timeline-delete-before-schedule", + "tenant-delete-before-shutdown", + "tenant-delete-before-create-remote-mark", + "tenant-delete-before-create-local-mark", + "tenant-delete-before-background", +] + + +def combinations(): + result = [] + + remotes = [RemoteStorageKind.NOOP, RemoteStorageKind.MOCK_S3] + if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE"): + remotes.append(RemoteStorageKind.REAL_S3) + + for remote_storage_kind in remotes: + for delete_failpoint in FAILPOINTS: + if remote_storage_kind is RemoteStorageKind.NOOP and delete_failpoint in ( + "timeline-delete-before-index-delete", + ): + # the above failpoints are not relevant for configs without remote storage + continue + + # Simulate failures for only one type of remote storage + # to avoid log pollution and make tests run faster + if remote_storage_kind is RemoteStorageKind.MOCK_S3: + simulate_failures = True + else: + simulate_failures = False + result.append((remote_storage_kind, delete_failpoint, simulate_failures)) return result + + +@pytest.mark.parametrize("remote_storage_kind, failpoint, simulate_failures", combinations()) +@pytest.mark.parametrize("check", list(Check)) +def test_delete_tenant_exercise_crash_safety_failpoints( + neon_env_builder: NeonEnvBuilder, + remote_storage_kind: RemoteStorageKind, + failpoint: str, + simulate_failures: bool, + check: Check, + pg_bin: PgBin, +): + if simulate_failures: + neon_env_builder.pageserver_config_override = "test_remote_failures=1" + + neon_env_builder.enable_remote_storage( + remote_storage_kind, "test_delete_tenant_exercise_crash_safety_failpoints" + ) + + env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG) + + tenant_id = env.initial_tenant + + env.pageserver.allowed_errors.extend( + [ + # From deletion polling + f".*NotFound: tenant {env.initial_tenant}.*", + # allow errors caused by failpoints + f".*failpoint: {failpoint}", + # This appears when the flush loop was stopped during a deletion attempt and the pageserver is then stopped + ".*freeze_and_flush_on_shutdown.*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + # We may leave some upload tasks in the queue. They're likely deletes. + # For uploads we explicitly wait with `last_flush_lsn_upload` below. 
+ # So by ignoring these instead of waiting for an empty upload queue, + # we execute more distinct code paths. + '.*stopping left-over name="remote upload".*', + ] + ) + + ps_http = env.pageserver.http_client() + + timeline_id = env.neon_cli.create_timeline("delete", tenant_id=tenant_id) + with env.endpoints.create_start("delete", tenant_id=tenant_id) as endpoint: + # generate enough layers + run_pg_bench_small(pg_bin, endpoint.connstr()) + if remote_storage_kind is RemoteStorageKind.NOOP: + wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) + else: + last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id) + + ps_http.configure_failpoints((failpoint, "return")) + + iterations = poll_for_remote_storage_iterations(remote_storage_kind) + + # These failpoints fire before the background task is spawned, + # so they result in an API request failure. + if failpoint in FAILPOINTS_BEFORE_BACKGROUND: + with pytest.raises(PageserverApiException, match=failpoint): + ps_http.tenant_delete(tenant_id) + + else: + ps_http.tenant_delete(tenant_id) + tenant_info = wait_until_tenant_state( + pageserver_http=ps_http, + tenant_id=tenant_id, + expected_state="Broken", + iterations=iterations, + ) + + reason = tenant_info["state"]["data"]["reason"] + log.info(f"tenant broken: {reason}") + + # failpoint may not be the only error in the stack + assert reason.endswith(f"failpoint: {failpoint}"), reason + + if check is Check.RETRY_WITH_RESTART: + env.pageserver.stop() + env.pageserver.start() + + if ( + remote_storage_kind is RemoteStorageKind.NOOP + and failpoint == "tenant-delete-before-create-local-mark" + ): + tenant_delete_wait_completed(ps_http, tenant_id, iterations=iterations) + elif failpoint in ( + "tenant-delete-before-shutdown", + "tenant-delete-before-create-remote-mark", + ): + wait_until_tenant_active( + ps_http, tenant_id=tenant_id, iterations=iterations, period=0.25 + ) + tenant_delete_wait_completed(ps_http, tenant_id, iterations=iterations) + else: + # Pageserver should've resumed deletion after restart. + wait_tenant_status_404(ps_http, tenant_id, iterations=iterations + 10) + elif check is Check.RETRY_WITHOUT_RESTART: + # this should succeed + # this also checks that delete can be retried even when tenant is in Broken state + ps_http.configure_failpoints((failpoint, "off")) + + tenant_delete_wait_completed(ps_http, tenant_id, iterations=iterations) + + # Check remote is empty + if remote_storage_kind is RemoteStorageKind.MOCK_S3: + assert_prefix_empty( + neon_env_builder, + prefix="/".join( + ( + "tenants", + str(tenant_id), + ) + ), + ) + + tenant_dir = env.tenant_dir(tenant_id) + # Check local is empty + assert not tenant_dir.exists() + + +# TODO test concurrent deletions with "hang" failpoint +# TODO test tenant delete continues after attach diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 932d8219543e0..b189510a9e412 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -66,6 +66,10 @@ def test_tenant_reattach( env.pageserver.allowed_errors.append( f".*Tenant {tenant_id} will not become active\\. 
Current state: Stopping.*" ) + # That's because of UnreliableWrapper's injected failures + env.pageserver.allowed_errors.append( + f".*failed to fetch tenant deletion mark at tenants/({tenant_id}|{env.initial_tenant})/deleted attempt 1.*" + ) with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: with endpoint.cursor() as cur: diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index eb020c101f4cd..32ad5381b45be 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -17,9 +17,9 @@ from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import ( assert_tenant_state, - tenant_exists, wait_for_last_record_lsn, wait_for_upload, + wait_tenant_status_404, ) from fixtures.port_distributor import PortDistributor from fixtures.remote_storage import RemoteStorageKind, available_remote_storages @@ -29,7 +29,6 @@ start_in_background, subprocess_capture, wait_until, - wait_while, ) @@ -269,11 +268,16 @@ def test_tenant_relocation( env = neon_env_builder.init_start() + tenant_id = TenantId("74ee8b079a0e437eb0afea7d26a07209") + # FIXME: Is this expected? env.pageserver.allowed_errors.append( ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*" ) + # Needed for detach polling. + env.pageserver.allowed_errors.append(f".*NotFound: tenant {tenant_id}.*") + # create folder for remote storage mock remote_storage_mock_path = env.repo_dir / "local_fs_remote_storage" @@ -283,9 +287,7 @@ def test_tenant_relocation( pageserver_http = env.pageserver.http_client() - tenant_id, initial_timeline_id = env.neon_cli.create_tenant( - TenantId("74ee8b079a0e437eb0afea7d26a07209") - ) + _, initial_timeline_id = env.neon_cli.create_tenant(tenant_id) log.info("tenant to relocate %s initial_timeline_id %s", tenant_id, initial_timeline_id) env.neon_cli.create_branch("test_tenant_relocation_main", tenant_id=tenant_id) @@ -469,11 +471,8 @@ def test_tenant_relocation( pageserver_http.tenant_detach(tenant_id) # Wait a little, so that the detach operation has time to finish. 
- wait_while( - number_of_iterations=100, - interval=1, - func=lambda: tenant_exists(pageserver_http, tenant_id), - ) + wait_tenant_status_404(pageserver_http, tenant_id, iterations=100, interval=1) + post_migration_check(ep_main, 500500, old_local_path_main) post_migration_check(ep_second, 1001000, old_local_path_second) diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index 79a3b353d4586..fee95e5420c9d 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -146,6 +146,11 @@ def test_tenants_attached_after_download( tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0]) timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0]) + # That's because of UnreliableWrapper's injected failures + env.pageserver.allowed_errors.append( + f".*failed to fetch tenant deletion mark at tenants/({tenant_id}|{env.initial_tenant})/deleted attempt 1.*" + ) + for checkpoint_number in range(1, 3): with endpoint.cursor() as cur: cur.execute( diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 1e8dd3620618e..26caeb8ffb689 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -4,7 +4,6 @@ import shutil import threading from pathlib import Path -from typing import Optional import pytest import requests @@ -18,6 +17,8 @@ ) from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import ( + assert_prefix_empty, + poll_for_remote_storage_iterations, timeline_delete_wait_completed, wait_for_last_record_lsn, wait_for_upload, @@ -27,7 +28,6 @@ ) from fixtures.remote_storage import ( RemoteStorageKind, - S3Storage, available_remote_storages, ) from fixtures.types import Lsn, TenantId, TimelineId @@ -187,10 +187,9 @@ def test_delete_timeline_exercise_crash_safety_failpoints( 8. Retry or restart without the failpoint and check the result. """ - if remote_storage_kind is not None: - neon_env_builder.enable_remote_storage( - remote_storage_kind, "test_delete_timeline_exercise_crash_safety_failpoints" - ) + neon_env_builder.enable_remote_storage( + remote_storage_kind, "test_delete_timeline_exercise_crash_safety_failpoints" + ) env = neon_env_builder.init_start( initial_tenant_conf={ @@ -231,7 +230,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints( ps_http.configure_failpoints((failpoint, "return")) - iterations = 20 if remote_storage_kind is RemoteStorageKind.REAL_S3 else 4 + iterations = poll_for_remote_storage_iterations(remote_storage_kind) # These failpoints are earlier than background task is spawned. # so they result in api request failure. 
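[Editor's note] The tenant and timeline deletion tests in this patch all drive the same failpoint cycle: arm a failpoint with "return", observe the failure (or the tenant going Broken), then disarm it or restart the pageserver and verify the operation completes. A condensed sketch of that flow using the helpers this patch adds to fixtures/pageserver/utils.py (assumes ps_http, tenant_id, and iterations are set up as in the tests above):

    # Arm the failpoint so the next delete fails at a known step.
    ps_http.configure_failpoints(("tenant-delete-before-background", "return"))
    with pytest.raises(PageserverApiException, match="tenant-delete-before-background"):
        ps_http.tenant_delete(tenant_id)

    # Disarm and retry: deletion is expected to be retryable from any failure point.
    ps_http.configure_failpoints(("tenant-delete-before-background", "off"))
    tenant_delete_wait_completed(ps_http, tenant_id, iterations)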
@@ -280,14 +279,14 @@ def test_delete_timeline_exercise_crash_safety_failpoints( "remote_storage_s3_request_seconds_count", filter={"request_type": "get_object", "result": "err"}, ).value - == 1 + == 2 # One is the missing tenant deletion mark, the second is the missing index part ) assert ( m.query_one( "remote_storage_s3_request_seconds_count", filter={"request_type": "get_object", "result": "ok"}, ).value - == 1 + == 1 # index part for initial timeline ) elif check is Check.RETRY_WITHOUT_RESTART: # this should succeed @@ -413,27 +412,6 @@ def test_timeline_resurrection_on_attach( assert all([tl["state"] == "Active" for tl in timelines]) -def assert_prefix_empty(neon_env_builder: NeonEnvBuilder, prefix: Optional[str] = None): - # For local_fs we need to properly handle empty directories, which we currently dont, so for simplicity stick to s3 api. - assert neon_env_builder.remote_storage_kind in ( - RemoteStorageKind.MOCK_S3, - RemoteStorageKind.REAL_S3, - ) - # For mypy - assert isinstance(neon_env_builder.remote_storage, S3Storage) - - # Note that this doesnt use pagination, so list is not guaranteed to be exhaustive. - assert neon_env_builder.remote_storage_client is not None - response = neon_env_builder.remote_storage_client.list_objects_v2( - Bucket=neon_env_builder.remote_storage.bucket_name, - Prefix=prefix or neon_env_builder.remote_storage.prefix_in_bucket or "", - ) - objects = response.get("Contents") - assert ( - response["KeyCount"] == 0 - ), f"remote dir with prefix {prefix} is not empty after deletion: {objects}" - - def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuilder): """ When deleting a timeline, if we succeed in setting the deleted flag remotely diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 87f076d236927..9132efe79f276 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -869,6 +869,49 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): assert debug_dump_1["config"]["id"] == env.safekeepers[0].id + +class DummyConsumer(object): + def __call__(self, msg): + pass + + +def test_start_replication_term(neon_env_builder: NeonEnvBuilder): + """ + Test START_REPLICATION of the uncommitted part specifying the leader term. It must + error if the safekeeper switched to a different term. 
+ """ + + env = neon_env_builder.init_start() + + env.neon_cli.create_branch("test_start_replication_term") + endpoint = env.endpoints.create_start("test_start_replication_term") + + endpoint.safe_psql("CREATE TABLE t(key int primary key, value text)") + + # learn neon timeline from compute + tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0]) + timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0]) + + sk = env.safekeepers[0] + sk_http_cli = sk.http_client() + tli_status = sk_http_cli.timeline_status(tenant_id, timeline_id) + timeline_start_lsn = tli_status.timeline_start_lsn + + conn_opts = { + "host": "127.0.0.1", + "options": f"-c timeline_id={timeline_id} tenant_id={tenant_id}", + "port": sk.port.pg, + "connection_factory": psycopg2.extras.PhysicalReplicationConnection, + } + sk_pg_conn = psycopg2.connect(**conn_opts) # type: ignore + with sk_pg_conn.cursor() as cur: + # should fail, as first start has term 2 + cur.start_replication_expert(f"START_REPLICATION {timeline_start_lsn} (term='3')") + dummy_consumer = DummyConsumer() + with pytest.raises(psycopg2.errors.InternalError_) as excinfo: + cur.consume_stream(dummy_consumer) + assert "failed to acquire term 3" in str(excinfo.value) + + # Test auth on WAL service (postgres protocol) ports. def test_sk_auth(neon_env_builder: NeonEnvBuilder): neon_env_builder.auth_enabled = True diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 28bf5ccfa2fda..5d5cfee12783f 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 28bf5ccfa2fda9677566a25abd450e714d9ed055 +Subproject commit 5d5cfee12783f0989a9c9fe13bb40b5585812568 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 553f2d3618a6d..026d6b093d49e 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 553f2d3618a6d4893bde67f1c065926ee8a3a118 +Subproject commit 026d6b093d49e25cec44dd04598152329ceac027 diff --git a/vendor/revisions.json b/vendor/revisions.json index 80d161938effe..63b72cf5063f0 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,4 +1,4 @@ { - "postgres-v15": "553f2d3618a6d4893bde67f1c065926ee8a3a118", - "postgres-v14": "28bf5ccfa2fda9677566a25abd450e714d9ed055" + "postgres-v15": "026d6b093d49e25cec44dd04598152329ceac027", + "postgres-v14": "5d5cfee12783f0989a9c9fe13bb40b5585812568" }