tests: add test_timeline_archival_chaos (#9609)
## Problem

- We lack test coverage of cases where multiple timelines fight for
  updates to the same manifest (#9557), and of timeline archival changes
  while dual-attached (#9555)

## Summary of changes

- Add a chaos test for timeline creation->archival->offload->deletion
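
A rough sketch of the shape of that loop (illustrative only: the `client` object and its method names below are hypothetical stand-ins for the pageserver HTTP client used by the real test, which also injects restarts and failpoints):

```python
import random
import uuid


def chaos_iteration(client, tenant_id, tolerated_statuses=(404, 409, 503)):
    """Walk one timeline through create -> archive -> offload -> delete.

    Illustrative sketch only: `client` stands in for the pageserver HTTP
    client and its method names are hypothetical. 404/409/503 responses are
    expected when operations race with each other or with restarts.
    """
    timeline_id = uuid.uuid4().hex

    steps = [
        lambda: client.timeline_create(tenant_id, timeline_id),
        lambda: client.timeline_archive(tenant_id, timeline_id),
        lambda: client.timeline_offload(tenant_id, timeline_id),
        lambda: client.timeline_delete(tenant_id, timeline_id),
    ]
    for step in steps:
        try:
            step()
        except Exception as e:
            status = getattr(e, "status_code", None)
            if status not in tolerated_statuses:
                raise
            return  # abandon this timeline and move on; racing is the point


def run_chaos(client, tenant_id, iterations=200):
    for _ in range(iterations):
        chaos_iteration(client, tenant_id)
        if random.random() < 0.05:
            client.restart_pageserver()  # hypothetical restart injection
```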
jcsp authored Nov 14, 2024
1 parent 49b599c commit 93939f1
Showing 7 changed files with 340 additions and 33 deletions.
1 change: 1 addition & 0 deletions pageserver/src/http/routes.rs
@@ -324,6 +324,7 @@ impl From<crate::tenant::DeleteTimelineError> for ApiError {
.into_boxed_str(),
),
a @ AlreadyInProgress(_) => ApiError::Conflict(a.to_string()),
Cancelled => ApiError::ResourceUnavailable("shutting down".into()),
Other(e) => ApiError::InternalServerError(e),
}
}
4 changes: 4 additions & 0 deletions pageserver/src/tenant.rs
@@ -700,6 +700,9 @@ pub enum DeleteTimelineError {
#[error("Timeline deletion is already in progress")]
AlreadyInProgress(Arc<tokio::sync::Mutex<DeleteTimelineFlow>>),

#[error("Cancelled")]
Cancelled,

#[error(transparent)]
Other(#[from] anyhow::Error),
}
@@ -710,6 +713,7 @@ impl Debug for DeleteTimelineError {
Self::NotFound => write!(f, "NotFound"),
Self::HasChildren(c) => f.debug_tuple("HasChildren").field(c).finish(),
Self::AlreadyInProgress(_) => f.debug_tuple("AlreadyInProgress").finish(),
Self::Cancelled => f.debug_tuple("Cancelled").finish(),
Self::Other(e) => f.debug_tuple("Other").field(e).finish(),
}
}
40 changes: 26 additions & 14 deletions pageserver/src/tenant/remote_timeline_client.rs
@@ -243,7 +243,7 @@ use self::index::IndexPart;
use super::metadata::MetadataUpdate;
use super::storage_layer::{Layer, LayerName, ResidentLayer};
use super::upload_queue::{NotInitialized, SetDeletedFlagProgress};
use super::Generation;
use super::{DeleteTimelineError, Generation};

pub(crate) use download::{
download_index_part, download_tenant_manifest, is_temp_download_file,
@@ -1550,15 +1550,17 @@ impl RemoteTimelineClient {
/// Prerequisites: UploadQueue should be in the stopped state and deleted_at should be successfully set.
/// The function deletes layer files one by one, then lists the prefix to see if we leaked something,
/// deletes leaked files if any, and proceeds with deletion of the index file at the end.
pub(crate) async fn delete_all(self: &Arc<Self>) -> anyhow::Result<()> {
pub(crate) async fn delete_all(self: &Arc<Self>) -> Result<(), DeleteTimelineError> {
debug_assert_current_span_has_tenant_and_timeline_id();

let layers: Vec<RemotePath> = {
let mut locked = self.upload_queue.lock().unwrap();
let stopped = locked.stopped_mut()?;
let stopped = locked.stopped_mut().map_err(DeleteTimelineError::Other)?;

if !matches!(stopped.deleted_at, SetDeletedFlagProgress::Successful(_)) {
anyhow::bail!("deleted_at is not set")
return Err(DeleteTimelineError::Other(anyhow::anyhow!(
"deleted_at is not set"
)));
}

debug_assert!(stopped.upload_queue_for_deletion.no_pending_work());
@@ -1593,23 +1595,29 @@ impl RemoteTimelineClient {
};

let layer_deletion_count = layers.len();
self.deletion_queue_client.push_immediate(layers).await?;
self.deletion_queue_client
.push_immediate(layers)
.await
.map_err(|_| DeleteTimelineError::Cancelled)?;

// Delete the initdb.tar.zst, which is not always present, but deletion attempts of
// nonexistent objects are not considered errors.
let initdb_path =
remote_initdb_archive_path(&self.tenant_shard_id.tenant_id, &self.timeline_id);
self.deletion_queue_client
.push_immediate(vec![initdb_path])
.await?;
.await
.map_err(|_| DeleteTimelineError::Cancelled)?;

// Do not delete the index part yet, it is needed for a possible retry. If we remove it first
// and the retry arrives at a different pageserver, there won't be any trace of it on remote storage
let timeline_storage_path = remote_timeline_path(&self.tenant_shard_id, &self.timeline_id);

// Execute all pending deletions, so that when we proceed to do a listing below, we aren't
// taking the burden of listing all the layers that we already know we should delete.
self.flush_deletion_queue().await?;
self.flush_deletion_queue()
.await
.map_err(|_| DeleteTimelineError::Cancelled)?;

let cancel = shutdown_token();

@@ -1672,28 +1680,32 @@
if !remaining_layers.is_empty() {
self.deletion_queue_client
.push_immediate(remaining_layers)
.await?;
.await
.map_err(|_| DeleteTimelineError::Cancelled)?;
}

fail::fail_point!("timeline-delete-before-index-delete", |_| {
Err(anyhow::anyhow!(
Err(DeleteTimelineError::Other(anyhow::anyhow!(
"failpoint: timeline-delete-before-index-delete"
))?
)))?
});

debug!("enqueuing index part deletion");
self.deletion_queue_client
.push_immediate([latest_index].to_vec())
.await?;
.await
.map_err(|_| DeleteTimelineError::Cancelled)?;

// Timeline deletion is rare and we have probably emitted a reasonable number of objects: wait
// for a flush to a persistent deletion list so that we may be sure deletion will occur.
self.flush_deletion_queue().await?;
self.flush_deletion_queue()
.await
.map_err(|_| DeleteTimelineError::Cancelled)?;

fail::fail_point!("timeline-delete-after-index-delete", |_| {
Err(anyhow::anyhow!(
Err(DeleteTimelineError::Other(anyhow::anyhow!(
"failpoint: timeline-delete-after-index-delete"
))?
)))?
});

info!(prefix=%timeline_storage_path, referenced=layer_deletion_count, not_referenced=%not_referenced_count, "done deleting in timeline prefix, including index_part.json");
44 changes: 28 additions & 16 deletions pageserver/src/tenant/timeline/delete.rs
@@ -5,6 +5,7 @@ use std::{

use anyhow::Context;
use pageserver_api::{models::TimelineState, shard::TenantShardId};
use remote_storage::DownloadError;
use tokio::sync::OwnedMutexGuard;
use tracing::{error, info, info_span, instrument, Instrument};
use utils::{crashsafe, fs_ext, id::TimelineId, pausable_failpoint};
@@ -16,7 +17,7 @@ use crate::{
metadata::TimelineMetadata,
remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient},
CreateTimelineCause, DeleteTimelineError, MaybeDeletedIndexPart, Tenant,
TimelineOrOffloaded,
TenantManifestError, TimelineOrOffloaded,
},
virtual_file::MaybeFatalIo,
};
@@ -110,13 +111,6 @@ pub(super) async fn delete_local_timeline_directory(
info!("finished deleting layer files, releasing locks");
}

/// Removes remote layers and an index file after them.
async fn delete_remote_layers_and_index(
remote_client: &Arc<RemoteTimelineClient>,
) -> anyhow::Result<()> {
remote_client.delete_all().await.context("delete_all")
}

/// It is important that this gets called when DeletionGuard is being held.
/// For more context see comments in [`DeleteTimelineFlow::prepare`]
async fn remove_maybe_offloaded_timeline_from_tenant(
@@ -221,11 +215,24 @@
None => {
let remote_client = tenant
.build_timeline_client(timeline.timeline_id(), tenant.remote_storage.clone());
let result = remote_client
let result = match remote_client
.download_index_file(&tenant.cancel)
.instrument(info_span!("download_index_file"))
.await
.map_err(|e| DeleteTimelineError::Other(anyhow::anyhow!("error: {:?}", e)))?;
{
Ok(r) => r,
Err(DownloadError::NotFound) => {
// Deletion is already complete
tracing::info!("Timeline already deleted in remote storage");
return Ok(());
}
Err(e) => {
return Err(DeleteTimelineError::Other(anyhow::anyhow!(
"error: {:?}",
e
)));
}
};
let index_part = match result {
MaybeDeletedIndexPart::Deleted(p) => {
tracing::info!("Timeline already set as deleted in remote index");
@@ -406,7 +413,12 @@ impl DeleteTimelineFlow {
"timeline_delete",
async move {
if let Err(err) = Self::background(guard, conf, &tenant, &timeline, remote_client).await {
error!("Error: {err:#}");
// Only log as an error if it's not a cancellation.
if matches!(err, DeleteTimelineError::Cancelled) {
info!("Shutdown during timeline deletion");
} else {
error!("Error: {err:#}");
}
if let TimelineOrOffloaded::Timeline(timeline) = timeline {
timeline.set_broken(format!("{err:#}"))
}
@@ -438,7 +450,7 @@ impl DeleteTimelineFlow {
Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
});

delete_remote_layers_and_index(&remote_client).await?;
remote_client.delete_all().await?;

pausable_failpoint!("in_progress_delete");

@@ -449,10 +461,10 @@
// So indeed, the tenant manifest might refer to an offloaded timeline which has already been deleted.
// However, we handle this case in tenant loading code so the next time we attach, the issue is
// resolved.
tenant
.store_tenant_manifest()
.await
.map_err(|e| DeleteTimelineError::Other(anyhow::anyhow!(e)))?;
tenant.store_tenant_manifest().await.map_err(|e| match e {
TenantManifestError::Cancelled => DeleteTimelineError::Cancelled,
_ => DeleteTimelineError::Other(e.into()),
})?;

*guard = Self::Finished;

1 change: 1 addition & 0 deletions storage_controller/src/service.rs
@@ -3642,6 +3642,7 @@ impl Service {
match res {
Ok(ok) => Ok(ok),
Err(mgmt_api::Error::ApiError(StatusCode::CONFLICT, _)) => Ok(StatusCode::CONFLICT),
Err(mgmt_api::Error::ApiError(StatusCode::SERVICE_UNAVAILABLE, msg)) => Err(ApiError::ResourceUnavailable(msg.into())),
Err(e) => {
Err(
ApiError::InternalServerError(anyhow::anyhow!(
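
Because timeline deletion can now surface a 503 while the pageserver is shutting down, and the storage controller forwards it as ResourceUnavailable, callers that race with restarts are expected to retry. A minimal sketch of such a retry wrapper, assuming a hypothetical `do_request` callable:

```python
import time


def retry_on_unavailable(do_request, max_attempts=10, backoff_s=0.5):
    """Retry a request that may fail with 503 while the pageserver restarts.

    Illustrative only: `do_request` is a hypothetical zero-argument callable
    returning a response object with a `status_code` attribute.
    """
    for attempt in range(1, max_attempts + 1):
        response = do_request()
        if response.status_code != 503:
            return response
        time.sleep(backoff_s * attempt)  # linear backoff between attempts
    raise RuntimeError(f"still unavailable after {max_attempts} attempts")
```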
22 changes: 21 additions & 1 deletion test_runner/fixtures/neon_fixtures.py
@@ -2379,6 +2379,17 @@ def __init__(self, env: NeonEnv, id: int, port: PageserverPort, az_id: str):
#
# The entries in the list are regular expressions.
self.allowed_errors: list[str] = list(DEFAULT_PAGESERVER_ALLOWED_ERRORS)
# Store persistent failpoints that should be reapplied on each start
self._persistent_failpoints: dict[str, str] = {}

def add_persistent_failpoint(self, name: str, action: str):
"""
Add a failpoint that will be automatically reapplied each time the pageserver starts.
The failpoint will be set immediately if the pageserver is running.
"""
self._persistent_failpoints[name] = action
if self.running:
self.http_client().configure_failpoints([(name, action)])

def timeline_dir(
self,
@@ -2446,6 +2457,15 @@ def start(
"""
assert self.running is False

if self._persistent_failpoints:
# Tests shouldn't use this mechanism _and_ set FAILPOINTS explicitly
assert extra_env_vars is None or "FAILPOINTS" not in extra_env_vars
if extra_env_vars is None:
extra_env_vars = {}
extra_env_vars["FAILPOINTS"] = ",".join(
f"{k}={v}" for (k, v) in self._persistent_failpoints.items()
)

storage = self.env.pageserver_remote_storage
if isinstance(storage, S3Storage):
s3_env_vars = storage.access_env_vars()
@@ -4522,7 +4542,7 @@ def pytest_addoption(parser: Parser):


SMALL_DB_FILE_NAME_REGEX: re.Pattern[str] = re.compile(
r"config-v1|heatmap-v1|metadata|.+\.(?:toml|pid|json|sql|conf)"
r"config-v1|heatmap-v1|tenant-manifest|metadata|.+\.(?:toml|pid|json|sql|conf)"
)
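
A hedged usage sketch for the persistent-failpoint helper added above: since `start()` now folds `_persistent_failpoints` into the `FAILPOINTS` environment variable, a failpoint set this way survives pageserver restarts. The test below is illustrative only and not part of this commit:

```python
def test_deletion_shutdown_race(neon_env_builder):
    # Assumes the usual NeonEnvBuilder fixture pattern from neon_fixtures.py;
    # the test name and body here are illustrative.
    env = neon_env_builder.init_start()
    ps = env.pageserver

    # "timeline-delete-before-index-delete" is one of the failpoints visible
    # in the diff above; it is reapplied on every subsequent start().
    ps.add_persistent_failpoint("timeline-delete-before-index-delete", "return")

    ps.stop()
    ps.start()  # start() re-injects the failpoint via the FAILPOINTS env var
```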



1 comment on commit 93939f1

@github-actions

5509 tests run: 5253 passed, 3 failed, 253 skipped (full report)


Failures on Postgres 16

  • test_sharded_ingest[github-actions-selfhosted-1]: release-x86-64
  • test_compaction_l0_memory[github-actions-selfhosted]: release-x86-64
  • test_storage_controller_many_tenants[github-actions-selfhosted]: release-x86-64
# Run all failed tests locally:
scripts/pytest -vv -n $(nproc) -k "test_sharded_ingest[release-pg16-github-actions-selfhosted-1] or test_compaction_l0_memory[release-pg16-github-actions-selfhosted] or test_storage_controller_many_tenants[release-pg16-github-actions-selfhosted]"

Code coverage* (full report)

  • functions: 31.8% (7892 of 24840 functions)
  • lines: 49.4% (62459 of 126373 lines)

* collected from Rust tests only


The comment gets automatically updated with the latest test results
93939f1 at 2024-11-14T19:24:52.774Z :recycle:
