Skip to content

Commit

Permalink
Support tenant manifests in the scrubber (#9942)
Browse files Browse the repository at this point in the history
Support tenant manifests in the storage scrubber:

* list the manifests, order them by generation
* delete all manifests except for the two most recent generations
* for the latest manifest: try parsing it.

I've tested this patch by running it against a staging bucket, and it
successfully deleted the old manifests (and avoided deleting the latest
two generations).

In follow-up work, we might want to also check some invariants of the
manifest, as mentioned in #8088.

Part of #9386
Part of #8088

---------

Co-authored-by: Christian Schwarz <[email protected]>
  • Loading branch information
arpad-m and problame authored Dec 3, 2024
1 parent 9ef0662 commit ca85f36
Show file tree
Hide file tree
Showing 5 changed files with 459 additions and 54 deletions.
4 changes: 2 additions & 2 deletions pageserver/src/tenant/remote_timeline_client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2564,9 +2564,9 @@ pub fn parse_remote_index_path(path: RemotePath) -> Option<Generation> {
}

/// Given the key of a tenant manifest, parse out the generation number
pub(crate) fn parse_remote_tenant_manifest_path(path: RemotePath) -> Option<Generation> {
pub fn parse_remote_tenant_manifest_path(path: RemotePath) -> Option<Generation> {
static RE: OnceLock<Regex> = OnceLock::new();
let re = RE.get_or_init(|| Regex::new(r".+tenant-manifest-([0-9a-f]{8}).json").unwrap());
let re = RE.get_or_init(|| Regex::new(r".*tenant-manifest-([0-9a-f]{8}).json").unwrap());
re.captures(path.get_path().as_str())
.and_then(|c| c.get(1))
.and_then(|m| Generation::parse_suffix(m.as_str()))
Expand Down
2 changes: 1 addition & 1 deletion pageserver/src/tenant/remote_timeline_client/manifest.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ impl TenantManifest {
offloaded_timelines: vec![],
}
}
pub(crate) fn from_json_bytes(bytes: &[u8]) -> Result<Self, serde_json::Error> {
pub fn from_json_bytes(bytes: &[u8]) -> Result<Self, serde_json::Error> {
serde_json::from_slice::<Self>(bytes)
}

Expand Down
135 changes: 134 additions & 1 deletion storage_scrubber/src/checks.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,21 @@ use itertools::Itertools;
use pageserver::tenant::checks::check_valid_layermap;
use pageserver::tenant::layer_map::LayerMap;
use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
use pageserver::tenant::remote_timeline_client::manifest::TenantManifest;
use pageserver_api::shard::ShardIndex;
use tokio_util::sync::CancellationToken;
use tracing::{info, warn};
use utils::generation::Generation;
use utils::id::TimelineId;
use utils::shard::TenantShardId;

use crate::cloud_admin_api::BranchData;
use crate::metadata_stream::stream_listing;
use crate::{download_object_with_retries, RootTarget, TenantShardTimelineId};
use futures_util::StreamExt;
use pageserver::tenant::remote_timeline_client::{parse_remote_index_path, remote_layer_path};
use pageserver::tenant::remote_timeline_client::{
parse_remote_index_path, parse_remote_tenant_manifest_path, remote_layer_path,
};
use pageserver::tenant::storage_layer::LayerName;
use pageserver::tenant::IndexPart;
use remote_storage::{GenericRemoteStorage, ListingObject, RemotePath};
Expand Down Expand Up @@ -527,3 +531,132 @@ async fn list_timeline_blobs_impl(
unknown_keys,
}))
}

/// Summary of the tenant manifests found for one tenant shard, as returned by
/// `list_tenant_manifests` when the newest manifest was downloaded and parsed.
pub(crate) struct RemoteTenantManifestInfo {
/// Highest generation among `manifests`. `list_tenant_manifests` always
/// fills this with `Some(..)`.
pub(crate) latest_generation: Option<Generation>,
/// Every listed manifest object paired with the generation parsed from its
/// key, in listing order (not sorted by generation).
pub(crate) manifests: Vec<(Generation, ListingObject)>,
}

/// Outcome of `list_tenant_manifests`.
pub(crate) enum ListTenantManifestResult {
/// The listing did not produce a successfully parsed latest manifest.
WithErrors {
/// `(key or prefix, description)` pairs describing what went wrong.
/// This is empty when no manifest objects were listed at all.
errors: Vec<(String, String)>,
/// Listed objects whose key could not be recognised as a tenant manifest.
// NOTE(review): presumably no caller reads this field yet, hence the
// `dead_code` allowance -- confirm before removing it.
#[allow(dead_code)]
unknown_keys: Vec<ListingObject>,
},
/// All listed keys were tenant manifests and the one with the highest
/// generation was downloaded and parsed successfully.
NoErrors(RemoteTenantManifestInfo),
}

/// Lists the tenant manifests in remote storage and parses the latest one,
/// returning a [`ListTenantManifestResult`] object.
///
/// The listing covers every object under the tenant's prefix whose name starts
/// with `tenant-manifest`. Objects whose key yields a generation are collected
/// as manifests; anything else is recorded as an unknown key.
///
/// Returns:
/// * [`ListTenantManifestResult::NoErrors`] when only manifests were listed and
///   the one with the highest generation could be downloaded and parsed.
/// * [`ListTenantManifestResult::WithErrors`] otherwise. `errors` is empty when
///   no manifest was found at all, and non-empty when unknown keys were listed
///   or the latest manifest failed to download or parse.
///
/// # Errors
///
/// Propagates errors yielded by the object listing stream. Download and parse
/// failures of the latest manifest are reported through `WithErrors` instead.
///
/// # Panics
///
/// Panics if the listing yields an entry without a `ListingObject`.
pub(crate) async fn list_tenant_manifests(
    remote_client: &GenericRemoteStorage,
    tenant_id: TenantShardId,
    root_target: &RootTarget,
) -> anyhow::Result<ListTenantManifestResult> {
    const TENANT_MANIFEST_STEM: &str = "tenant-manifest";

    let mut errors = Vec::new();
    let mut unknown_keys = Vec::new();
    let mut manifests: Vec<(Generation, ListingObject)> = Vec::new();

    // Narrow the listing to `<tenant prefix>tenant-manifest*` and clear the
    // delimiter. The unmodified tenant prefix is kept for error reporting.
    let mut tenant_root_target = root_target.tenant_root(&tenant_id);
    let original_prefix = tenant_root_target.prefix_in_bucket.clone();
    tenant_root_target.prefix_in_bucket += TENANT_MANIFEST_STEM;
    tenant_root_target.delimiter = String::new();

    let prefix_str: &str = original_prefix
        .strip_prefix("/")
        .unwrap_or(&original_prefix);

    let mut stream = std::pin::pin!(stream_listing(remote_client, &tenant_root_target));
    while let Some(entry) = stream.next().await {
        // NOTE(review): presumably an entry without an object can only occur
        // for prefix entries, which the empty delimiter should rule out --
        // confirm against `stream_listing`.
        let (key, Some(obj)) = entry? else {
            panic!("ListingObject not specified");
        };

        // A key counts as a manifest only if its object name carries the stem
        // and a generation can be parsed out of the full path.
        let generation = key
            .object_name()
            .filter(|name| name.starts_with(TENANT_MANIFEST_STEM))
            .and_then(|_| parse_remote_tenant_manifest_path(key.clone()));

        match generation {
            Some(generation) => {
                tracing::debug!("tenant manifest {key}");
                manifests.push((generation, obj));
            }
            None => {
                tracing::info!("Listed an unknown key: {key}");
                unknown_keys.push(obj);
            }
        }
    }

    if manifests.is_empty() {
        tracing::debug!("No manifest for tenant.");

        // Deliberately reported with an empty `errors` list.
        return Ok(ListTenantManifestResult::WithErrors {
            errors,
            unknown_keys,
        });
    }
    if !unknown_keys.is_empty() {
        errors.push((prefix_str.to_owned(), "unknown keys listed".to_string()));

        return Ok(ListTenantManifestResult::WithErrors {
            errors,
            unknown_keys,
        });
    }

    // Find the manifest with the highest generation. Only its key is needed
    // from here on, so avoid cloning the whole listing object.
    let (latest_generation, latest_key) = manifests
        .iter()
        .max_by_key(|i| i.0)
        .map(|(generation, obj)| (*generation, obj.key.clone()))
        .expect("manifests was checked to be non-empty above");

    let manifest_bytes = match download_object_with_retries(remote_client, &latest_key).await {
        Ok(bytes) => bytes,
        Err(e) => {
            // It is possible that the tenant gets deleted in-between we list the objects
            // and we download the manifest file.
            errors.push((
                latest_key.get_path().as_str().to_owned(),
                format!("failed to download tenant-manifest.json: {e}"),
            ));
            return Ok(ListTenantManifestResult::WithErrors {
                errors,
                unknown_keys,
            });
        }
    };

    // The parsed manifest itself is not inspected; parsing only validates it.
    match TenantManifest::from_json_bytes(&manifest_bytes) {
        Ok(_manifest) => Ok(ListTenantManifestResult::NoErrors(
            RemoteTenantManifestInfo {
                latest_generation: Some(latest_generation),
                manifests,
            },
        )),
        Err(parse_error) => {
            errors.push((
                latest_key.get_path().as_str().to_owned(),
                format!("tenant-manifest.json body parsing error: {parse_error}"),
            ));
            Ok(ListTenantManifestResult::WithErrors {
                errors,
                unknown_keys,
            })
        }
    }
}
Loading

1 comment on commit ca85f36

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

7155 tests run: 6837 passed, 0 failed, 318 skipped (full report)


Flaky tests (3)

Postgres 16

Postgres 15

Postgres 14

Code coverage* (full report)

  • functions: 30.8% (8264 of 26844 functions)
  • lines: 47.8% (65163 of 136375 lines)

* collected from Rust tests only


The comment gets automatically updated with the latest test results
ca85f36 at 2024-12-03T22:35:06.949Z :recycle:

Please sign in to comment.