Skip to content

Commit

Permalink
pageserver: generation-aware storage for TenantManifest (#9555)
Browse files Browse the repository at this point in the history
## Problem

When tenant manifest objects are written without a generation suffix,
concurrently attached pageservers may stamp on each others writes of the
manifest and cause undefined behavior.

Closes: #9543 

## Summary of changes

- Use download_generation_object helper when reading manifests, to
search for the most recent generation
- Use Tenant::generation as the generation suffix when writing
manifests.
  • Loading branch information
jcsp authored Oct 29, 2024
1 parent b77b9bd commit 8e2e9f0
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 28 deletions.
9 changes: 4 additions & 5 deletions pageserver/src/tenant.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1352,14 +1352,15 @@ impl Tenant {
)
.await?;
let (offloaded_add, tenant_manifest) =
match remote_timeline_client::do_download_tenant_manifest(
match remote_timeline_client::download_tenant_manifest(
remote_storage,
&self.tenant_shard_id,
self.generation,
&cancel,
)
.await
{
Ok((tenant_manifest, _generation)) => (
Ok((tenant_manifest, _generation, _manifest_mtime)) => (
format!("{} offloaded", tenant_manifest.offloaded_timelines.len()),
tenant_manifest,
),
Expand Down Expand Up @@ -3130,8 +3131,6 @@ impl Tenant {
}

let tenant_manifest = self.build_tenant_manifest();
// TODO: generation support
let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION;
for child_shard in child_shards {
tracing::info!(
"Uploading tenant manifest for child {}",
Expand All @@ -3140,7 +3139,7 @@ impl Tenant {
upload_tenant_manifest(
&self.remote_storage,
child_shard,
generation,
self.generation,
&tenant_manifest,
&self.cancel,
)
Expand Down
26 changes: 18 additions & 8 deletions pageserver/src/tenant/remote_timeline_client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,7 @@ use chrono::{NaiveDateTime, Utc};
pub(crate) use download::download_initdb_tar_zst;
use pageserver_api::models::TimelineArchivalState;
use pageserver_api::shard::{ShardIndex, TenantShardId};
use regex::Regex;
use scopeguard::ScopeGuard;
use tokio_util::sync::CancellationToken;
use utils::backoff::{
Expand All @@ -199,7 +200,7 @@ use utils::pausable_failpoint;

use std::collections::{HashMap, VecDeque};
use std::sync::atomic::{AtomicU32, Ordering};
use std::sync::{Arc, Mutex};
use std::sync::{Arc, Mutex, OnceLock};
use std::time::Duration;

use remote_storage::{
Expand Down Expand Up @@ -245,7 +246,7 @@ use super::upload_queue::{NotInitialized, SetDeletedFlagProgress};
use super::Generation;

pub(crate) use download::{
do_download_tenant_manifest, download_index_part, is_temp_download_file,
download_index_part, download_tenant_manifest, is_temp_download_file,
list_remote_tenant_shards, list_remote_timelines,
};
pub(crate) use index::LayerFileMetadata;
Expand Down Expand Up @@ -274,12 +275,6 @@ pub(crate) const BUFFER_SIZE: usize = 32 * 1024;
/// which we warn and skip.
const DELETION_QUEUE_FLUSH_TIMEOUT: Duration = Duration::from_secs(10);

/// Hardcode a generation for the tenant manifest for now so that we don't
/// need to deal with generation-less manifests in the future.
///
/// TODO: add proper generation support to all the places that use this.
pub(crate) const TENANT_MANIFEST_GENERATION: Generation = Generation::new(1);

pub enum MaybeDeletedIndexPart {
IndexPart(IndexPart),
Deleted(IndexPart),
Expand Down Expand Up @@ -2239,6 +2234,12 @@ pub fn remote_tenant_manifest_path(
RemotePath::from_string(&path).expect("Failed to construct path")
}

/// Prefix to all generations' manifest objects in a tenant shard
pub fn remote_tenant_manifest_prefix(tenant_shard_id: &TenantShardId) -> RemotePath {
let path = format!("tenants/{tenant_shard_id}/tenant-manifest",);
RemotePath::from_string(&path).expect("Failed to construct path")
}

pub fn remote_timelines_path(tenant_shard_id: &TenantShardId) -> RemotePath {
let path = format!("tenants/{tenant_shard_id}/{TIMELINES_SEGMENT_NAME}");
RemotePath::from_string(&path).expect("Failed to construct path")
Expand Down Expand Up @@ -2333,6 +2334,15 @@ pub fn parse_remote_index_path(path: RemotePath) -> Option<Generation> {
}
}

/// Given the key of a tenant manifest, parse out the generation number
pub(crate) fn parse_remote_tenant_manifest_path(path: RemotePath) -> Option<Generation> {
static RE: OnceLock<Regex> = OnceLock::new();
let re = RE.get_or_init(|| Regex::new(r".+tenant-manifest-([0-9a-f]{8}).json").unwrap());
re.captures(path.get_path().as_str())
.and_then(|c| c.get(1))
.and_then(|m| Generation::parse_suffix(m.as_str()))
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down
59 changes: 44 additions & 15 deletions pageserver/src/tenant/remote_timeline_client/download.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@ use utils::backoff;

use crate::config::PageServerConf;
use crate::context::RequestContext;
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::span::{
debug_assert_current_span_has_tenant_and_timeline_id, debug_assert_current_span_has_tenant_id,
};
use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
use crate::tenant::storage_layer::LayerName;
use crate::tenant::Generation;
Expand All @@ -36,9 +38,10 @@ use utils::pausable_failpoint;
use super::index::{IndexPart, LayerFileMetadata};
use super::manifest::TenantManifest;
use super::{
parse_remote_index_path, remote_index_path, remote_initdb_archive_path,
remote_initdb_preserved_archive_path, remote_tenant_manifest_path, remote_tenant_path,
FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, INITDB_PATH,
parse_remote_index_path, parse_remote_tenant_manifest_path, remote_index_path,
remote_initdb_archive_path, remote_initdb_preserved_archive_path, remote_tenant_manifest_path,
remote_tenant_manifest_prefix, remote_tenant_path, FAILED_DOWNLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES, INITDB_PATH,
};

///
Expand Down Expand Up @@ -365,32 +368,34 @@ async fn do_download_remote_path_retry_forever(
.await
}

pub async fn do_download_tenant_manifest(
async fn do_download_tenant_manifest(
storage: &GenericRemoteStorage,
tenant_shard_id: &TenantShardId,
_timeline_id: Option<&TimelineId>,
generation: Generation,
cancel: &CancellationToken,
) -> Result<(TenantManifest, Generation), DownloadError> {
// TODO: generation support
let generation = super::TENANT_MANIFEST_GENERATION;
) -> Result<(TenantManifest, Generation, SystemTime), DownloadError> {
let remote_path = remote_tenant_manifest_path(tenant_shard_id, generation);

let (manifest_bytes, _manifest_bytes_mtime) =
let (manifest_bytes, manifest_bytes_mtime) =
do_download_remote_path_retry_forever(storage, &remote_path, cancel).await?;

let tenant_manifest = TenantManifest::from_json_bytes(&manifest_bytes)
.with_context(|| format!("deserialize tenant manifest file at {remote_path:?}"))
.map_err(DownloadError::Other)?;

Ok((tenant_manifest, generation))
Ok((tenant_manifest, generation, manifest_bytes_mtime))
}

async fn do_download_index_part(
storage: &GenericRemoteStorage,
tenant_shard_id: &TenantShardId,
timeline_id: &TimelineId,
timeline_id: Option<&TimelineId>,
index_generation: Generation,
cancel: &CancellationToken,
) -> Result<(IndexPart, Generation, SystemTime), DownloadError> {
let timeline_id =
timeline_id.expect("A timeline ID is always provided when downloading an index");
let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation);

let (index_part_bytes, index_part_mtime) =
Expand Down Expand Up @@ -426,7 +431,7 @@ async fn do_download_index_part(
pub(crate) async fn download_generation_object<'a, T, DF, DFF, PF>(
storage: &'a GenericRemoteStorage,
tenant_shard_id: &'a TenantShardId,
timeline_id: &'a TimelineId,
timeline_id: Option<&'a TimelineId>,
my_generation: Generation,
what: &str,
prefix: RemotePath,
Expand All @@ -438,15 +443,15 @@ where
DF: Fn(
&'a GenericRemoteStorage,
&'a TenantShardId,
&'a TimelineId,
Option<&'a TimelineId>,
Generation,
&'a CancellationToken,
) -> DFF,
DFF: Future<Output = Result<(T, Generation, SystemTime), DownloadError>>,
PF: Fn(RemotePath) -> Option<Generation>,
T: 'static,
{
debug_assert_current_span_has_tenant_and_timeline_id();
debug_assert_current_span_has_tenant_id();

if my_generation.is_none() {
// Operating without generations: just fetch the generation-less path
Expand Down Expand Up @@ -552,11 +557,13 @@ pub(crate) async fn download_index_part(
my_generation: Generation,
cancel: &CancellationToken,
) -> Result<(IndexPart, Generation, SystemTime), DownloadError> {
debug_assert_current_span_has_tenant_and_timeline_id();

let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none());
download_generation_object(
storage,
tenant_shard_id,
timeline_id,
Some(timeline_id),
my_generation,
"index_part",
index_prefix,
Expand All @@ -567,6 +574,28 @@ pub(crate) async fn download_index_part(
.await
}

pub(crate) async fn download_tenant_manifest(
storage: &GenericRemoteStorage,
tenant_shard_id: &TenantShardId,
my_generation: Generation,
cancel: &CancellationToken,
) -> Result<(TenantManifest, Generation, SystemTime), DownloadError> {
let manifest_prefix = remote_tenant_manifest_prefix(tenant_shard_id);

download_generation_object(
storage,
tenant_shard_id,
None,
my_generation,
"tenant-manifest",
manifest_prefix,
do_download_tenant_manifest,
parse_remote_tenant_manifest_path,
cancel,
)
.await
}

pub(crate) async fn download_initdb_tar_zst(
conf: &'static PageServerConf,
storage: &GenericRemoteStorage,
Expand Down

1 comment on commit 8e2e9f0

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

5416 tests run: 5183 passed, 1 failed, 232 skipped (full report)


Failures on Postgres 16

  • test_basebackup_with_high_slru_count[github-actions-selfhosted-10-13-30]: release-x86-64
# Run all failed tests locally:
scripts/pytest -vv -n $(nproc) -k "test_basebackup_with_high_slru_count[release-pg16-github-actions-selfhosted-10-13-30]"

Code coverage* (full report)

  • functions: 31.5% (7760 of 24658 functions)
  • lines: 49.0% (61019 of 124532 lines)

* collected from Rust tests only


The comment gets automatically updated with the latest test results
8e2e9f0 at 2024-10-29T23:49:24.401Z :recycle:

Please sign in to comment.