Skip to content

Commit

Permalink
VirtualFile: add crash-safe overwrite abstraction & use it (#5186)
Browse files Browse the repository at this point in the history
(part of #4743)
(preliminary to #5180)
 
This PR adds a special-purpose API to `VirtualFile` for write-once
files.
It adopts it for `save_metadata` and `persist_tenant_conf`.

This is helpful for the asyncification efforts (#4743) and specifically
asyncification of `VirtualFile` because above two functions were the
only ones that needed the VirtualFile to be an `std::io::Write`.
(There was also `manifest.rs` that needed the `std::io::Write`, but, it
isn't used right now, and likely won't be used because we're taking a
different route for crash consistency, see #5172. So, let's remove it.
It'll be in Git history if we need to re-introduce it when picking up
the compaction work again; that's why it was introduced in the first
place).

We can't remove the `impl std::io::Write for VirtualFile` just yet
because of the `BufWriter` in

```rust
struct DeltaLayerWriterInner {
...
    blob_writer: WriteBlobWriter<BufWriter<VirtualFile>>,
}
```

But, @arpad-m and I have a plan to get rid of that by extracting the
append-only-ness-on-top-of-VirtualFile that #4994 added to
`EphemeralFile` into an abstraction that can be re-used in the
`DeltaLayerWriterInner` and `ImageLayerWriterInner`.
That'll be another PR.


### Performance Impact

This PR adds more fsyncs compared to before because we fsync the parent
directory every time.

1. For `save_metadata`, the additional fsyncs are unnecessary because we
know that `metadata` fits into a kernel page, and hence the write won't
be torn on the way into the kernel. However, the `metadata` file in
general is going to lose signficance very soon (=> see #5172), and the
NVMes in prod can definitely handle the additional fsync. So, let's not
worry about it.
2. For `persist_tenant_conf`, which we don't check to fit into a single
kernel page, this PR makes it actually crash-consistent. Before, we
could crash while writing out the tenant conf, leaving a prefix of the
tenant conf on disk.
  • Loading branch information
problame authored Sep 2, 2023
1 parent 41aa627 commit 7e81778
Show file tree
Hide file tree
Showing 6 changed files with 112 additions and 436 deletions.
97 changes: 22 additions & 75 deletions pageserver/src/tenant.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,7 @@ use std::fmt::Debug;
use std::fmt::Display;
use std::fs;
use std::fs::File;
use std::fs::OpenOptions;
use std::io;
use std::io::Write;
use std::ops::Bound::Included;
use std::path::Path;
use std::path::PathBuf;
Expand Down Expand Up @@ -115,7 +113,6 @@ pub mod block_io;
pub mod disk_btree;
pub(crate) mod ephemeral_file;
pub mod layer_map;
pub mod manifest;
mod span;

pub mod metadata;
Expand Down Expand Up @@ -407,7 +404,6 @@ impl Tenant {
remote_startup_data: Option<RemoteStartupData>,
local_metadata: Option<TimelineMetadata>,
ancestor: Option<Arc<Timeline>>,
first_save: bool,
init_order: Option<&InitializationOrder>,
_ctx: &RequestContext,
) -> anyhow::Result<()> {
Expand Down Expand Up @@ -441,14 +437,8 @@ impl Tenant {

// Save the metadata file to local disk.
if !picked_local {
save_metadata(
self.conf,
&tenant_id,
&timeline_id,
up_to_date_metadata,
first_save,
)
.context("save_metadata")?;
save_metadata(self.conf, &tenant_id, &timeline_id, up_to_date_metadata)
.context("save_metadata")?;
}

let index_part = remote_startup_data.as_ref().map(|x| &x.index_part);
Expand Down Expand Up @@ -833,7 +823,6 @@ impl Tenant {
}),
local_metadata,
ancestor,
true,
None,
ctx,
)
Expand Down Expand Up @@ -1386,7 +1375,6 @@ impl Tenant {
remote_startup_data,
Some(local_metadata),
ancestor,
false,
init_order,
ctx,
)
Expand Down Expand Up @@ -2378,68 +2366,33 @@ impl Tenant {
tenant_id: &TenantId,
target_config_path: &Path,
tenant_conf: TenantConfOpt,
creating_tenant: bool,
) -> anyhow::Result<()> {
let _enter = info_span!("saving tenantconf").entered();

// imitate a try-block with a closure
let do_persist = |target_config_path: &Path| -> anyhow::Result<()> {
let target_config_parent = target_config_path.parent().with_context(|| {
format!(
"Config path does not have a parent: {}",
target_config_path.display()
)
})?;

info!("persisting tenantconf to {}", target_config_path.display());
info!("persisting tenantconf to {}", target_config_path.display());

let mut conf_content = r#"# This file contains a specific per-tenant's config.
let mut conf_content = r#"# This file contains a specific per-tenant's config.
# It is read in case of pageserver restart.
[tenant_config]
"#
.to_string();

// Convert the config to a toml file.
conf_content += &toml_edit::ser::to_string(&tenant_conf)?;

let mut target_config_file = VirtualFile::open_with_options(
target_config_path,
OpenOptions::new()
.truncate(true) // This needed for overwriting with small config files
.write(true)
.create_new(creating_tenant)
// when creating a new tenant, first_save will be true and `.create(true)` will be
// ignored (per rust std docs).
//
// later when updating the config of created tenant, or persisting config for the
// first time for attached tenant, the `.create(true)` is used.
.create(true),
)?;

target_config_file
.write(conf_content.as_bytes())
.context("write toml bytes into file")
.and_then(|_| target_config_file.sync_all().context("fsync config file"))
.context("write config file")?;

// fsync the parent directory to ensure the directory entry is durable.
// before this was done conditionally on creating_tenant, but these management actions are rare
// enough to just fsync it always.

crashsafe::fsync(target_config_parent)?;
// XXX we're not fsyncing the parent dir, need to do that in case `creating_tenant`
Ok(())
};
.to_string();

// this function is called from creating the tenant and updating the tenant config, which
// would otherwise share this context, so keep it here in one place.
do_persist(target_config_path).with_context(|| {
format!(
"write tenant {tenant_id} config to {}",
target_config_path.display()
)
})
// Convert the config to a toml file.
conf_content += &toml_edit::ser::to_string(&tenant_conf)?;

let conf_content = conf_content.as_bytes();

let temp_path = path_with_suffix_extension(target_config_path, TEMP_FILE_SUFFIX);
VirtualFile::crashsafe_overwrite(target_config_path, &temp_path, conf_content)
.with_context(|| {
format!(
"write tenant {tenant_id} config to {}",
target_config_path.display()
)
})?;
Ok(())
}

//
Expand Down Expand Up @@ -2968,14 +2921,8 @@ impl Tenant {
anyhow::bail!("failpoint after-timeline-uninit-mark-creation");
});

save_metadata(
self.conf,
&self.tenant_id,
new_timeline_id,
new_metadata,
true,
)
.context("Failed to create timeline metadata")?;
save_metadata(self.conf, &self.tenant_id, new_timeline_id, new_metadata)
.context("Failed to create timeline metadata")?;
Ok(())
}

Expand Down Expand Up @@ -3215,7 +3162,7 @@ fn try_create_target_tenant_dir(
)
.with_context(|| format!("resolve tenant {tenant_id} temporary config path"))?;

Tenant::persist_tenant_config(tenant_id, &temporary_tenant_config_path, tenant_conf, true)?;
Tenant::persist_tenant_config(tenant_id, &temporary_tenant_config_path, tenant_conf)?;

crashsafe::create_dir(&temporary_tenant_timelines_dir).with_context(|| {
format!(
Expand Down
Loading

1 comment on commit 7e81778

@github-actions
Copy link

@github-actions github-actions bot commented on 7e81778 Sep 2, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

1692 tests run: 1613 passed, 0 failed, 79 skipped (full report)


Flaky tests (1)

Postgres 14

  • test_pgbench_intensive_init_workload[vanilla-1000]: release
The comment gets automatically updated with the latest test results
7e81778 at 2023-09-02T11:01:59.958Z :recycle:

Please sign in to comment.