Skip to content

Commit

Permalink
layer file download: final rename: fix durability
Browse files Browse the repository at this point in the history
Before this PR, the layer file download code would fsync the inode after
rename instead of the timeline directory. That is not in line with what
a comment further up says we're doing, and it's obviously not achieving
the goal of making the rename durable.

part of #6663
  • Loading branch information
problame committed Mar 1, 2024
1 parent 54586d6 commit 68a4668
Showing 1 changed file with 13 additions and 8 deletions.
21 changes: 13 additions & 8 deletions pageserver/src/tenant/remote_timeline_client/download.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,14 @@ use tokio::io::{AsyncSeekExt, AsyncWriteExt};
use tokio_util::io::StreamReader;
use tokio_util::sync::CancellationToken;
use tracing::warn;
use utils::{backoff, crashsafe};
use utils::backoff;

use crate::config::PageServerConf;
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
use crate::tenant::storage_layer::LayerFileName;
use crate::tenant::Generation;
use crate::virtual_file::on_fatal_io_error;
use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile};
use crate::TEMP_FILE_SUFFIX;
use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode};
use utils::crashsafe::path_with_suffix_extension;
Expand Down Expand Up @@ -50,9 +50,8 @@ pub async fn download_layer_file<'a>(
) -> Result<u64, DownloadError> {
debug_assert_current_span_has_tenant_and_timeline_id();

let local_path = conf
.timeline_path(&tenant_shard_id, &timeline_id)
.join(layer_file_name.file_name());
let timeline_path = conf.timeline_path(&tenant_shard_id, &timeline_id);
let local_path = timeline_path.join(layer_file_name.file_name());

let remote_path = remote_layer_path(
&tenant_shard_id.tenant_id,
Expand Down Expand Up @@ -149,10 +148,16 @@ pub async fn download_layer_file<'a>(
.with_context(|| format!("rename download layer file to {local_path}"))
.map_err(DownloadError::Other)?;

crashsafe::fsync_async(&local_path)
// We use fatal_err() below because the after the rename above,
// the in-memory state of the filesystem already has the layer file in its final place,
// and subsequent pageserver code could think it's durable while it really isn't.
let timeline_dir = VirtualFile::open(&timeline_path)
.await
.with_context(|| format!("fsync layer file {local_path}"))
.map_err(DownloadError::Other)?;
.fatal_err("VirtualFile::open for timeline dir fsync");
timeline_dir
.sync_all()
.await
.fatal_err("VirtualFile::sync_all timeline dir");

tracing::debug!("download complete: {local_path}");

Expand Down

0 comments on commit 68a4668

Please sign in to comment.