From 32358cfd202942235915cf94efb57f6d814a7dfe Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 1 Mar 2024 11:33:37 +0000 Subject: [PATCH 1/2] layer file creation: remove redundant fsync()s The `writer.finish()` methods already fsync the inode, using `VirtualFile::sync_all()`. All that the callers need to do is fsync their directory, i.e., the timeline directory. Note that there's a call in the new compaction code that is apparently dead-at-runtime, so, I couldn't fix up any fsyncs there [Link](https://github.com/neondatabase/neon/blob/502b69b33bbd4ad1b0647e921a9c665249a2cd62/pageserver/src/tenant/timeline/compaction.rs#L204-L211). In the grand scheme of things, layer durability probably doesn't matter anymore because the remote storage is authoritative at all times as of #5198. But, let's not break the discipline in this commit. part of https://github.com/neondatabase/neon/issues/6663 --- pageserver/src/tenant/timeline.rs | 63 ++++++------------------------- 1 file changed, 11 insertions(+), 52 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 206f20306e5e..0c03ef33c344 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -10,7 +10,7 @@ mod walreceiver; use anyhow::{anyhow, bail, ensure, Context, Result}; use bytes::Bytes; -use camino::{Utf8Path, Utf8PathBuf}; +use camino::Utf8Path; use enumset::EnumSet; use fail::fail_point; use futures::stream::StreamExt; @@ -3422,26 +3422,10 @@ impl Timeline { let _g = span.entered(); let new_delta = Handle::current().block_on(frozen_layer.write_to_disk(&self_clone, &ctx))?; - let new_delta_path = new_delta.local_path().to_owned(); - // Sync it to disk. - // - // We must also fsync the timeline dir to ensure the directory entries for - // new layer files are durable. - // - // NB: timeline dir must be synced _after_ the file contents are durable. - // So, two separate fsyncs are required, they mustn't be batched. 
- // - // TODO: If we're running inside 'flush_frozen_layers' and there are multiple - // files to flush, the fsync overhead can be reduces as follows: - // 1. write them all to temporary file names - // 2. fsync them - // 3. rename to the final name - // 4. fsync the parent directory. - // Note that (1),(2),(3) today happen inside write_to_disk(). - // - // FIXME: the writer already fsyncs all data, only rename needs to be fsynced here - par_fsync::par_fsync(&[new_delta_path]).context("fsync of delta layer")?; + // The write_to_disk() above calls writer.finish() which already did the fsync of the inodes. + // We just need to fsync the directory in which these inodes are linked, + // which we know to be the timeline directory. par_fsync::par_fsync(&[self_clone .conf .timeline_path(&self_clone.tenant_shard_id, &self_clone.timeline_id)]) @@ -3674,25 +3658,10 @@ impl Timeline { } } - // Sync the new layer to disk before adding it to the layer map, to make sure - // we don't garbage collect something based on the new layer, before it has - // reached the disk. - // - // We must also fsync the timeline dir to ensure the directory entries for - // new layer files are durable - // - // Compaction creates multiple image layers. It would be better to create them all - // and fsync them all in parallel. - let all_paths = image_layers - .iter() - .map(|layer| layer.local_path().to_owned()) - .collect::>(); - - par_fsync::par_fsync_async(&all_paths) - .await - .context("fsync of newly created layer files")?; - - if !all_paths.is_empty() { + // The writer.finish() above already did the fsync of the inodes. + // We just need to fsync the directory in which these inodes are linked, + // which we know to be the timeline directory. 
+ if !image_layers.is_empty() { par_fsync::par_fsync_async(&[self .conf .timeline_path(&self.tenant_shard_id, &self.timeline_id)]) .await .context("fsync of timeline dir")?; From ca2ed0b84c9d10a173c2cf0120d3ca5fb19ac241 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 1 Mar 2024 12:31:38 +0000 Subject: [PATCH 2/2] layer file creation: fatal_err the timeline dir fsync As pointed out in the comments added in this PR: the in-memory state of the filesystem already has the layer file in its final place. If the fsync fails, but pageserver continues to execute, it's quite easy for subsequent pageserver code to observe the file being there and assume it's durable, where it really isn't. It can happen that we get ENOSPC during the fsync. However, 1. the timeline dir is small (remember, the big layer _file_ has already been synced). Small data means ENOSPC due to delayed allocation races etc are less likely. 2. what else are we going to do in that case? If we decide to bubble up the error, the file remains on disk. We could try to unlink it and fsync after the unlink. If that fails, we would _definitely_ need to error out. Is it worth the trouble though? 
Side note: all this logic about not carrying on after fsync failure implies that we `sync` the filesystem successfully before we restart the pageserver. Our systemd unit currently does not do that, but should. --- pageserver/src/tenant/timeline.rs | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 0c03ef33c344..0a2ae5d8bdfa 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -50,7 +50,6 @@ use tokio_util::sync::CancellationToken; use tracing::*; use utils::sync::gate::{Gate, GateGuard}; -use crate::pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind}; use crate::tenant::timeline::logical_size::CurrentLogicalSize; use crate::tenant::{ layer_map::{LayerMap, SearchResult}, @@ -75,6 +74,10 @@ use crate::{ disk_usage_eviction_task::EvictionCandidate, tenant::storage_layer::delta_layer::DeltaEntry, }; use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind}; +use crate::{ + pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind}, + virtual_file::MaybeFatalIo, +}; use crate::config::PageServerConf; use crate::keyspace::{KeyPartitioning, KeySpace}; @@ -3426,10 +3429,14 @@ impl Timeline { // The write_to_disk() above calls writer.finish() which already did the fsync of the inodes. // We just need to fsync the directory in which these inodes are linked, // which we know to be the timeline directory. + // + // We use fatal_err() below because after write_to_disk returns with success, + // the in-memory state of the filesystem already has the layer file in its final place, + // and subsequent pageserver code could think it's durable while it really isn't. 
par_fsync::par_fsync(&[self_clone .conf .timeline_path(&self_clone.tenant_shard_id, &self_clone.timeline_id)]) - .context("fsync of timeline dir")?; + .fatal_err("fsync of timeline dir"); anyhow::Ok(new_delta) } @@ -3662,11 +3669,14 @@ impl Timeline { // We just need to fsync the directory in which these inodes are linked, // which we know to be the timeline directory. if !image_layers.is_empty() { + // We use fatal_err() below because after writer.finish() returns with success, + // the in-memory state of the filesystem already has the layer file in its final place, + // and subsequent pageserver code could think it's durable while it really isn't. par_fsync::par_fsync_async(&[self .conf .timeline_path(&self.tenant_shard_id, &self.timeline_id)]) .await - .context("fsync of timeline dir")?; + .fatal_err("fsync of timeline dir"); } let mut guard = self.layers.write().await; @@ -4251,12 +4261,16 @@ impl Timeline { // The writer.finish() above already did the fsync of the inodes. // We just need to fsync the directory in which these inodes are linked, // which we know to be the timeline directory. + // + // We use fatal_err() below because after writer.finish() returns with success, + // the in-memory state of the filesystem already has the layer file in its final place, + // and subsequent pageserver code could think it's durable while it really isn't. let timeline_dir = self .conf .timeline_path(&self.tenant_shard_id, &self.timeline_id); par_fsync::par_fsync_async(&[timeline_dir]) .await - .context("fsync of timeline dir")?; + .fatal_err("fsync of timeline dir"); } stats.write_layer_files_micros = stats.read_lock_drop_micros.till_now();