From 3fd77eb0d46dba7de3bd51ada2a7c46f56fd6f72 Mon Sep 17 00:00:00 2001
From: Christian Schwarz
Date: Mon, 4 Mar 2024 12:33:42 +0100
Subject: [PATCH] layer file creation: remove redundant fsync()s (#6983)

The `writer.finish()` methods already fsync the inode, using
`VirtualFile::sync_all()`. All that the callers need to do is fsync
their directory, i.e., the timeline directory. (A minimal sketch of this
file-then-directory fsync ordering follows the diff below.)

Note that there's a call in the new compaction code that is apparently
dead at runtime, so I couldn't fix up any fsyncs there
([link](https://github.com/neondatabase/neon/blob/502b69b33bbd4ad1b0647e921a9c665249a2cd62/pageserver/src/tenant/timeline/compaction.rs#L204-L211)).

Note that layer durability still matters somewhat, even after #5198,
which made remote storage authoritative. We do have the layer file
length as an indicator, but no checksums on the layer file contents.
So a series of overwrites without fsyncs in the middle, plus a
subsequent crash, could leave us in a state where the file length
matches but the contents are garbage.

part of https://github.com/neondatabase/neon/issues/6663
---
 pageserver/src/tenant/timeline.rs | 63 ++++++-------------------------
 1 file changed, 11 insertions(+), 52 deletions(-)

diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 206f20306e5e..0c03ef33c344 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -10,7 +10,7 @@ mod walreceiver;
 
 use anyhow::{anyhow, bail, ensure, Context, Result};
 use bytes::Bytes;
-use camino::{Utf8Path, Utf8PathBuf};
+use camino::Utf8Path;
 use enumset::EnumSet;
 use fail::fail_point;
 use futures::stream::StreamExt;
@@ -3422,26 +3422,10 @@ impl Timeline {
                     let _g = span.entered();
                     let new_delta =
                         Handle::current().block_on(frozen_layer.write_to_disk(&self_clone, &ctx))?;
-                    let new_delta_path = new_delta.local_path().to_owned();
 
-                    // Sync it to disk.
-                    //
-                    // We must also fsync the timeline dir to ensure the directory entries for
-                    // new layer files are durable.
-                    //
-                    // NB: timeline dir must be synced _after_ the file contents are durable.
-                    // So, two separate fsyncs are required, they mustn't be batched.
-                    //
-                    // TODO: If we're running inside 'flush_frozen_layers' and there are multiple
-                    // files to flush, the fsync overhead can be reduces as follows:
-                    // 1. write them all to temporary file names
-                    // 2. fsync them
-                    // 3. rename to the final name
-                    // 4. fsync the parent directory.
-                    // Note that (1),(2),(3) today happen inside write_to_disk().
-                    //
-                    // FIXME: the writer already fsyncs all data, only rename needs to be fsynced here
-                    par_fsync::par_fsync(&[new_delta_path]).context("fsync of delta layer")?;
+                    // The write_to_disk() above calls writer.finish() which already did the fsync of the inodes.
+                    // We just need to fsync the directory in which these inodes are linked,
+                    // which we know to be the timeline directory.
                     par_fsync::par_fsync(&[self_clone
                         .conf
                         .timeline_path(&self_clone.tenant_shard_id, &self_clone.timeline_id)])
@@ -3674,25 +3658,10 @@ impl Timeline {
             }
         }
 
-        // Sync the new layer to disk before adding it to the layer map, to make sure
-        // we don't garbage collect something based on the new layer, before it has
-        // reached the disk.
-        //
-        // We must also fsync the timeline dir to ensure the directory entries for
-        // new layer files are durable
-        //
-        // Compaction creates multiple image layers. It would be better to create them all
-        // and fsync them all in parallel.
-        let all_paths = image_layers
-            .iter()
-            .map(|layer| layer.local_path().to_owned())
-            .collect::<Vec<_>>();
-
-        par_fsync::par_fsync_async(&all_paths)
-            .await
-            .context("fsync of newly created layer files")?;
-
-        if !all_paths.is_empty() {
+        // The writer.finish() above already did the fsync of the inodes.
+        // We just need to fsync the directory in which these inodes are linked,
+        // which we know to be the timeline directory.
+        if !image_layers.is_empty() {
             par_fsync::par_fsync_async(&[self
                 .conf
                 .timeline_path(&self.tenant_shard_id, &self.timeline_id)])
@@ -4279,22 +4248,12 @@ impl Timeline {
             }
         }
 
-        // FIXME: the writer already fsyncs all data, only rename needs to be fsynced here
-        let layer_paths: Vec<Utf8PathBuf> = new_layers
-            .iter()
-            .map(|l| l.local_path().to_owned())
-            .collect();
-
-        // Fsync all the layer files and directory using multiple threads to
-        // minimize latency.
-        par_fsync::par_fsync_async(&layer_paths)
-            .await
-            .context("fsync all new layers")?;
-
+        // The writer.finish() above already did the fsync of the inodes.
+        // We just need to fsync the directory in which these inodes are linked,
+        // which we know to be the timeline directory.
         let timeline_dir = self
             .conf
             .timeline_path(&self.tenant_shard_id, &self.timeline_id);
-
         par_fsync::par_fsync_async(&[timeline_dir])
            .await
            .context("fsync of timeline dir")?;
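
The durability argument above is the classic "fsync the file, then fsync its parent directory" ordering. The following standalone sketch illustrates that ordering with plain `std::fs` rather than the pageserver's `VirtualFile`/`par_fsync` helpers; the `create_durable` helper and the paths are made up for illustration and are not part of this patch.

```rust
use std::fs::{self, File};
use std::io::Write;
use std::path::Path;

/// Durably create `path` with `contents`.
///
/// Ordering matters: the file's contents and metadata must be durable
/// before the directory entry is made durable, so the two fsyncs cannot
/// be collapsed into one.
fn create_durable(path: &Path, contents: &[u8]) -> std::io::Result<()> {
    // 1. Write the file and fsync it. In the pageserver, this half is what
    //    writer.finish() / VirtualFile::sync_all() already does.
    let mut file = File::create(path)?;
    file.write_all(contents)?;
    file.sync_all()?; // fsync(2) on the file's inode (data + metadata)

    // 2. Fsync the parent directory so the new directory entry survives a
    //    crash. This is the only fsync the callers still need after this
    //    patch (done there via par_fsync on the timeline directory).
    let dir = path.parent().expect("path has a parent directory");
    // On Linux, a directory can be opened read-only and fsynced.
    File::open(dir)?.sync_all()?;

    Ok(())
}

fn main() -> std::io::Result<()> {
    // Hypothetical stand-ins for a timeline directory and a layer file.
    let timeline_dir = Path::new("/tmp/example-timeline");
    fs::create_dir_all(timeline_dir)?;
    create_durable(&timeline_dir.join("layer-file"), b"layer bytes")?;
    Ok(())
}
```

As the removed NB comment in the diff says, the directory must be synced after the file contents are durable: if the directory entry were synced first and the machine crashed before the file data reached disk, the entry could point at a file whose contents never made it, which is the kind of length-matches-but-contents-garbage state the commit message warns about.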