From f3a3eefd26284776ab3179116374009ec537ab11 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Thu, 17 Oct 2024 10:29:53 -0400 Subject: [PATCH] feat(pageserver): do space check before gc-compaction (#9250) part of https://github.com/neondatabase/neon/issues/9114 ## Summary of changes gc-compaction may take a lot of disk space, and if it does, the caller should do a partial gc-compaction. This patch adds space check for the compaction job. --------- Signed-off-by: Alex Chi Z --- pageserver/src/disk_usage_eviction_task.rs | 11 +---- pageserver/src/statvfs.rs | 16 ++++++++ pageserver/src/tenant/storage_layer/layer.rs | 4 ++ pageserver/src/tenant/timeline/compaction.rs | 42 ++++++++++++++++++++ 4 files changed, 63 insertions(+), 10 deletions(-) diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index a58fa2c0b1e1..7ab2ba87420f 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -1218,16 +1218,7 @@ mod filesystem_level_usage { let stat = Statvfs::get(tenants_dir, mock_config) .context("statvfs failed, presumably directory got unlinked")?; - // https://unix.stackexchange.com/a/703650 - let blocksize = if stat.fragment_size() > 0 { - stat.fragment_size() - } else { - stat.block_size() - }; - - // use blocks_available (b_avail) since, pageserver runs as unprivileged user - let avail_bytes = stat.blocks_available() * blocksize; - let total_bytes = stat.blocks() * blocksize; + let (avail_bytes, total_bytes) = stat.get_avail_total_bytes(); Ok(Usage { config, diff --git a/pageserver/src/statvfs.rs b/pageserver/src/statvfs.rs index 5a6f6e5176ac..205605bc86a8 100644 --- a/pageserver/src/statvfs.rs +++ b/pageserver/src/statvfs.rs @@ -53,6 +53,22 @@ impl Statvfs { Statvfs::Mock(stat) => stat.block_size, } } + + /// Get the available and total bytes on the filesystem. + pub fn get_avail_total_bytes(&self) -> (u64, u64) { + // https://unix.stackexchange.com/a/703650 + let blocksize = if self.fragment_size() > 0 { + self.fragment_size() + } else { + self.block_size() + }; + + // use blocks_available (b_avail) since, pageserver runs as unprivileged user + let avail_bytes = self.blocks_available() * blocksize; + let total_bytes = self.blocks() * blocksize; + + (avail_bytes, total_bytes) + } } pub mod mock { diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index bbb21b180ed5..f29a33bae6de 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -341,6 +341,10 @@ impl Layer { Ok(()) } + pub(crate) async fn needs_download(&self) -> Result, std::io::Error> { + self.0.needs_download().await + } + /// Assuming the layer is already downloaded, returns a guard which will prohibit eviction /// while the guard exists. /// diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 8b9ace1e5bbf..5588363330d0 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -29,6 +29,7 @@ use utils::id::TimelineId; use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; use crate::page_cache; +use crate::statvfs::Statvfs; use crate::tenant::checks::check_valid_layermap; use crate::tenant::remote_timeline_client::WaitCompletionError; use crate::tenant::storage_layer::filter_iterator::FilterIterator; @@ -1691,6 +1692,45 @@ impl Timeline { unreachable!("key retention is empty") } + /// Check how much space is left on the disk + async fn check_available_space(self: &Arc) -> anyhow::Result { + let tenants_dir = self.conf.tenants_path(); + + let stat = Statvfs::get(&tenants_dir, None) + .context("statvfs failed, presumably directory got unlinked")?; + + let (avail_bytes, _) = stat.get_avail_total_bytes(); + + Ok(avail_bytes) + } + + /// Check if the compaction can proceed safely without running out of space. We assume the size + /// upper bound of the produced files of a compaction job is the same as all layers involved in + /// the compaction. Therefore, we need `2 * layers_to_be_compacted_size` at least to do a + /// compaction. + async fn check_compaction_space( + self: &Arc, + layer_selection: &[Layer], + ) -> anyhow::Result<()> { + let available_space = self.check_available_space().await?; + let mut remote_layer_size = 0; + let mut all_layer_size = 0; + for layer in layer_selection { + let needs_download = layer.needs_download().await?; + if needs_download.is_some() { + remote_layer_size += layer.layer_desc().file_size; + } + all_layer_size += layer.layer_desc().file_size; + } + let allocated_space = (available_space as f64 * 0.8) as u64; /* reserve 20% space for other tasks */ + if all_layer_size /* space needed for newly-generated file */ + remote_layer_size /* space for downloading layers */ > allocated_space + { + return Err(anyhow!("not enough space for compaction: available_space={}, allocated_space={}, all_layer_size={}, remote_layer_size={}, required_space={}", + available_space, allocated_space, all_layer_size, remote_layer_size, all_layer_size + remote_layer_size)); + } + Ok(()) + } + /// An experimental compaction building block that combines compaction with garbage collection. /// /// The current implementation picks all delta + image layers that are below or intersecting with @@ -1806,6 +1846,8 @@ impl Timeline { lowest_retain_lsn ); + self.check_compaction_space(&layer_selection).await?; + // Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs. // Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point. let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?)