diff --git a/pageserver/src/tenant/checks.rs b/pageserver/src/tenant/checks.rs index 1e8fa8d1d64e..f98356242e1c 100644 --- a/pageserver/src/tenant/checks.rs +++ b/pageserver/src/tenant/checks.rs @@ -1,12 +1,15 @@ use std::collections::BTreeSet; use itertools::Itertools; +use pageserver_compaction::helpers::overlaps_with; use super::storage_layer::LayerName; /// Checks whether a layer map is valid (i.e., is a valid result of the current compaction algorithm if nothing goes wrong). /// -/// The function checks if we can split the LSN range of a delta layer only at the LSNs of the delta layers. For example, +/// The function implements a fast path check and a slow path check. +/// +/// The fast path checks if we can split the LSN range of a delta layer only at the LSNs of the delta layers. For example, /// /// ```plain /// | | | | @@ -25,31 +28,47 @@ use super::storage_layer::LayerName; /// | | | 4 | | | /// /// If layer 2 and 4 contain the same single key, this is also a valid layer map. +/// +/// However, if a partial compaction is still going on, it is possible that we get a layer map not satisfying the above condition. +/// Therefore, we fallback to simply check if any of the two delta layers overlap. (See "A slow path...") pub fn check_valid_layermap(metadata: &[LayerName]) -> Option { let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?) let mut all_delta_layers = Vec::new(); for name in metadata { if let LayerName::Delta(layer) = name { - if layer.key_range.start.next() != layer.key_range.end { - all_delta_layers.push(layer.clone()); - } + all_delta_layers.push(layer.clone()); } } for layer in &all_delta_layers { - let lsn_range = &layer.lsn_range; - lsn_split_point.insert(lsn_range.start); - lsn_split_point.insert(lsn_range.end); + if layer.key_range.start.next() != layer.key_range.end { + let lsn_range = &layer.lsn_range; + lsn_split_point.insert(lsn_range.start); + lsn_split_point.insert(lsn_range.end); + } } - for layer in &all_delta_layers { + for (idx, layer) in all_delta_layers.iter().enumerate() { + if layer.key_range.start.next() == layer.key_range.end { + continue; + } let lsn_range = layer.lsn_range.clone(); let intersects = lsn_split_point.range(lsn_range).collect_vec(); if intersects.len() > 1 { - let err = format!( - "layer violates the layer map LSN split assumption: layer {} intersects with LSN [{}]", - layer, - intersects.into_iter().map(|lsn| lsn.to_string()).join(", ") - ); - return Some(err); + // A slow path to check if the layer intersects with any other delta layer. + for (other_idx, other_layer) in all_delta_layers.iter().enumerate() { + if other_idx == idx { + // do not check self intersects with self + continue; + } + if overlaps_with(&layer.lsn_range, &other_layer.lsn_range) + && overlaps_with(&layer.key_range, &other_layer.key_range) + { + let err = format!( + "layer violates the layer map LSN split assumption: layer {} intersects with layer {}", + layer, other_layer + ); + return Some(err); + } + } } } None diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index a4e8f3952265..01f2a5b170d2 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -29,6 +29,7 @@ use utils::id::TimelineId; use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; use crate::page_cache; use crate::statvfs::Statvfs; +use crate::tenant::checks::check_valid_layermap; use crate::tenant::remote_timeline_client::WaitCompletionError; use crate::tenant::storage_layer::batch_split_writer::{ BatchWriterResult, SplitDeltaLayerWriter, SplitImageLayerWriter, @@ -2156,15 +2157,14 @@ impl Timeline { // Step 1: construct a k-merge iterator over all layers. // Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point. - // disable the check for now because we need to adjust the check for partial compactions, will enable later. - // let layer_names = job_desc - // .selected_layers - // .iter() - // .map(|layer| layer.layer_desc().layer_name()) - // .collect_vec(); - // if let Some(err) = check_valid_layermap(&layer_names) { - // warn!("gc-compaction layer map check failed because {}, this is normal if partial compaction is not finished yet", err); - // } + let layer_names = job_desc + .selected_layers + .iter() + .map(|layer| layer.layer_desc().layer_name()) + .collect_vec(); + if let Some(err) = check_valid_layermap(&layer_names) { + bail!("gc-compaction layer map check failed because {}, cannot proceed with compaction due to potential data loss", err); + } // The maximum LSN we are processing in this compaction loop let end_lsn = job_desc .selected_layers @@ -2546,8 +2546,36 @@ impl Timeline { ); // Step 3: Place back to the layer map. + + // First, do a sanity check to ensure the newly-created layer map does not contain overlaps. + let all_layers = { + let guard = self.layers.read().await; + let layer_map = guard.layer_map()?; + layer_map.iter_historic_layers().collect_vec() + }; + + let mut final_layers = all_layers + .iter() + .map(|layer| layer.layer_name()) + .collect::>(); + for layer in &layer_selection { + final_layers.remove(&layer.layer_desc().layer_name()); + } + for layer in &compact_to { + final_layers.insert(layer.layer_desc().layer_name()); + } + let final_layers = final_layers.into_iter().collect_vec(); + + // TODO: move this check before we call `finish` on image layer writers. However, this will require us to get the layer name before we finish + // the writer, so potentially, we will need a function like `ImageLayerBatchWriter::get_all_pending_layer_keys` to get all the keys that are + // in the writer before finalizing the persistent layers. Now we would leave some dangling layers on the disk if the check fails. + if let Some(err) = check_valid_layermap(&final_layers) { + bail!("gc-compaction layer map check failed after compaction because {}, compaction result not applied to the layer map due to potential data loss", err); + } + + // Between the sanity check and this compaction update, there could be new layers being flushed, but it should be fine because we only + // operate on L1 layers. { - // TODO: sanity check if the layer map is valid (i.e., should not have overlaps) let mut guard = self.layers.write().await; guard .open_mut()?