diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 105c8a50d3d2..24474d48405e 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -104,8 +104,7 @@ pub struct ConfigToml { pub image_compression: ImageCompressionAlgorithm, pub ephemeral_bytes_per_memory_kb: usize, pub l0_flush: Option, - pub virtual_file_direct_io: crate::models::virtual_file::DirectIoMode, - pub io_buffer_alignment: usize, + pub virtual_file_io_mode: Option, } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -388,10 +387,7 @@ impl Default for ConfigToml { image_compression: (DEFAULT_IMAGE_COMPRESSION), ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), l0_flush: None, - virtual_file_direct_io: crate::models::virtual_file::DirectIoMode::default(), - - io_buffer_alignment: DEFAULT_IO_BUFFER_ALIGNMENT, - + virtual_file_io_mode: None, tenant_config: TenantConfigToml::default(), } } diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 45abda0ad85d..3ec9cac2c321 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -972,8 +972,6 @@ pub struct TopTenantShardsResponse { } pub mod virtual_file { - use std::path::PathBuf; - #[derive( Copy, Clone, @@ -994,50 +992,45 @@ pub mod virtual_file { } /// Direct IO modes for a pageserver. - #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)] - #[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)] - pub enum DirectIoMode { - /// Direct IO disabled (uses usual buffered IO). - #[default] - Disabled, - /// Direct IO disabled (performs checks and perf simulations). - Evaluate { - /// Alignment check level - alignment_check: DirectIoAlignmentCheckLevel, - /// Latency padded for performance simulation. - latency_padding: DirectIoLatencyPadding, - }, - /// Direct IO enabled. - Enabled { - /// Actions to perform on alignment error. - on_alignment_error: DirectIoOnAlignmentErrorAction, - }, + #[derive( + Copy, + Clone, + PartialEq, + Eq, + Hash, + strum_macros::EnumString, + strum_macros::Display, + serde_with::DeserializeFromStr, + serde_with::SerializeDisplay, + Debug, + )] + #[strum(serialize_all = "kebab-case")] + #[repr(u8)] + pub enum IoMode { + /// Uses buffered IO. + Buffered, + /// Uses direct IO, error out if the operation fails. + #[cfg(target_os = "linux")] + Direct, } - #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)] - #[serde(rename_all = "kebab-case")] - pub enum DirectIoAlignmentCheckLevel { - #[default] - Error, - Log, - None, + impl IoMode { + pub const fn preferred() -> Self { + Self::Buffered + } } - #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)] - #[serde(rename_all = "kebab-case")] - pub enum DirectIoOnAlignmentErrorAction { - Error, - #[default] - FallbackToBuffered, - } + impl TryFrom for IoMode { + type Error = u8; - #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)] - #[serde(tag = "type", rename_all = "kebab-case")] - pub enum DirectIoLatencyPadding { - /// Pad virtual file operations with IO to a fake file. - FakeFileRW { path: PathBuf }, - #[default] - None, + fn try_from(value: u8) -> Result { + Ok(match value { + v if v == (IoMode::Buffered as u8) => IoMode::Buffered, + #[cfg(target_os = "linux")] + v if v == (IoMode::Direct as u8) => IoMode::Direct, + x => return Err(x), + }) + } } } diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs index 72cbb6beabe0..821c8008a950 100644 --- a/pageserver/benches/bench_ingest.rs +++ b/pageserver/benches/bench_ingest.rs @@ -164,11 +164,7 @@ fn criterion_benchmark(c: &mut Criterion) { let conf: &'static PageServerConf = Box::leak(Box::new( pageserver::config::PageServerConf::dummy_conf(temp_dir.path().to_path_buf()), )); - virtual_file::init( - 16384, - virtual_file::io_engine_for_bench(), - pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT, - ); + virtual_file::init(16384, virtual_file::io_engine_for_bench()); page_cache::init(conf.page_cache_size); { diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 592f1ded0d0b..4d76c66905c4 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -540,10 +540,13 @@ impl Client { .map_err(Error::ReceiveBody) } - /// Configs io buffer alignment at runtime. - pub async fn put_io_alignment(&self, align: usize) -> Result<()> { - let uri = format!("{}/v1/io_alignment", self.mgmt_api_endpoint); - self.request(Method::PUT, uri, align) + /// Configs io mode at runtime. + pub async fn put_io_mode( + &self, + mode: &pageserver_api::models::virtual_file::IoMode, + ) -> Result<()> { + let uri = format!("{}/v1/io_mode", self.mgmt_api_endpoint); + self.request(Method::PUT, uri, mode) .await? .json() .await diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs index adc090823d84..151b94cf62d3 100644 --- a/pageserver/ctl/src/layer_map_analyzer.rs +++ b/pageserver/ctl/src/layer_map_analyzer.rs @@ -152,11 +152,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> { let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); // Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree. - pageserver::virtual_file::init( - 10, - virtual_file::api::IoEngineKind::StdFs, - pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT, - ); + pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs); pageserver::page_cache::init(100); let mut total_delta_layers = 0usize; diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs index dd753398e2fa..fd948bf2efed 100644 --- a/pageserver/ctl/src/layers.rs +++ b/pageserver/ctl/src/layers.rs @@ -59,7 +59,7 @@ pub(crate) enum LayerCmd { async fn read_delta_file(path: impl AsRef, ctx: &RequestContext) -> Result<()> { let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path"); - virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs, 1); + virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs); page_cache::init(100); let file = VirtualFile::open(path, ctx).await?; let file_id = page_cache::next_file_id(); @@ -190,11 +190,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> { new_tenant_id, new_timeline_id, } => { - pageserver::virtual_file::init( - 10, - virtual_file::api::IoEngineKind::StdFs, - pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT, - ); + pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs); pageserver::page_cache::init(100); let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index cf001ef0d5d4..c96664d346cb 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -26,7 +26,7 @@ use pageserver::{ tenant::{dump_layerfile_from_path, metadata::TimelineMetadata}, virtual_file, }; -use pageserver_api::{config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT, shard::TenantShardId}; +use pageserver_api::shard::TenantShardId; use postgres_ffi::ControlFileData; use remote_storage::{RemotePath, RemoteStorageConfig}; use tokio_util::sync::CancellationToken; @@ -205,11 +205,7 @@ fn read_pg_control_file(control_file_path: &Utf8Path) -> anyhow::Result<()> { async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> { // Basic initialization of things that don't change after startup - virtual_file::init( - 10, - virtual_file::api::IoEngineKind::StdFs, - DEFAULT_IO_BUFFER_ALIGNMENT, - ); + virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs); page_cache::init(100); let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); dump_layerfile_from_path(path, true, &ctx).await diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index ac4a732377b1..b2df01714d31 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -59,9 +59,9 @@ pub(crate) struct Args { #[clap(long)] set_io_engine: Option, - /// Before starting the benchmark, live-reconfigure the pageserver to use specified alignment for io buffers. + /// Before starting the benchmark, live-reconfigure the pageserver to use specified io mode (buffered vs. direct). #[clap(long)] - set_io_alignment: Option, + set_io_mode: Option, targets: Option>, } @@ -129,8 +129,8 @@ async fn main_impl( mgmt_api_client.put_io_engine(engine_str).await?; } - if let Some(align) = args.set_io_alignment { - mgmt_api_client.put_io_alignment(align).await?; + if let Some(mode) = &args.set_io_mode { + mgmt_api_client.put_io_mode(mode).await?; } // discover targets diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 593ca6db2d5a..f71a3d26531c 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -125,8 +125,7 @@ fn main() -> anyhow::Result<()> { // after setting up logging, log the effective IO engine choice and read path implementations info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine"); - info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings"); - info!(?conf.io_buffer_alignment, "starting with setting for IO buffer alignment"); + info!(?conf.virtual_file_io_mode, "starting with virtual_file IO mode"); // The tenants directory contains all the pageserver local disk state. // Create if not exists and make sure all the contents are durable before proceeding. @@ -168,11 +167,7 @@ fn main() -> anyhow::Result<()> { let scenario = failpoint_support::init(); // Basic initialization of things that don't change after startup - virtual_file::init( - conf.max_file_descriptors, - conf.virtual_file_io_engine, - conf.io_buffer_alignment, - ); + virtual_file::init(conf.max_file_descriptors, conf.virtual_file_io_engine); page_cache::init(conf.page_cache_size); start_pageserver(launch_ts, conf).context("Failed to start pageserver")?; diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index e15f1c791b78..8db78285e476 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -174,9 +174,7 @@ pub struct PageServerConf { pub l0_flush: crate::l0_flush::L0FlushConfig, /// Direct IO settings - pub virtual_file_direct_io: virtual_file::DirectIoMode, - - pub io_buffer_alignment: usize, + pub virtual_file_io_mode: virtual_file::IoMode, } /// Token for authentication to safekeepers @@ -325,11 +323,10 @@ impl PageServerConf { image_compression, ephemeral_bytes_per_memory_kb, l0_flush, - virtual_file_direct_io, + virtual_file_io_mode, concurrent_tenant_warmup, concurrent_tenant_size_logical_size_queries, virtual_file_io_engine, - io_buffer_alignment, tenant_config, } = config_toml; @@ -368,8 +365,6 @@ impl PageServerConf { max_vectored_read_bytes, image_compression, ephemeral_bytes_per_memory_kb, - virtual_file_direct_io, - io_buffer_alignment, // ------------------------------------------------------------ // fields that require additional validation or custom handling @@ -408,6 +403,7 @@ impl PageServerConf { l0_flush: l0_flush .map(crate::l0_flush::L0FlushConfig::from) .unwrap_or_default(), + virtual_file_io_mode: virtual_file_io_mode.unwrap_or(virtual_file::IoMode::preferred()), }; // ------------------------------------------------------------ diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 5fc8272074b7..2985ab1efb68 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -17,6 +17,7 @@ use hyper::header; use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; use metrics::launch_timestamp::LaunchTimestamp; +use pageserver_api::models::virtual_file::IoMode; use pageserver_api::models::AuxFilePolicy; use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest; use pageserver_api::models::IngestAuxFilesRequest; @@ -2381,17 +2382,13 @@ async fn put_io_engine_handler( json_response(StatusCode::OK, ()) } -async fn put_io_alignment_handler( +async fn put_io_mode_handler( mut r: Request, _cancel: CancellationToken, ) -> Result, ApiError> { check_permission(&r, None)?; - let align: usize = json_request(&mut r).await?; - crate::virtual_file::set_io_buffer_alignment(align).map_err(|align| { - ApiError::PreconditionFailed( - format!("Requested io alignment ({align}) is not a power of two").into(), - ) - })?; + let mode: IoMode = json_request(&mut r).await?; + crate::virtual_file::set_io_mode(mode); json_response(StatusCode::OK, ()) } @@ -3082,9 +3079,7 @@ pub fn make_router( |r| api_handler(r, timeline_collect_keyspace), ) .put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler)) - .put("/v1/io_alignment", |r| { - api_handler(r, put_io_alignment_handler) - }) + .put("/v1/io_mode", |r| api_handler(r, put_io_mode_handler)) .put( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/force_aux_policy_switch", |r| api_handler(r, force_aux_policy_switch_handler), diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 5324e1807d9c..a62a47f9a760 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -84,7 +84,7 @@ impl Drop for EphemeralFile { fn drop(&mut self) { // unlink the file // we are clear to do this, because we have entered a gate - let path = &self.buffered_writer.as_inner().as_inner().path; + let path = self.buffered_writer.as_inner().as_inner().path(); let res = std::fs::remove_file(path); if let Err(e) = res { if e.kind() != std::io::ErrorKind::NotFound { @@ -356,7 +356,7 @@ mod tests { } let file_contents = - std::fs::read(&file.buffered_writer.as_inner().as_inner().path).unwrap(); + std::fs::read(file.buffered_writer.as_inner().as_inner().path()).unwrap(); assert_eq!(file_contents, &content[0..cap]); let buffer_contents = file.buffered_writer.inspect_buffer(); @@ -392,7 +392,7 @@ mod tests { .buffered_writer .as_inner() .as_inner() - .path + .path() .metadata() .unwrap(); assert_eq!( diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 2acad666b8d0..8be7d7876f82 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -573,7 +573,7 @@ impl DeltaLayerWriterInner { ensure!( metadata.len() <= S3_UPLOAD_LIMIT, "Created delta layer file at {} of size {} above limit {S3_UPLOAD_LIMIT}!", - file.path, + file.path(), metadata.len() ); @@ -791,7 +791,7 @@ impl DeltaLayerInner { max_vectored_read_bytes: Option, ctx: &RequestContext, ) -> anyhow::Result { - let file = VirtualFile::open(path, ctx) + let file = VirtualFile::open_v2(path, ctx) .await .context("open layer file")?; @@ -1022,7 +1022,7 @@ impl DeltaLayerInner { blob_meta.key, PageReconstructError::Other(anyhow!( "Failed to read blobs from virtual file {}: {}", - self.file.path, + self.file.path(), kind )), ); @@ -1048,7 +1048,7 @@ impl DeltaLayerInner { meta.meta.key, PageReconstructError::Other(anyhow!(e).context(format!( "Failed to decompress blob from virtual file {}", - self.file.path, + self.file.path(), ))), ); @@ -1066,7 +1066,7 @@ impl DeltaLayerInner { meta.meta.key, PageReconstructError::Other(anyhow!(e).context(format!( "Failed to deserialize blob from virtual file {}", - self.file.path, + self.file.path(), ))), ); @@ -1198,7 +1198,6 @@ impl DeltaLayerInner { let mut prev: Option<(Key, Lsn, BlobRef)> = None; let mut read_builder: Option = None; - let align = virtual_file::get_io_buffer_alignment(); let max_read_size = self .max_vectored_read_bytes @@ -1247,7 +1246,6 @@ impl DeltaLayerInner { offsets.end.pos(), meta, max_read_size, - align, )) } } else { diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 9b53fa9e18cd..de8155f455d1 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -389,7 +389,7 @@ impl ImageLayerInner { max_vectored_read_bytes: Option, ctx: &RequestContext, ) -> anyhow::Result { - let file = VirtualFile::open(path, ctx) + let file = VirtualFile::open_v2(path, ctx) .await .context("open layer file")?; let file_id = page_cache::next_file_id(); @@ -626,7 +626,7 @@ impl ImageLayerInner { meta.meta.key, PageReconstructError::Other(anyhow!(e).context(format!( "Failed to decompress blob from virtual file {}", - self.file.path, + self.file.path(), ))), ); @@ -647,7 +647,7 @@ impl ImageLayerInner { blob_meta.key, PageReconstructError::from(anyhow!( "Failed to read blobs from virtual file {}: {}", - self.file.path, + self.file.path(), kind )), ); diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index 1faa6bab99e0..792c769b4fc0 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -194,8 +194,6 @@ pub(crate) struct ChunkedVectoredReadBuilder { /// Start offset and metadata for each blob in this read blobs_at: VecMap, max_read_size: Option, - /// Chunk size reads are coalesced into. - chunk_size: usize, } /// Computes x / d rounded up. @@ -204,6 +202,7 @@ fn div_round_up(x: usize, d: usize) -> usize { } impl ChunkedVectoredReadBuilder { + const CHUNK_SIZE: usize = virtual_file::get_io_buffer_alignment(); /// Start building a new vectored read. /// /// Note that by design, this does not check against reading more than `max_read_size` to @@ -214,21 +213,19 @@ impl ChunkedVectoredReadBuilder { end_offset: u64, meta: BlobMeta, max_read_size: Option, - chunk_size: usize, ) -> Self { let mut blobs_at = VecMap::default(); blobs_at .append(start_offset, meta) .expect("First insertion always succeeds"); - let start_blk_no = start_offset as usize / chunk_size; - let end_blk_no = div_round_up(end_offset as usize, chunk_size); + let start_blk_no = start_offset as usize / Self::CHUNK_SIZE; + let end_blk_no = div_round_up(end_offset as usize, Self::CHUNK_SIZE); Self { start_blk_no, end_blk_no, blobs_at, max_read_size, - chunk_size, } } @@ -237,18 +234,12 @@ impl ChunkedVectoredReadBuilder { end_offset: u64, meta: BlobMeta, max_read_size: usize, - align: usize, ) -> Self { - Self::new_impl(start_offset, end_offset, meta, Some(max_read_size), align) + Self::new_impl(start_offset, end_offset, meta, Some(max_read_size)) } - pub(crate) fn new_streaming( - start_offset: u64, - end_offset: u64, - meta: BlobMeta, - align: usize, - ) -> Self { - Self::new_impl(start_offset, end_offset, meta, None, align) + pub(crate) fn new_streaming(start_offset: u64, end_offset: u64, meta: BlobMeta) -> Self { + Self::new_impl(start_offset, end_offset, meta, None) } /// Attempts to extend the current read with a new blob if the new blob resides in the same or the immediate next chunk. @@ -256,12 +247,12 @@ impl ChunkedVectoredReadBuilder { /// The resulting size also must be below the max read size. pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended { tracing::trace!(start, end, "trying to extend"); - let start_blk_no = start as usize / self.chunk_size; - let end_blk_no = div_round_up(end as usize, self.chunk_size); + let start_blk_no = start as usize / Self::CHUNK_SIZE; + let end_blk_no = div_round_up(end as usize, Self::CHUNK_SIZE); let not_limited_by_max_read_size = { if let Some(max_read_size) = self.max_read_size { - let coalesced_size = (end_blk_no - self.start_blk_no) * self.chunk_size; + let coalesced_size = (end_blk_no - self.start_blk_no) * Self::CHUNK_SIZE; coalesced_size <= max_read_size } else { true @@ -292,12 +283,12 @@ impl ChunkedVectoredReadBuilder { } pub(crate) fn size(&self) -> usize { - (self.end_blk_no - self.start_blk_no) * self.chunk_size + (self.end_blk_no - self.start_blk_no) * Self::CHUNK_SIZE } pub(crate) fn build(self) -> VectoredRead { - let start = (self.start_blk_no * self.chunk_size) as u64; - let end = (self.end_blk_no * self.chunk_size) as u64; + let start = (self.start_blk_no * Self::CHUNK_SIZE) as u64; + let end = (self.end_blk_no * Self::CHUNK_SIZE) as u64; VectoredRead { start, end, @@ -328,18 +319,14 @@ pub struct VectoredReadPlanner { prev: Option<(Key, Lsn, u64, BlobFlag)>, max_read_size: usize, - - align: usize, } impl VectoredReadPlanner { pub fn new(max_read_size: usize) -> Self { - let align = virtual_file::get_io_buffer_alignment(); Self { blobs: BTreeMap::new(), prev: None, max_read_size, - align, } } @@ -418,7 +405,6 @@ impl VectoredReadPlanner { end_offset, BlobMeta { key, lsn }, self.max_read_size, - self.align, ); let prev_read_builder = current_read_builder.replace(next_read_builder); @@ -472,13 +458,13 @@ impl<'a> VectoredBlobReader<'a> { ); if cfg!(debug_assertions) { - let align = virtual_file::get_io_buffer_alignment() as u64; + const ALIGN: u64 = virtual_file::get_io_buffer_alignment() as u64; debug_assert_eq!( - read.start % align, + read.start % ALIGN, 0, "Read start at {} does not satisfy the required io buffer alignment ({} bytes)", read.start, - align + ALIGN ); } @@ -553,22 +539,18 @@ pub struct StreamingVectoredReadPlanner { max_cnt: usize, /// Size of the current batch cnt: usize, - - align: usize, } impl StreamingVectoredReadPlanner { pub fn new(max_read_size: u64, max_cnt: usize) -> Self { assert!(max_cnt > 0); assert!(max_read_size > 0); - let align = virtual_file::get_io_buffer_alignment(); Self { read_builder: None, prev: None, max_cnt, max_read_size, cnt: 0, - align, } } @@ -621,7 +603,6 @@ impl StreamingVectoredReadPlanner { start_offset, end_offset, BlobMeta { key, lsn }, - self.align, )) }; } @@ -656,9 +637,9 @@ mod tests { use super::*; fn validate_read(read: &VectoredRead, offset_range: &[(Key, Lsn, u64, BlobFlag)]) { - let align = virtual_file::get_io_buffer_alignment() as u64; - assert_eq!(read.start % align, 0); - assert_eq!(read.start / align, offset_range.first().unwrap().2 / align); + const ALIGN: u64 = virtual_file::get_io_buffer_alignment() as u64; + assert_eq!(read.start % ALIGN, 0); + assert_eq!(read.start / ALIGN, offset_range.first().unwrap().2 / ALIGN); let expected_offsets_in_read: Vec<_> = offset_range.iter().map(|o| o.2).collect(); @@ -676,32 +657,27 @@ mod tests { fn planner_chunked_coalesce_all_test() { use crate::virtual_file; - let chunk_size = virtual_file::get_io_buffer_alignment() as u64; - - // The test explicitly does not check chunk size < 512 - if chunk_size < 512 { - return; - } + const CHUNK_SIZE: u64 = virtual_file::get_io_buffer_alignment() as u64; - let max_read_size = chunk_size as usize * 8; + let max_read_size = CHUNK_SIZE as usize * 8; let key = Key::MIN; let lsn = Lsn(0); let blob_descriptions = [ - (key, lsn, chunk_size / 8, BlobFlag::None), // Read 1 BEGIN - (key, lsn, chunk_size / 4, BlobFlag::Ignore), // Gap - (key, lsn, chunk_size / 2, BlobFlag::None), - (key, lsn, chunk_size - 2, BlobFlag::Ignore), // Gap - (key, lsn, chunk_size, BlobFlag::None), - (key, lsn, chunk_size * 2 - 1, BlobFlag::None), - (key, lsn, chunk_size * 2 + 1, BlobFlag::Ignore), // Gap - (key, lsn, chunk_size * 3 + 1, BlobFlag::None), - (key, lsn, chunk_size * 5 + 1, BlobFlag::None), - (key, lsn, chunk_size * 6 + 1, BlobFlag::Ignore), // skipped chunk size, but not a chunk: should coalesce. - (key, lsn, chunk_size * 7 + 1, BlobFlag::None), - (key, lsn, chunk_size * 8, BlobFlag::None), // Read 2 BEGIN (b/c max_read_size) - (key, lsn, chunk_size * 9, BlobFlag::Ignore), // ==== skipped a chunk - (key, lsn, chunk_size * 10, BlobFlag::None), // Read 3 BEGIN (cannot coalesce) + (key, lsn, CHUNK_SIZE / 8, BlobFlag::None), // Read 1 BEGIN + (key, lsn, CHUNK_SIZE / 4, BlobFlag::Ignore), // Gap + (key, lsn, CHUNK_SIZE / 2, BlobFlag::None), + (key, lsn, CHUNK_SIZE - 2, BlobFlag::Ignore), // Gap + (key, lsn, CHUNK_SIZE, BlobFlag::None), + (key, lsn, CHUNK_SIZE * 2 - 1, BlobFlag::None), + (key, lsn, CHUNK_SIZE * 2 + 1, BlobFlag::Ignore), // Gap + (key, lsn, CHUNK_SIZE * 3 + 1, BlobFlag::None), + (key, lsn, CHUNK_SIZE * 5 + 1, BlobFlag::None), + (key, lsn, CHUNK_SIZE * 6 + 1, BlobFlag::Ignore), // skipped chunk size, but not a chunk: should coalesce. + (key, lsn, CHUNK_SIZE * 7 + 1, BlobFlag::None), + (key, lsn, CHUNK_SIZE * 8, BlobFlag::None), // Read 2 BEGIN (b/c max_read_size) + (key, lsn, CHUNK_SIZE * 9, BlobFlag::Ignore), // ==== skipped a chunk + (key, lsn, CHUNK_SIZE * 10, BlobFlag::None), // Read 3 BEGIN (cannot coalesce) ]; let ranges = [ @@ -780,19 +756,19 @@ mod tests { #[test] fn planner_replacement_test() { - let chunk_size = virtual_file::get_io_buffer_alignment() as u64; - let max_read_size = 128 * chunk_size as usize; + const CHUNK_SIZE: u64 = virtual_file::get_io_buffer_alignment() as u64; + let max_read_size = 128 * CHUNK_SIZE as usize; let first_key = Key::MIN; let second_key = first_key.next(); let lsn = Lsn(0); let blob_descriptions = vec![ (first_key, lsn, 0, BlobFlag::None), // First in read 1 - (first_key, lsn, chunk_size, BlobFlag::None), // Last in read 1 - (second_key, lsn, 2 * chunk_size, BlobFlag::ReplaceAll), - (second_key, lsn, 3 * chunk_size, BlobFlag::None), - (second_key, lsn, 4 * chunk_size, BlobFlag::ReplaceAll), // First in read 2 - (second_key, lsn, 5 * chunk_size, BlobFlag::None), // Last in read 2 + (first_key, lsn, CHUNK_SIZE, BlobFlag::None), // Last in read 1 + (second_key, lsn, 2 * CHUNK_SIZE, BlobFlag::ReplaceAll), + (second_key, lsn, 3 * CHUNK_SIZE, BlobFlag::None), + (second_key, lsn, 4 * CHUNK_SIZE, BlobFlag::ReplaceAll), // First in read 2 + (second_key, lsn, 5 * CHUNK_SIZE, BlobFlag::None), // Last in read 2 ]; let ranges = [&blob_descriptions[0..2], &blob_descriptions[4..]]; @@ -802,7 +778,7 @@ mod tests { planner.handle(key, lsn, offset, flag); } - planner.handle_range_end(6 * chunk_size); + planner.handle_range_end(6 * CHUNK_SIZE); let reads = planner.finish(); assert_eq!(reads.len(), 2); @@ -947,7 +923,6 @@ mod tests { let reserved_bytes = blobs.iter().map(|bl| bl.len()).max().unwrap() * 2 + 16; let mut buf = BytesMut::with_capacity(reserved_bytes); - let align = virtual_file::get_io_buffer_alignment(); let vectored_blob_reader = VectoredBlobReader::new(&file); let meta = BlobMeta { key: Key::MIN, @@ -959,8 +934,7 @@ mod tests { if idx + 1 == offsets.len() { continue; } - let read_builder = - ChunkedVectoredReadBuilder::new(*offset, *end, meta, 16 * 4096, align); + let read_builder = ChunkedVectoredReadBuilder::new(*offset, *end, meta, 16 * 4096); let read = read_builder.build(); let result = vectored_blob_reader.read_blobs(&read, buf, &ctx).await?; assert_eq!(result.blobs.len(), 1); diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 5b7b2798888f..d260116b3892 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -23,10 +23,12 @@ use pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT; use pageserver_api::shard::TenantShardId; use std::fs::File; use std::io::{Error, ErrorKind, Seek, SeekFrom}; +#[cfg(target_os = "linux")] +use std::os::unix::fs::OpenOptionsExt; use tokio_epoll_uring::{BoundedBuf, IoBuf, IoBufMut, Slice}; use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd}; -use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; +use std::sync::atomic::{AtomicBool, AtomicU8, AtomicUsize, Ordering}; use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}; use tokio::time::Instant; @@ -38,7 +40,7 @@ pub use io_engine::FeatureTestResult as IoEngineFeatureTestResult; mod metadata; mod open_options; use self::owned_buffers_io::write::OwnedAsyncWriter; -pub(crate) use api::DirectIoMode; +pub(crate) use api::IoMode; pub(crate) use io_engine::IoEngineKind; pub(crate) use metadata::Metadata; pub(crate) use open_options::*; @@ -61,6 +63,171 @@ pub(crate) mod owned_buffers_io { } } +#[derive(Debug)] +pub struct VirtualFile { + inner: VirtualFileInner, + _mode: IoMode, +} + +impl VirtualFile { + /// Open a file in read-only mode. Like File::open. + pub async fn open>( + path: P, + ctx: &RequestContext, + ) -> Result { + let inner = VirtualFileInner::open(path, ctx).await?; + Ok(VirtualFile { + inner, + _mode: IoMode::Buffered, + }) + } + + /// Open a file in read-only mode. Like File::open. + /// + /// `O_DIRECT` will be enabled base on `virtual_file_io_mode`. + pub async fn open_v2>( + path: P, + ctx: &RequestContext, + ) -> Result { + Self::open_with_options_v2(path.as_ref(), OpenOptions::new().read(true), ctx).await + } + + pub async fn create>( + path: P, + ctx: &RequestContext, + ) -> Result { + let inner = VirtualFileInner::create(path, ctx).await?; + Ok(VirtualFile { + inner, + _mode: IoMode::Buffered, + }) + } + + pub async fn create_v2>( + path: P, + ctx: &RequestContext, + ) -> Result { + VirtualFile::open_with_options_v2( + path.as_ref(), + OpenOptions::new().write(true).create(true).truncate(true), + ctx, + ) + .await + } + + pub async fn open_with_options>( + path: P, + open_options: &OpenOptions, + ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */ + ) -> Result { + let inner = VirtualFileInner::open_with_options(path, open_options, ctx).await?; + Ok(VirtualFile { + inner, + _mode: IoMode::Buffered, + }) + } + + pub async fn open_with_options_v2>( + path: P, + open_options: &OpenOptions, + ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */ + ) -> Result { + let file = match get_io_mode() { + IoMode::Buffered => { + let inner = VirtualFileInner::open_with_options(path, open_options, ctx).await?; + VirtualFile { + inner, + _mode: IoMode::Buffered, + } + } + #[cfg(target_os = "linux")] + IoMode::Direct => { + let inner = VirtualFileInner::open_with_options( + path, + open_options.clone().custom_flags(nix::libc::O_DIRECT), + ctx, + ) + .await?; + VirtualFile { + inner, + _mode: IoMode::Direct, + } + } + }; + Ok(file) + } + + pub fn path(&self) -> &Utf8Path { + self.inner.path.as_path() + } + + pub async fn crashsafe_overwrite + Send, Buf: IoBuf + Send>( + final_path: Utf8PathBuf, + tmp_path: Utf8PathBuf, + content: B, + ) -> std::io::Result<()> { + VirtualFileInner::crashsafe_overwrite(final_path, tmp_path, content).await + } + + pub async fn sync_all(&self) -> Result<(), Error> { + self.inner.sync_all().await + } + + pub async fn sync_data(&self) -> Result<(), Error> { + self.inner.sync_data().await + } + + pub async fn metadata(&self) -> Result { + self.inner.metadata().await + } + + pub fn remove(self) { + self.inner.remove(); + } + + pub async fn seek(&mut self, pos: SeekFrom) -> Result { + self.inner.seek(pos).await + } + + pub async fn read_exact_at( + &self, + slice: Slice, + offset: u64, + ctx: &RequestContext, + ) -> Result, Error> + where + Buf: IoBufMut + Send, + { + self.inner.read_exact_at(slice, offset, ctx).await + } + + pub async fn read_exact_at_page( + &self, + page: PageWriteGuard<'static>, + offset: u64, + ctx: &RequestContext, + ) -> Result, Error> { + self.inner.read_exact_at_page(page, offset, ctx).await + } + + pub async fn write_all_at( + &self, + buf: FullSlice, + offset: u64, + ctx: &RequestContext, + ) -> (FullSlice, Result<(), Error>) { + self.inner.write_all_at(buf, offset, ctx).await + } + + pub async fn write_all( + &mut self, + buf: FullSlice, + ctx: &RequestContext, + ) -> (FullSlice, Result) { + self.inner.write_all(buf, ctx).await + } +} + /// /// A virtual file descriptor. You can use this just like std::fs::File, but internally /// the underlying file is closed if the system is low on file descriptors, @@ -77,7 +244,7 @@ pub(crate) mod owned_buffers_io { /// 'tag' field is used to detect whether the handle still is valid or not. /// #[derive(Debug)] -pub struct VirtualFile { +pub struct VirtualFileInner { /// Lazy handle to the global file descriptor cache. The slot that this points to /// might contain our File, or it may be empty, or it may contain a File that /// belongs to a different VirtualFile. @@ -350,12 +517,12 @@ macro_rules! with_file { }}; } -impl VirtualFile { +impl VirtualFileInner { /// Open a file in read-only mode. Like File::open. pub async fn open>( path: P, ctx: &RequestContext, - ) -> Result { + ) -> Result { Self::open_with_options(path.as_ref(), OpenOptions::new().read(true), ctx).await } @@ -364,7 +531,7 @@ impl VirtualFile { pub async fn create>( path: P, ctx: &RequestContext, - ) -> Result { + ) -> Result { Self::open_with_options( path.as_ref(), OpenOptions::new().write(true).create(true).truncate(true), @@ -382,7 +549,7 @@ impl VirtualFile { path: P, open_options: &OpenOptions, _ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */ - ) -> Result { + ) -> Result { let path_ref = path.as_ref(); let path_str = path_ref.to_string(); let parts = path_str.split('/').collect::>(); @@ -423,7 +590,7 @@ impl VirtualFile { reopen_options.create_new(false); reopen_options.truncate(false); - let vfile = VirtualFile { + let vfile = VirtualFileInner { handle: RwLock::new(handle), pos: 0, path: path_ref.to_path_buf(), @@ -1034,6 +1201,21 @@ impl tokio_epoll_uring::IoFd for FileGuard { #[cfg(test)] impl VirtualFile { + pub(crate) async fn read_blk( + &self, + blknum: u32, + ctx: &RequestContext, + ) -> Result, std::io::Error> { + self.inner.read_blk(blknum, ctx).await + } + + async fn read_to_end(&mut self, buf: &mut Vec, ctx: &RequestContext) -> Result<(), Error> { + self.inner.read_to_end(buf, ctx).await + } +} + +#[cfg(test)] +impl VirtualFileInner { pub(crate) async fn read_blk( &self, blknum: u32, @@ -1067,7 +1249,7 @@ impl VirtualFile { } } -impl Drop for VirtualFile { +impl Drop for VirtualFileInner { /// If a VirtualFile is dropped, close the underlying file if it was open. fn drop(&mut self) { let handle = self.handle.get_mut(); @@ -1143,15 +1325,10 @@ impl OpenFiles { /// server startup. /// #[cfg(not(test))] -pub fn init(num_slots: usize, engine: IoEngineKind, io_buffer_alignment: usize) { +pub fn init(num_slots: usize, engine: IoEngineKind) { if OPEN_FILES.set(OpenFiles::new(num_slots)).is_err() { panic!("virtual_file::init called twice"); } - if set_io_buffer_alignment(io_buffer_alignment).is_err() { - panic!( - "IO buffer alignment needs to be a power of two and greater than 512, got {io_buffer_alignment}" - ); - } io_engine::init(engine); crate::metrics::virtual_file_descriptor_cache::SIZE_MAX.set(num_slots as u64); } @@ -1175,47 +1352,20 @@ fn get_open_files() -> &'static OpenFiles { } } -static IO_BUFFER_ALIGNMENT: AtomicUsize = AtomicUsize::new(DEFAULT_IO_BUFFER_ALIGNMENT); - -/// Returns true if the alignment is a power of two and is greater or equal to 512. -fn is_valid_io_buffer_alignment(align: usize) -> bool { - align.is_power_of_two() && align >= 512 -} - -/// Sets IO buffer alignment requirement. Returns error if the alignment requirement is -/// not a power of two or less than 512 bytes. -#[allow(unused)] -pub(crate) fn set_io_buffer_alignment(align: usize) -> Result<(), usize> { - if is_valid_io_buffer_alignment(align) { - IO_BUFFER_ALIGNMENT.store(align, std::sync::atomic::Ordering::Relaxed); - Ok(()) - } else { - Err(align) - } +/// Gets the io buffer alignment. +pub(crate) const fn get_io_buffer_alignment() -> usize { + DEFAULT_IO_BUFFER_ALIGNMENT } -/// Gets the io buffer alignment. -/// -/// This function should be used for getting the actual alignment value to use. -pub(crate) fn get_io_buffer_alignment() -> usize { - let align = IO_BUFFER_ALIGNMENT.load(std::sync::atomic::Ordering::Relaxed); +static IO_MODE: AtomicU8 = AtomicU8::new(IoMode::preferred() as u8); - if cfg!(test) { - let env_var_name = "NEON_PAGESERVER_UNIT_TEST_IO_BUFFER_ALIGNMENT"; - if let Some(test_align) = utils::env::var(env_var_name) { - if is_valid_io_buffer_alignment(test_align) { - test_align - } else { - panic!("IO buffer alignment needs to be a power of two and greater than 512, got {test_align}"); - } - } else { - align - } - } else { - align - } +pub(crate) fn set_io_mode(mode: IoMode) { + IO_MODE.store(mode as u8, std::sync::atomic::Ordering::Relaxed); } +pub(crate) fn get_io_mode() -> IoMode { + IoMode::try_from(IO_MODE.load(Ordering::Relaxed)).unwrap() +} #[cfg(test)] mod tests { use crate::context::DownloadBehavior; @@ -1524,7 +1674,7 @@ mod tests { // Open the file many times. let mut files = Vec::new(); for _ in 0..VIRTUAL_FILES { - let f = VirtualFile::open_with_options( + let f = VirtualFileInner::open_with_options( &test_file_path, OpenOptions::new().read(true), &ctx, @@ -1576,7 +1726,7 @@ mod tests { let path = testdir.join("myfile"); let tmp_path = testdir.join("myfile.tmp"); - VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec()) + VirtualFileInner::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec()) .await .unwrap(); let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap()); @@ -1585,7 +1735,7 @@ mod tests { assert!(!tmp_path.exists()); drop(file); - VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"bar".to_vec()) + VirtualFileInner::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"bar".to_vec()) .await .unwrap(); let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap()); @@ -1608,7 +1758,7 @@ mod tests { std::fs::write(&tmp_path, "some preexisting junk that should be removed").unwrap(); assert!(tmp_path.exists()); - VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec()) + VirtualFileInner::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec()) .await .unwrap(); diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 9a51899d5d1f..e6de72752fea 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -395,7 +395,7 @@ def __init__( pageserver_default_tenant_config_compaction_algorithm: Optional[dict[str, Any]] = None, safekeeper_extra_opts: Optional[list[str]] = None, storage_controller_port_override: Optional[int] = None, - pageserver_io_buffer_alignment: Optional[int] = None, + pageserver_virtual_file_io_mode: Optional[str] = None, ): self.repo_dir = repo_dir self.rust_log_override = rust_log_override @@ -449,7 +449,7 @@ def __init__( self.storage_controller_port_override = storage_controller_port_override - self.pageserver_io_buffer_alignment = pageserver_io_buffer_alignment + self.pageserver_virtual_file_io_mode = pageserver_virtual_file_io_mode assert test_name.startswith( "test_" @@ -1038,7 +1038,7 @@ def __init__(self, config: NeonEnvBuilder): self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine self.pageserver_aux_file_policy = config.pageserver_aux_file_policy - self.pageserver_io_buffer_alignment = config.pageserver_io_buffer_alignment + self.pageserver_virtual_file_io_mode = config.pageserver_virtual_file_io_mode # Create the neon_local's `NeonLocalInitConf` cfg: dict[str, Any] = { @@ -1102,7 +1102,8 @@ def __init__(self, config: NeonEnvBuilder): for key, value in override.items(): ps_cfg[key] = value - ps_cfg["io_buffer_alignment"] = self.pageserver_io_buffer_alignment + if self.pageserver_virtual_file_io_mode is not None: + ps_cfg["virtual_file_io_mode"] = self.pageserver_virtual_file_io_mode # Create a corresponding NeonPageserver object self.pageservers.append( @@ -1407,7 +1408,7 @@ def neon_simple_env( pageserver_virtual_file_io_engine: str, pageserver_aux_file_policy: Optional[AuxFileStore], pageserver_default_tenant_config_compaction_algorithm: Optional[dict[str, Any]], - pageserver_io_buffer_alignment: Optional[int], + pageserver_virtual_file_io_mode: Optional[str], ) -> Iterator[NeonEnv]: """ Simple Neon environment, with no authentication and no safekeepers. @@ -1433,7 +1434,7 @@ def neon_simple_env( pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine, pageserver_aux_file_policy=pageserver_aux_file_policy, pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm, - pageserver_io_buffer_alignment=pageserver_io_buffer_alignment, + pageserver_virtual_file_io_mode=pageserver_virtual_file_io_mode, ) as builder: env = builder.init_start() @@ -1457,7 +1458,7 @@ def neon_env_builder( pageserver_default_tenant_config_compaction_algorithm: Optional[dict[str, Any]], pageserver_aux_file_policy: Optional[AuxFileStore], record_property: Callable[[str, object], None], - pageserver_io_buffer_alignment: Optional[int], + pageserver_virtual_file_io_mode: Optional[str], ) -> Iterator[NeonEnvBuilder]: """ Fixture to create a Neon environment for test. @@ -1492,7 +1493,7 @@ def neon_env_builder( test_overlay_dir=test_overlay_dir, pageserver_aux_file_policy=pageserver_aux_file_policy, pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm, - pageserver_io_buffer_alignment=pageserver_io_buffer_alignment, + pageserver_virtual_file_io_mode=pageserver_virtual_file_io_mode, ) as builder: yield builder # Propogate `preserve_database_files` to make it possible to use in other fixtures, diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py index b408d83cf353..3bbac4b8eeb4 100644 --- a/test_runner/fixtures/parametrize.py +++ b/test_runner/fixtures/parametrize.py @@ -41,8 +41,8 @@ def pageserver_virtual_file_io_engine() -> Optional[str]: @pytest.fixture(scope="function", autouse=True) -def pageserver_io_buffer_alignment() -> Optional[int]: - return None +def pageserver_virtual_file_io_mode() -> Optional[str]: + return os.getenv("PAGESERVER_VIRTUAL_FILE_IO_MODE") @pytest.fixture(scope="function", autouse=True)