From a85bd888661823133d7864bb667e88a906936bea Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Tue, 1 Oct 2024 02:48:26 +0000 Subject: [PATCH 01/24] pageserver: add direct io config to virtual file Signed-off-by: Yuchen Liang --- libs/pageserver_api/src/config.rs | 4 +- libs/pageserver_api/src/models.rs | 64 ++-- pageserver/client/src/mgmt_api.rs | 10 + pageserver/src/bin/pageserver.rs | 2 +- pageserver/src/config.rs | 6 +- pageserver/src/http/routes.rs | 12 + pageserver/src/tenant/ephemeral_file.rs | 6 +- .../src/tenant/storage_layer/delta_layer.rs | 10 +- .../src/tenant/storage_layer/image_layer.rs | 6 +- pageserver/src/virtual_file.rs | 278 +++++++++++++++++- test_runner/fixtures/neon_fixtures.py | 12 +- test_runner/fixtures/parametrize.py | 5 + 12 files changed, 341 insertions(+), 74 deletions(-) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 95310fdbac2b..4492730a84c9 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -104,7 +104,7 @@ pub struct ConfigToml { pub image_compression: ImageCompressionAlgorithm, pub ephemeral_bytes_per_memory_kb: usize, pub l0_flush: Option, - pub virtual_file_direct_io: crate::models::virtual_file::DirectIoMode, + pub virtual_file_io_mode: Option, pub io_buffer_alignment: usize, } @@ -381,7 +381,7 @@ impl Default for ConfigToml { image_compression: (DEFAULT_IMAGE_COMPRESSION), ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), l0_flush: None, - virtual_file_direct_io: crate::models::virtual_file::DirectIoMode::default(), + virtual_file_io_mode: None, io_buffer_alignment: DEFAULT_IO_BUFFER_ALIGNMENT, diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 45abda0ad85d..67da2783f5ca 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -972,8 +972,6 @@ pub struct TopTenantShardsResponse { } pub mod virtual_file { - use std::path::PathBuf; - #[derive( Copy, Clone, 
@@ -994,50 +992,34 @@ pub mod virtual_file { } /// Direct IO modes for a pageserver. - #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)] - #[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)] - pub enum DirectIoMode { - /// Direct IO disabled (uses usual buffered IO). - #[default] - Disabled, - /// Direct IO disabled (performs checks and perf simulations). - Evaluate { - /// Alignment check level - alignment_check: DirectIoAlignmentCheckLevel, - /// Latency padded for performance simulation. - latency_padding: DirectIoLatencyPadding, - }, - /// Direct IO enabled. - Enabled { - /// Actions to perform on alignment error. - on_alignment_error: DirectIoOnAlignmentErrorAction, - }, + #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)] + #[serde(rename_all = "kebab-case", deny_unknown_fields)] + #[repr(u8)] + pub enum IoMode { + /// Uses buffered IO. + Buffered, + /// Uses direct IO, error out if the operation fails. + #[cfg(target_os = "linux")] + Direct, } - #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)] - #[serde(rename_all = "kebab-case")] - pub enum DirectIoAlignmentCheckLevel { - #[default] - Error, - Log, - None, + impl IoMode { + pub const fn preferred() -> Self { + Self::Buffered + } } - #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)] - #[serde(rename_all = "kebab-case")] - pub enum DirectIoOnAlignmentErrorAction { - Error, - #[default] - FallbackToBuffered, - } + impl TryFrom for IoMode { + type Error = u8; - #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)] - #[serde(tag = "type", rename_all = "kebab-case")] - pub enum DirectIoLatencyPadding { - /// Pad virtual file operations with IO to a fake file. 
- FakeFileRW { path: PathBuf }, - #[default] - None, + fn try_from(value: u8) -> Result { + Ok(match value { + v if v == (IoMode::Buffered as u8) => IoMode::Buffered, + #[cfg(target_os = "linux")] + v if v == (IoMode::Direct as u8) => IoMode::Direct, + x => return Err(x), + }) + } } } diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 592f1ded0d0b..4c54f4274b7c 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -550,6 +550,16 @@ impl Client { .map_err(Error::ReceiveBody) } + /// Configs io mode at runtime. + pub async fn put_io_mode(&self, mode: virtual_file::IoMode) -> Result<()> { + let uri = format!("{}/v1/io_alignment", self.mgmt_api_endpoint); + self.request(Method::PUT, uri, mode) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + pub async fn get_utilization(&self) -> Result { let uri = format!("{}/v1/utilization", self.mgmt_api_endpoint); self.get(uri) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index e9e52acee67c..a491ade75150 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -125,7 +125,7 @@ fn main() -> anyhow::Result<()> { // after setting up logging, log the effective IO engine choice and read path implementations info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine"); - info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings"); + info!(?conf.virtual_file_io_mode, "starting with virtual_file Direct IO settings"); info!(?conf.io_buffer_alignment, "starting with setting for IO buffer alignment"); // The tenants directory contains all the pageserver local disk state. 
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index e15f1c791b78..6e0acd52e219 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -174,7 +174,7 @@ pub struct PageServerConf { pub l0_flush: crate::l0_flush::L0FlushConfig, /// Direct IO settings - pub virtual_file_direct_io: virtual_file::DirectIoMode, + pub virtual_file_io_mode: virtual_file::IoMode, pub io_buffer_alignment: usize, } @@ -325,7 +325,7 @@ impl PageServerConf { image_compression, ephemeral_bytes_per_memory_kb, l0_flush, - virtual_file_direct_io, + virtual_file_io_mode, concurrent_tenant_warmup, concurrent_tenant_size_logical_size_queries, virtual_file_io_engine, @@ -368,7 +368,6 @@ impl PageServerConf { max_vectored_read_bytes, image_compression, ephemeral_bytes_per_memory_kb, - virtual_file_direct_io, io_buffer_alignment, // ------------------------------------------------------------ @@ -408,6 +407,7 @@ impl PageServerConf { l0_flush: l0_flush .map(crate::l0_flush::L0FlushConfig::from) .unwrap_or_default(), + virtual_file_io_mode: virtual_file_io_mode.unwrap_or(virtual_file::IoMode::preferred()), }; // ------------------------------------------------------------ diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 1cc5502bd602..22bd8cf9b96c 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -17,6 +17,7 @@ use hyper::header; use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; use metrics::launch_timestamp::LaunchTimestamp; +use pageserver_api::models::virtual_file::IoMode; use pageserver_api::models::AuxFilePolicy; use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest; use pageserver_api::models::IngestAuxFilesRequest; @@ -2386,6 +2387,16 @@ async fn put_io_alignment_handler( json_response(StatusCode::OK, ()) } +async fn put_io_mode_handler( + mut r: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + check_permission(&r, None)?; + let mode: IoMode = 
json_request(&mut r).await?; + crate::virtual_file::set_io_mode(mode); + json_response(StatusCode::OK, ()) +} + /// Polled by control plane. /// /// See [`crate::utilization`]. @@ -3076,6 +3087,7 @@ pub fn make_router( .put("/v1/io_alignment", |r| { api_handler(r, put_io_alignment_handler) }) + .put("/v1/io_mode", |r| api_handler(r, put_io_mode_handler)) .put( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/force_aux_policy_switch", |r| api_handler(r, force_aux_policy_switch_handler), diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 5324e1807d9c..a62a47f9a760 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -84,7 +84,7 @@ impl Drop for EphemeralFile { fn drop(&mut self) { // unlink the file // we are clear to do this, because we have entered a gate - let path = &self.buffered_writer.as_inner().as_inner().path; + let path = self.buffered_writer.as_inner().as_inner().path(); let res = std::fs::remove_file(path); if let Err(e) = res { if e.kind() != std::io::ErrorKind::NotFound { @@ -356,7 +356,7 @@ mod tests { } let file_contents = - std::fs::read(&file.buffered_writer.as_inner().as_inner().path).unwrap(); + std::fs::read(file.buffered_writer.as_inner().as_inner().path()).unwrap(); assert_eq!(file_contents, &content[0..cap]); let buffer_contents = file.buffered_writer.inspect_buffer(); @@ -392,7 +392,7 @@ mod tests { .buffered_writer .as_inner() .as_inner() - .path + .path() .metadata() .unwrap(); assert_eq!( diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 6f9eda85f57b..d18f86fd0e50 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -572,7 +572,7 @@ impl DeltaLayerWriterInner { ensure!( metadata.len() <= S3_UPLOAD_LIMIT, "Created delta layer file at {} of size {} above limit {S3_UPLOAD_LIMIT}!", - file.path, + 
file.path(), metadata.len() ); @@ -790,7 +790,7 @@ impl DeltaLayerInner { max_vectored_read_bytes: Option, ctx: &RequestContext, ) -> anyhow::Result { - let file = VirtualFile::open(path, ctx) + let file = VirtualFile::open_v2(path, ctx) .await .context("open layer file")?; @@ -1010,7 +1010,7 @@ impl DeltaLayerInner { blob_meta.key, PageReconstructError::Other(anyhow!( "Failed to read blobs from virtual file {}: {}", - self.file.path, + self.file.path(), kind )), ); @@ -1036,7 +1036,7 @@ impl DeltaLayerInner { meta.meta.key, PageReconstructError::Other(anyhow!(e).context(format!( "Failed to decompress blob from virtual file {}", - self.file.path, + self.file.path(), ))), ); @@ -1054,7 +1054,7 @@ impl DeltaLayerInner { meta.meta.key, PageReconstructError::Other(anyhow!(e).context(format!( "Failed to deserialize blob from virtual file {}", - self.file.path, + self.file.path(), ))), ); diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 3dcd7bc96271..aedfdbf167fc 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -388,7 +388,7 @@ impl ImageLayerInner { max_vectored_read_bytes: Option, ctx: &RequestContext, ) -> anyhow::Result { - let file = VirtualFile::open(path, ctx) + let file = VirtualFile::open_v2(path, ctx) .await .context("open layer file")?; let file_id = page_cache::next_file_id(); @@ -614,7 +614,7 @@ impl ImageLayerInner { meta.meta.key, PageReconstructError::Other(anyhow!(e).context(format!( "Failed to decompress blob from virtual file {}", - self.file.path, + self.file.path(), ))), ); @@ -635,7 +635,7 @@ impl ImageLayerInner { blob_meta.key, PageReconstructError::from(anyhow!( "Failed to read blobs from virtual file {}: {}", - self.file.path, + self.file.path(), kind )), ); diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 5b7b2798888f..302ea04451cd 100644 --- 
a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -23,10 +23,12 @@ use pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT; use pageserver_api::shard::TenantShardId; use std::fs::File; use std::io::{Error, ErrorKind, Seek, SeekFrom}; +#[cfg(target_os = "linux")] +use std::os::unix::fs::OpenOptionsExt; use tokio_epoll_uring::{BoundedBuf, IoBuf, IoBufMut, Slice}; use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd}; -use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; +use std::sync::atomic::{AtomicBool, AtomicU8, AtomicUsize, Ordering}; use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}; use tokio::time::Instant; @@ -38,7 +40,7 @@ pub use io_engine::FeatureTestResult as IoEngineFeatureTestResult; mod metadata; mod open_options; use self::owned_buffers_io::write::OwnedAsyncWriter; -pub(crate) use api::DirectIoMode; +pub(crate) use api::IoMode; pub(crate) use io_engine::IoEngineKind; pub(crate) use metadata::Metadata; pub(crate) use open_options::*; @@ -61,6 +63,227 @@ pub(crate) mod owned_buffers_io { } } +#[derive(Debug)] +pub enum VirtualFile { + Buffered(VirtualFileInner), + Direct(VirtualFileInner), +} + +impl VirtualFile { + fn inner(&self) -> &VirtualFileInner { + match self { + Self::Buffered(file) => file, + Self::Direct(file) => file, + } + } + + fn inner_mut(&mut self) -> &mut VirtualFileInner { + match self { + Self::Buffered(file) => file, + Self::Direct(file) => file, + } + } + + fn into_inner(self) -> VirtualFileInner { + match self { + Self::Buffered(file) => file, + Self::Direct(file) => file, + } + } + /// Open a file in read-only mode. Like File::open. + pub async fn open>( + path: P, + ctx: &RequestContext, + ) -> Result { + Self::open_buffered(path, ctx).await + } + + /// Open a file in read-only mode. Like File::open. + /// + /// `O_DIRECT` will be enabled base on `virtual_file_io_mode`. 
+ pub async fn open_v2>( + path: P, + ctx: &RequestContext, + ) -> Result { + match get_io_mode() { + IoMode::Buffered => Self::open_buffered(path, ctx).await, + #[cfg(target_os = "linux")] + IoMode::Direct => Self::open_direct(path, ctx).await, + } + } + + /// Open a file in read-only mode using direct IO. Like File::open. + async fn open_buffered>( + path: P, + ctx: &RequestContext, + ) -> Result { + let file = + VirtualFileInner::open_with_options(path.as_ref(), OpenOptions::new().read(true), ctx) + .await?; + Ok(Self::Buffered(file)) + } + + /// Open a file in read-only mode using direct IO. Like File::open with `O_DIRECT` flag. + #[cfg(target_os = "linux")] + async fn open_direct>( + path: P, + ctx: &RequestContext, + ) -> Result { + let file = VirtualFileInner::open_with_options( + path.as_ref(), + OpenOptions::new() + .read(true) + .custom_flags(nix::libc::O_DIRECT), + ctx, + ) + .await?; + Ok(Self::Direct(file)) + } + + pub async fn create>( + path: P, + ctx: &RequestContext, + ) -> Result { + let file = VirtualFileInner::create(path, ctx).await?; + Ok(Self::Buffered(file)) + } + + pub async fn create_v2>( + path: P, + ctx: &RequestContext, + ) -> Result { + let file = match get_io_mode() { + IoMode::Buffered => { + let file = VirtualFileInner::open_with_options( + path.as_ref(), + OpenOptions::new().write(true).create(true).truncate(true), + ctx, + ) + .await?; + Self::Buffered(file) + } + #[cfg(target_os = "linux")] + IoMode::Direct => { + let file = VirtualFileInner::open_with_options( + path.as_ref(), + OpenOptions::new() + .write(true) + .create(true) + .truncate(true) + .custom_flags(nix::libc::O_DIRECT), + ctx, + ) + .await?; + Self::Direct(file) + } + }; + Ok(file) + } + + pub async fn open_with_options>( + path: P, + open_options: &OpenOptions, + ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */ + ) -> Result { + let file = 
VirtualFileInner::open_with_options(path, open_options, ctx).await?; + Ok(Self::Buffered(file)) + } + + pub async fn open_with_options_v2>( + path: P, + open_options: &mut OpenOptions, + ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */ + ) -> Result { + let file = match get_io_mode() { + IoMode::Buffered => { + let file = VirtualFileInner::open_with_options(path, open_options, ctx).await?; + Self::Buffered(file) + } + #[cfg(target_os = "linux")] + IoMode::Direct => { + let file = VirtualFileInner::open_with_options( + path, + open_options.custom_flags(nix::libc::O_DIRECT), + ctx, + ) + .await?; + Self::Direct(file) + } + }; + Ok(file) + } + + pub fn path(&self) -> &Utf8Path { + self.inner().path.as_path() + } + + pub async fn crashsafe_overwrite + Send, Buf: IoBuf + Send>( + final_path: Utf8PathBuf, + tmp_path: Utf8PathBuf, + content: B, + ) -> std::io::Result<()> { + VirtualFileInner::crashsafe_overwrite(final_path, tmp_path, content).await + } + + pub async fn sync_all(&self) -> Result<(), Error> { + self.inner().sync_all().await + } + + pub async fn sync_data(&self) -> Result<(), Error> { + self.inner().sync_data().await + } + + pub async fn metadata(&self) -> Result { + self.inner().metadata().await + } + + pub fn remove(self) { + self.into_inner().remove(); + } + + pub async fn seek(&mut self, pos: SeekFrom) -> Result { + self.inner_mut().seek(pos).await + } + + pub async fn read_exact_at( + &self, + slice: Slice, + offset: u64, + ctx: &RequestContext, + ) -> Result, Error> + where + Buf: IoBufMut + Send, + { + self.inner().read_exact_at(slice, offset, ctx).await + } + + pub async fn read_exact_at_page( + &self, + page: PageWriteGuard<'static>, + offset: u64, + ctx: &RequestContext, + ) -> Result, Error> { + self.inner().read_exact_at_page(page, offset, ctx).await + } + + pub async fn write_all_at( + &self, + buf: FullSlice, + offset: u64, + ctx: 
&RequestContext, + ) -> (FullSlice, Result<(), Error>) { + self.inner().write_all_at(buf, offset, ctx).await + } + + pub async fn write_all( + &mut self, + buf: FullSlice, + ctx: &RequestContext, + ) -> (FullSlice, Result) { + self.inner_mut().write_all(buf, ctx).await + } +} + /// /// A virtual file descriptor. You can use this just like std::fs::File, but internally /// the underlying file is closed if the system is low on file descriptors, @@ -77,7 +300,7 @@ pub(crate) mod owned_buffers_io { /// 'tag' field is used to detect whether the handle still is valid or not. /// #[derive(Debug)] -pub struct VirtualFile { +pub struct VirtualFileInner { /// Lazy handle to the global file descriptor cache. The slot that this points to /// might contain our File, or it may be empty, or it may contain a File that /// belongs to a different VirtualFile. @@ -350,12 +573,12 @@ macro_rules! with_file { }}; } -impl VirtualFile { +impl VirtualFileInner { /// Open a file in read-only mode. Like File::open. 
pub async fn open>( path: P, ctx: &RequestContext, - ) -> Result { + ) -> Result { Self::open_with_options(path.as_ref(), OpenOptions::new().read(true), ctx).await } @@ -364,7 +587,7 @@ impl VirtualFile { pub async fn create>( path: P, ctx: &RequestContext, - ) -> Result { + ) -> Result { Self::open_with_options( path.as_ref(), OpenOptions::new().write(true).create(true).truncate(true), @@ -382,7 +605,7 @@ impl VirtualFile { path: P, open_options: &OpenOptions, _ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */ - ) -> Result { + ) -> Result { let path_ref = path.as_ref(); let path_str = path_ref.to_string(); let parts = path_str.split('/').collect::>(); @@ -423,7 +646,7 @@ impl VirtualFile { reopen_options.create_new(false); reopen_options.truncate(false); - let vfile = VirtualFile { + let vfile = VirtualFileInner { handle: RwLock::new(handle), pos: 0, path: path_ref.to_path_buf(), @@ -1034,6 +1257,21 @@ impl tokio_epoll_uring::IoFd for FileGuard { #[cfg(test)] impl VirtualFile { + pub(crate) async fn read_blk( + &self, + blknum: u32, + ctx: &RequestContext, + ) -> Result, std::io::Error> { + self.inner().read_blk(blknum, ctx).await + } + + async fn read_to_end(&mut self, buf: &mut Vec, ctx: &RequestContext) -> Result<(), Error> { + self.inner_mut().read_to_end(buf, ctx).await + } +} + +#[cfg(test)] +impl VirtualFileInner { pub(crate) async fn read_blk( &self, blknum: u32, @@ -1067,7 +1305,7 @@ impl VirtualFile { } } -impl Drop for VirtualFile { +impl Drop for VirtualFileInner { /// If a VirtualFile is dropped, close the underlying file if it was open. 
fn drop(&mut self) { let handle = self.handle.get_mut(); @@ -1216,6 +1454,16 @@ pub(crate) fn get_io_buffer_alignment() -> usize { } } +static IO_MODE: AtomicU8 = AtomicU8::new(IoMode::preferred() as u8); + +pub(crate) fn set_io_mode(mode: IoMode) { + IO_MODE.store(mode as u8, std::sync::atomic::Ordering::Relaxed); +} + +pub(crate) fn get_io_mode() -> IoMode { + let mode = IoMode::try_from(IO_MODE.load(Ordering::Relaxed)).unwrap(); + mode +} #[cfg(test)] mod tests { use crate::context::DownloadBehavior; @@ -1339,10 +1587,10 @@ mod tests { impl Adapter for A { async fn open( path: Utf8PathBuf, - opts: OpenOptions, + mut opts: OpenOptions, ctx: &RequestContext, ) -> Result { - let vf = VirtualFile::open_with_options(&path, &opts, ctx).await?; + let vf = VirtualFile::open_with_options(&path, &mut opts, ctx).await?; Ok(MaybeVirtualFile::VirtualFile(vf)) } } @@ -1524,7 +1772,7 @@ mod tests { // Open the file many times. let mut files = Vec::new(); for _ in 0..VIRTUAL_FILES { - let f = VirtualFile::open_with_options( + let f = VirtualFileInner::open_with_options( &test_file_path, OpenOptions::new().read(true), &ctx, @@ -1576,7 +1824,7 @@ mod tests { let path = testdir.join("myfile"); let tmp_path = testdir.join("myfile.tmp"); - VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec()) + VirtualFileInner::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec()) .await .unwrap(); let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap()); @@ -1585,7 +1833,7 @@ mod tests { assert!(!tmp_path.exists()); drop(file); - VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"bar".to_vec()) + VirtualFileInner::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"bar".to_vec()) .await .unwrap(); let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap()); @@ -1608,7 +1856,7 @@ mod tests { std::fs::write(&tmp_path, "some preexisting junk that should be removed").unwrap(); 
assert!(tmp_path.exists()); - VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec()) + VirtualFileInner::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec()) .await .unwrap(); diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index f5019e39dcd6..cd7d00f17e71 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -401,6 +401,7 @@ def __init__( safekeeper_extra_opts: Optional[list[str]] = None, storage_controller_port_override: Optional[int] = None, pageserver_io_buffer_alignment: Optional[int] = None, + pageserver_virtual_file_io_mode: Optional[str] = None, ): self.repo_dir = repo_dir self.rust_log_override = rust_log_override @@ -455,6 +456,7 @@ def __init__( self.storage_controller_port_override = storage_controller_port_override self.pageserver_io_buffer_alignment = pageserver_io_buffer_alignment + self.pageserver_virtual_file_io_mode = pageserver_virtual_file_io_mode assert test_name.startswith( "test_" @@ -1025,6 +1027,7 @@ def __init__(self, config: NeonEnvBuilder): self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine self.pageserver_aux_file_policy = config.pageserver_aux_file_policy self.pageserver_io_buffer_alignment = config.pageserver_io_buffer_alignment + self.pageserver_virtual_file_io_mode = config.pageserver_virtual_file_io_mode # Create the neon_local's `NeonLocalInitConf` cfg: Dict[str, Any] = { @@ -1088,7 +1091,10 @@ def __init__(self, config: NeonEnvBuilder): for key, value in override.items(): ps_cfg[key] = value - ps_cfg["io_buffer_alignment"] = self.pageserver_io_buffer_alignment + if self.pageserver_io_buffer_alignment is not None: + ps_cfg["io_buffer_alignment"] = self.pageserver_io_buffer_alignment + if self.pageserver_virtual_file_io_mode is not None: + ps_cfg["virtual_file_io_mode"] = self.pageserver_virtual_file_io_mode # Create a corresponding NeonPageserver object 
self.pageservers.append( @@ -1327,6 +1333,7 @@ def neon_simple_env( pageserver_aux_file_policy: Optional[AuxFileStore], pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]], pageserver_io_buffer_alignment: Optional[int], + pageserver_virtual_file_io_mode: Optional[str], ) -> Iterator[NeonEnv]: """ Simple Neon environment, with no authentication and no safekeepers. @@ -1353,6 +1360,7 @@ def neon_simple_env( pageserver_aux_file_policy=pageserver_aux_file_policy, pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm, pageserver_io_buffer_alignment=pageserver_io_buffer_alignment, + pageserver_virtual_file_io_mode=pageserver_virtual_file_io_mode, ) as builder: env = builder.init_start() @@ -1377,6 +1385,7 @@ def neon_env_builder( pageserver_aux_file_policy: Optional[AuxFileStore], record_property: Callable[[str, object], None], pageserver_io_buffer_alignment: Optional[int], + pageserver_virtual_file_io_mode: Optional[str], ) -> Iterator[NeonEnvBuilder]: """ Fixture to create a Neon environment for test. 
@@ -1412,6 +1421,7 @@ def neon_env_builder( pageserver_aux_file_policy=pageserver_aux_file_policy, pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm, pageserver_io_buffer_alignment=pageserver_io_buffer_alignment, + pageserver_virtual_file_io_mode=pageserver_virtual_file_io_mode, ) as builder: yield builder # Propogate `preserve_database_files` to make it possible to use in other fixtures, diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py index 2c8e71526cda..6c59e54c47c0 100644 --- a/test_runner/fixtures/parametrize.py +++ b/test_runner/fixtures/parametrize.py @@ -39,6 +39,11 @@ def pageserver_io_buffer_alignment() -> Optional[int]: return None +@pytest.fixture(scope="function", autouse=True) +def pageserver_virtual_file_io_mode() -> Optional[str]: + return os.getenv("PAGESERVER_VIRTUAL_FILE_IO_MODE") + + @pytest.fixture(scope="function", autouse=True) def pageserver_aux_file_policy() -> Optional[AuxFileStore]: return None From 95554c7377f60b591f7d504b0e71df779c27db43 Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Tue, 1 Oct 2024 07:59:15 -0400 Subject: [PATCH 02/24] fix clippy Signed-off-by: Yuchen Liang --- pageserver/src/virtual_file.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 302ea04451cd..b0adc354f938 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -1461,8 +1461,7 @@ pub(crate) fn set_io_mode(mode: IoMode) { } pub(crate) fn get_io_mode() -> IoMode { - let mode = IoMode::try_from(IO_MODE.load(Ordering::Relaxed)).unwrap(); - mode + IoMode::try_from(IO_MODE.load(Ordering::Relaxed)).unwrap() } #[cfg(test)] mod tests { @@ -1587,10 +1586,10 @@ mod tests { impl Adapter for A { async fn open( path: Utf8PathBuf, - mut opts: OpenOptions, + opts: OpenOptions, ctx: &RequestContext, ) -> Result { - let vf = 
VirtualFile::open_with_options(&path, &mut opts, ctx).await?; + let vf = VirtualFile::open_with_options(&path, &opts, ctx).await?; Ok(MaybeVirtualFile::VirtualFile(vf)) } } From 3a5b44ea539b62f3b0a30468badf803cbe885cbd Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Tue, 1 Oct 2024 08:16:18 -0400 Subject: [PATCH 03/24] add set_io_mode option to getpage_latest_lsn Signed-off-by: Yuchen Liang --- libs/pageserver_api/src/models.rs | 15 +++++++++++++-- pageserver/client/src/mgmt_api.rs | 5 ++++- .../pagebench/src/cmd/getpage_latest_lsn.rs | 8 ++++++++ 3 files changed, 25 insertions(+), 3 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 67da2783f5ca..3ec9cac2c321 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -992,8 +992,19 @@ pub mod virtual_file { } /// Direct IO modes for a pageserver. - #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)] - #[serde(rename_all = "kebab-case", deny_unknown_fields)] + #[derive( + Copy, + Clone, + PartialEq, + Eq, + Hash, + strum_macros::EnumString, + strum_macros::Display, + serde_with::DeserializeFromStr, + serde_with::SerializeDisplay, + Debug, + )] + #[strum(serialize_all = "kebab-case")] #[repr(u8)] pub enum IoMode { /// Uses buffered IO. diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 4c54f4274b7c..2028fa8f3058 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -551,7 +551,10 @@ impl Client { } /// Configs io mode at runtime. - pub async fn put_io_mode(&self, mode: virtual_file::IoMode) -> Result<()> { + pub async fn put_io_mode( + &self, + mode: &pageserver_api::models::virtual_file::IoMode, + ) -> Result<()> { let uri = format!("{}/v1/io_alignment", self.mgmt_api_endpoint); self.request(Method::PUT, uri, mode) .await? 
diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index ac4a732377b1..603a8b118aee 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -63,6 +63,10 @@ pub(crate) struct Args { #[clap(long)] set_io_alignment: Option, + /// Before starting the benchmark, live-reconfigure the pageserver to use specified io mode (buffered vs. direct). + #[clap(long)] + set_io_mode: Option, + targets: Option>, } @@ -133,6 +137,10 @@ async fn main_impl( mgmt_api_client.put_io_alignment(align).await?; } + if let Some(mode) = &args.set_io_mode { + mgmt_api_client.put_io_mode(mode).await?; + } + // discover targets let timelines: Vec = crate::util::cli::targets::discover( &mgmt_api_client, From 97f7b0b86f3a9ca869867108a1cfc1a4429dba95 Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Tue, 1 Oct 2024 08:31:30 -0400 Subject: [PATCH 04/24] simplify virtual file wrapper Signed-off-by: Yuchen Liang --- pageserver/src/virtual_file.rs | 71 +++++----------------------------- 1 file changed, 10 insertions(+), 61 deletions(-) diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index b0adc354f938..c84816ac5201 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -95,7 +95,8 @@ impl VirtualFile { path: P, ctx: &RequestContext, ) -> Result { - Self::open_buffered(path, ctx).await + let file = VirtualFileInner::open(path, ctx).await?; + Ok(Self::Buffered(file)) } /// Open a file in read-only mode. Like File::open. @@ -105,39 +106,7 @@ impl VirtualFile { path: P, ctx: &RequestContext, ) -> Result { - match get_io_mode() { - IoMode::Buffered => Self::open_buffered(path, ctx).await, - #[cfg(target_os = "linux")] - IoMode::Direct => Self::open_direct(path, ctx).await, - } - } - - /// Open a file in read-only mode using direct IO. Like File::open. 
- async fn open_buffered>( - path: P, - ctx: &RequestContext, - ) -> Result { - let file = - VirtualFileInner::open_with_options(path.as_ref(), OpenOptions::new().read(true), ctx) - .await?; - Ok(Self::Buffered(file)) - } - - /// Open a file in read-only mode using direct IO. Like File::open with `O_DIRECT` flag. - #[cfg(target_os = "linux")] - async fn open_direct>( - path: P, - ctx: &RequestContext, - ) -> Result { - let file = VirtualFileInner::open_with_options( - path.as_ref(), - OpenOptions::new() - .read(true) - .custom_flags(nix::libc::O_DIRECT), - ctx, - ) - .await?; - Ok(Self::Direct(file)) + Self::open_with_options_v2(path.as_ref(), OpenOptions::new().read(true), ctx).await } pub async fn create>( @@ -152,32 +121,12 @@ impl VirtualFile { path: P, ctx: &RequestContext, ) -> Result { - let file = match get_io_mode() { - IoMode::Buffered => { - let file = VirtualFileInner::open_with_options( - path.as_ref(), - OpenOptions::new().write(true).create(true).truncate(true), - ctx, - ) - .await?; - Self::Buffered(file) - } - #[cfg(target_os = "linux")] - IoMode::Direct => { - let file = VirtualFileInner::open_with_options( - path.as_ref(), - OpenOptions::new() - .write(true) - .create(true) - .truncate(true) - .custom_flags(nix::libc::O_DIRECT), - ctx, - ) - .await?; - Self::Direct(file) - } - }; - Ok(file) + VirtualFile::open_with_options_v2( + path.as_ref(), + OpenOptions::new().write(true).create(true).truncate(true), + ctx, + ) + .await } pub async fn open_with_options>( @@ -191,7 +140,7 @@ impl VirtualFile { pub async fn open_with_options_v2>( path: P, - open_options: &mut OpenOptions, + open_options: &mut OpenOptions, // Uses `&mut` here to add `O_DIRECT`. 
ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */ ) -> Result { let file = match get_io_mode() { From 5c76b2d474ed2ca1f1f76bdb7085909a2aeb21d5 Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Tue, 1 Oct 2024 10:58:47 -0400 Subject: [PATCH 05/24] fix put_io_mode to use the correct http endpoint Signed-off-by: Yuchen Liang --- pageserver/client/src/mgmt_api.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 2028fa8f3058..c32d7864485b 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -555,7 +555,7 @@ impl Client { &self, mode: &pageserver_api::models::virtual_file::IoMode, ) -> Result<()> { - let uri = format!("{}/v1/io_alignment", self.mgmt_api_endpoint); + let uri = format!("{}/v1/io_mode", self.mgmt_api_endpoint); self.request(Method::PUT, uri, mode) .await? 
.json() From a04cfd754b2b49099e32f6f24ac3d78acbaffd71 Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Mon, 7 Oct 2024 12:16:11 -0400 Subject: [PATCH 06/24] get rid of io_buffer_alignment config (always 512) Signed-off-by: Yuchen Liang --- libs/pageserver_api/src/config.rs | 4 - pageserver/benches/bench_ingest.rs | 6 +- pageserver/client/src/mgmt_api.rs | 10 -- pageserver/ctl/src/layer_map_analyzer.rs | 6 +- pageserver/ctl/src/layers.rs | 8 +- pageserver/ctl/src/main.rs | 8 +- .../pagebench/src/cmd/getpage_latest_lsn.rs | 8 -- pageserver/src/bin/pageserver.rs | 9 +- pageserver/src/config.rs | 4 - pageserver/src/http/routes.rs | 17 --- .../src/tenant/storage_layer/delta_layer.rs | 2 - pageserver/src/tenant/vectored_blob_io.rs | 112 +++++++----------- pageserver/src/virtual_file.rs | 47 +------- test_runner/fixtures/neon_fixtures.py | 9 -- test_runner/fixtures/parametrize.py | 5 - 15 files changed, 54 insertions(+), 201 deletions(-) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 7e3cdd4a8949..24474d48405e 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -105,7 +105,6 @@ pub struct ConfigToml { pub ephemeral_bytes_per_memory_kb: usize, pub l0_flush: Option, pub virtual_file_io_mode: Option, - pub io_buffer_alignment: usize, } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -389,9 +388,6 @@ impl Default for ConfigToml { ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), l0_flush: None, virtual_file_io_mode: None, - - io_buffer_alignment: DEFAULT_IO_BUFFER_ALIGNMENT, - tenant_config: TenantConfigToml::default(), } } diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs index 72cbb6beabe0..821c8008a950 100644 --- a/pageserver/benches/bench_ingest.rs +++ b/pageserver/benches/bench_ingest.rs @@ -164,11 +164,7 @@ fn criterion_benchmark(c: &mut Criterion) { let conf: &'static PageServerConf = 
Box::leak(Box::new( pageserver::config::PageServerConf::dummy_conf(temp_dir.path().to_path_buf()), )); - virtual_file::init( - 16384, - virtual_file::io_engine_for_bench(), - pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT, - ); + virtual_file::init(16384, virtual_file::io_engine_for_bench()); page_cache::init(conf.page_cache_size); { diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index c32d7864485b..4d76c66905c4 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -540,16 +540,6 @@ impl Client { .map_err(Error::ReceiveBody) } - /// Configs io buffer alignment at runtime. - pub async fn put_io_alignment(&self, align: usize) -> Result<()> { - let uri = format!("{}/v1/io_alignment", self.mgmt_api_endpoint); - self.request(Method::PUT, uri, align) - .await? - .json() - .await - .map_err(Error::ReceiveBody) - } - /// Configs io mode at runtime. pub async fn put_io_mode( &self, diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs index adc090823d84..151b94cf62d3 100644 --- a/pageserver/ctl/src/layer_map_analyzer.rs +++ b/pageserver/ctl/src/layer_map_analyzer.rs @@ -152,11 +152,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> { let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); // Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree. 
- pageserver::virtual_file::init( - 10, - virtual_file::api::IoEngineKind::StdFs, - pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT, - ); + pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs); pageserver::page_cache::init(100); let mut total_delta_layers = 0usize; diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs index dd753398e2fa..fd948bf2efed 100644 --- a/pageserver/ctl/src/layers.rs +++ b/pageserver/ctl/src/layers.rs @@ -59,7 +59,7 @@ pub(crate) enum LayerCmd { async fn read_delta_file(path: impl AsRef, ctx: &RequestContext) -> Result<()> { let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path"); - virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs, 1); + virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs); page_cache::init(100); let file = VirtualFile::open(path, ctx).await?; let file_id = page_cache::next_file_id(); @@ -190,11 +190,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> { new_tenant_id, new_timeline_id, } => { - pageserver::virtual_file::init( - 10, - virtual_file::api::IoEngineKind::StdFs, - pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT, - ); + pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs); pageserver::page_cache::init(100); let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index cf001ef0d5d4..c96664d346cb 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -26,7 +26,7 @@ use pageserver::{ tenant::{dump_layerfile_from_path, metadata::TimelineMetadata}, virtual_file, }; -use pageserver_api::{config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT, shard::TenantShardId}; +use pageserver_api::shard::TenantShardId; use postgres_ffi::ControlFileData; use remote_storage::{RemotePath, RemoteStorageConfig}; use tokio_util::sync::CancellationToken; @@ -205,11 +205,7 @@ fn 
read_pg_control_file(control_file_path: &Utf8Path) -> anyhow::Result<()> { async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> { // Basic initialization of things that don't change after startup - virtual_file::init( - 10, - virtual_file::api::IoEngineKind::StdFs, - DEFAULT_IO_BUFFER_ALIGNMENT, - ); + virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs); page_cache::init(100); let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); dump_layerfile_from_path(path, true, &ctx).await diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index 603a8b118aee..b2df01714d31 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -59,10 +59,6 @@ pub(crate) struct Args { #[clap(long)] set_io_engine: Option, - /// Before starting the benchmark, live-reconfigure the pageserver to use specified alignment for io buffers. - #[clap(long)] - set_io_alignment: Option, - /// Before starting the benchmark, live-reconfigure the pageserver to use specified io mode (buffered vs. direct). 
#[clap(long)] set_io_mode: Option, @@ -133,10 +129,6 @@ async fn main_impl( mgmt_api_client.put_io_engine(engine_str).await?; } - if let Some(align) = args.set_io_alignment { - mgmt_api_client.put_io_alignment(align).await?; - } - if let Some(mode) = &args.set_io_mode { mgmt_api_client.put_io_mode(mode).await?; } diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 5e9bffb050a9..f71a3d26531c 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -125,8 +125,7 @@ fn main() -> anyhow::Result<()> { // after setting up logging, log the effective IO engine choice and read path implementations info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine"); - info!(?conf.virtual_file_io_mode, "starting with virtual_file Direct IO settings"); - info!(?conf.io_buffer_alignment, "starting with setting for IO buffer alignment"); + info!(?conf.virtual_file_io_mode, "starting with virtual_file IO mode"); // The tenants directory contains all the pageserver local disk state. // Create if not exists and make sure all the contents are durable before proceeding. 
@@ -168,11 +167,7 @@ fn main() -> anyhow::Result<()> { let scenario = failpoint_support::init(); // Basic initialization of things that don't change after startup - virtual_file::init( - conf.max_file_descriptors, - conf.virtual_file_io_engine, - conf.io_buffer_alignment, - ); + virtual_file::init(conf.max_file_descriptors, conf.virtual_file_io_engine); page_cache::init(conf.page_cache_size); start_pageserver(launch_ts, conf).context("Failed to start pageserver")?; diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 6e0acd52e219..8db78285e476 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -175,8 +175,6 @@ pub struct PageServerConf { /// Direct IO settings pub virtual_file_io_mode: virtual_file::IoMode, - - pub io_buffer_alignment: usize, } /// Token for authentication to safekeepers @@ -329,7 +327,6 @@ impl PageServerConf { concurrent_tenant_warmup, concurrent_tenant_size_logical_size_queries, virtual_file_io_engine, - io_buffer_alignment, tenant_config, } = config_toml; @@ -368,7 +365,6 @@ impl PageServerConf { max_vectored_read_bytes, image_compression, ephemeral_bytes_per_memory_kb, - io_buffer_alignment, // ------------------------------------------------------------ // fields that require additional validation or custom handling diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 0f965bd612ec..a5dcb430de18 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -2380,20 +2380,6 @@ async fn put_io_engine_handler( json_response(StatusCode::OK, ()) } -async fn put_io_alignment_handler( - mut r: Request, - _cancel: CancellationToken, -) -> Result, ApiError> { - check_permission(&r, None)?; - let align: usize = json_request(&mut r).await?; - crate::virtual_file::set_io_buffer_alignment(align).map_err(|align| { - ApiError::PreconditionFailed( - format!("Requested io alignment ({align}) is not a power of two").into(), - ) - })?; - json_response(StatusCode::OK, ()) -} - 
async fn put_io_mode_handler( mut r: Request, _cancel: CancellationToken, @@ -3091,9 +3077,6 @@ pub fn make_router( |r| api_handler(r, timeline_collect_keyspace), ) .put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler)) - .put("/v1/io_alignment", |r| { - api_handler(r, put_io_alignment_handler) - }) .put("/v1/io_mode", |r| api_handler(r, put_io_mode_handler)) .put( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/force_aux_policy_switch", diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 5621aae3f378..8be7d7876f82 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -1198,7 +1198,6 @@ impl DeltaLayerInner { let mut prev: Option<(Key, Lsn, BlobRef)> = None; let mut read_builder: Option = None; - let align = virtual_file::get_io_buffer_alignment(); let max_read_size = self .max_vectored_read_bytes @@ -1247,7 +1246,6 @@ impl DeltaLayerInner { offsets.end.pos(), meta, max_read_size, - align, )) } } else { diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index 1faa6bab99e0..792c769b4fc0 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -194,8 +194,6 @@ pub(crate) struct ChunkedVectoredReadBuilder { /// Start offset and metadata for each blob in this read blobs_at: VecMap, max_read_size: Option, - /// Chunk size reads are coalesced into. - chunk_size: usize, } /// Computes x / d rounded up. @@ -204,6 +202,7 @@ fn div_round_up(x: usize, d: usize) -> usize { } impl ChunkedVectoredReadBuilder { + const CHUNK_SIZE: usize = virtual_file::get_io_buffer_alignment(); /// Start building a new vectored read. 
/// /// Note that by design, this does not check against reading more than `max_read_size` to @@ -214,21 +213,19 @@ impl ChunkedVectoredReadBuilder { end_offset: u64, meta: BlobMeta, max_read_size: Option, - chunk_size: usize, ) -> Self { let mut blobs_at = VecMap::default(); blobs_at .append(start_offset, meta) .expect("First insertion always succeeds"); - let start_blk_no = start_offset as usize / chunk_size; - let end_blk_no = div_round_up(end_offset as usize, chunk_size); + let start_blk_no = start_offset as usize / Self::CHUNK_SIZE; + let end_blk_no = div_round_up(end_offset as usize, Self::CHUNK_SIZE); Self { start_blk_no, end_blk_no, blobs_at, max_read_size, - chunk_size, } } @@ -237,18 +234,12 @@ impl ChunkedVectoredReadBuilder { end_offset: u64, meta: BlobMeta, max_read_size: usize, - align: usize, ) -> Self { - Self::new_impl(start_offset, end_offset, meta, Some(max_read_size), align) + Self::new_impl(start_offset, end_offset, meta, Some(max_read_size)) } - pub(crate) fn new_streaming( - start_offset: u64, - end_offset: u64, - meta: BlobMeta, - align: usize, - ) -> Self { - Self::new_impl(start_offset, end_offset, meta, None, align) + pub(crate) fn new_streaming(start_offset: u64, end_offset: u64, meta: BlobMeta) -> Self { + Self::new_impl(start_offset, end_offset, meta, None) } /// Attempts to extend the current read with a new blob if the new blob resides in the same or the immediate next chunk. @@ -256,12 +247,12 @@ impl ChunkedVectoredReadBuilder { /// The resulting size also must be below the max read size. 
pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended { tracing::trace!(start, end, "trying to extend"); - let start_blk_no = start as usize / self.chunk_size; - let end_blk_no = div_round_up(end as usize, self.chunk_size); + let start_blk_no = start as usize / Self::CHUNK_SIZE; + let end_blk_no = div_round_up(end as usize, Self::CHUNK_SIZE); let not_limited_by_max_read_size = { if let Some(max_read_size) = self.max_read_size { - let coalesced_size = (end_blk_no - self.start_blk_no) * self.chunk_size; + let coalesced_size = (end_blk_no - self.start_blk_no) * Self::CHUNK_SIZE; coalesced_size <= max_read_size } else { true @@ -292,12 +283,12 @@ impl ChunkedVectoredReadBuilder { } pub(crate) fn size(&self) -> usize { - (self.end_blk_no - self.start_blk_no) * self.chunk_size + (self.end_blk_no - self.start_blk_no) * Self::CHUNK_SIZE } pub(crate) fn build(self) -> VectoredRead { - let start = (self.start_blk_no * self.chunk_size) as u64; - let end = (self.end_blk_no * self.chunk_size) as u64; + let start = (self.start_blk_no * Self::CHUNK_SIZE) as u64; + let end = (self.end_blk_no * Self::CHUNK_SIZE) as u64; VectoredRead { start, end, @@ -328,18 +319,14 @@ pub struct VectoredReadPlanner { prev: Option<(Key, Lsn, u64, BlobFlag)>, max_read_size: usize, - - align: usize, } impl VectoredReadPlanner { pub fn new(max_read_size: usize) -> Self { - let align = virtual_file::get_io_buffer_alignment(); Self { blobs: BTreeMap::new(), prev: None, max_read_size, - align, } } @@ -418,7 +405,6 @@ impl VectoredReadPlanner { end_offset, BlobMeta { key, lsn }, self.max_read_size, - self.align, ); let prev_read_builder = current_read_builder.replace(next_read_builder); @@ -472,13 +458,13 @@ impl<'a> VectoredBlobReader<'a> { ); if cfg!(debug_assertions) { - let align = virtual_file::get_io_buffer_alignment() as u64; + const ALIGN: u64 = virtual_file::get_io_buffer_alignment() as u64; debug_assert_eq!( - read.start % align, + read.start % ALIGN, 0, 
"Read start at {} does not satisfy the required io buffer alignment ({} bytes)", read.start, - align + ALIGN ); } @@ -553,22 +539,18 @@ pub struct StreamingVectoredReadPlanner { max_cnt: usize, /// Size of the current batch cnt: usize, - - align: usize, } impl StreamingVectoredReadPlanner { pub fn new(max_read_size: u64, max_cnt: usize) -> Self { assert!(max_cnt > 0); assert!(max_read_size > 0); - let align = virtual_file::get_io_buffer_alignment(); Self { read_builder: None, prev: None, max_cnt, max_read_size, cnt: 0, - align, } } @@ -621,7 +603,6 @@ impl StreamingVectoredReadPlanner { start_offset, end_offset, BlobMeta { key, lsn }, - self.align, )) }; } @@ -656,9 +637,9 @@ mod tests { use super::*; fn validate_read(read: &VectoredRead, offset_range: &[(Key, Lsn, u64, BlobFlag)]) { - let align = virtual_file::get_io_buffer_alignment() as u64; - assert_eq!(read.start % align, 0); - assert_eq!(read.start / align, offset_range.first().unwrap().2 / align); + const ALIGN: u64 = virtual_file::get_io_buffer_alignment() as u64; + assert_eq!(read.start % ALIGN, 0); + assert_eq!(read.start / ALIGN, offset_range.first().unwrap().2 / ALIGN); let expected_offsets_in_read: Vec<_> = offset_range.iter().map(|o| o.2).collect(); @@ -676,32 +657,27 @@ mod tests { fn planner_chunked_coalesce_all_test() { use crate::virtual_file; - let chunk_size = virtual_file::get_io_buffer_alignment() as u64; - - // The test explicitly does not check chunk size < 512 - if chunk_size < 512 { - return; - } + const CHUNK_SIZE: u64 = virtual_file::get_io_buffer_alignment() as u64; - let max_read_size = chunk_size as usize * 8; + let max_read_size = CHUNK_SIZE as usize * 8; let key = Key::MIN; let lsn = Lsn(0); let blob_descriptions = [ - (key, lsn, chunk_size / 8, BlobFlag::None), // Read 1 BEGIN - (key, lsn, chunk_size / 4, BlobFlag::Ignore), // Gap - (key, lsn, chunk_size / 2, BlobFlag::None), - (key, lsn, chunk_size - 2, BlobFlag::Ignore), // Gap - (key, lsn, chunk_size, BlobFlag::None), - (key, 
lsn, chunk_size * 2 - 1, BlobFlag::None), - (key, lsn, chunk_size * 2 + 1, BlobFlag::Ignore), // Gap - (key, lsn, chunk_size * 3 + 1, BlobFlag::None), - (key, lsn, chunk_size * 5 + 1, BlobFlag::None), - (key, lsn, chunk_size * 6 + 1, BlobFlag::Ignore), // skipped chunk size, but not a chunk: should coalesce. - (key, lsn, chunk_size * 7 + 1, BlobFlag::None), - (key, lsn, chunk_size * 8, BlobFlag::None), // Read 2 BEGIN (b/c max_read_size) - (key, lsn, chunk_size * 9, BlobFlag::Ignore), // ==== skipped a chunk - (key, lsn, chunk_size * 10, BlobFlag::None), // Read 3 BEGIN (cannot coalesce) + (key, lsn, CHUNK_SIZE / 8, BlobFlag::None), // Read 1 BEGIN + (key, lsn, CHUNK_SIZE / 4, BlobFlag::Ignore), // Gap + (key, lsn, CHUNK_SIZE / 2, BlobFlag::None), + (key, lsn, CHUNK_SIZE - 2, BlobFlag::Ignore), // Gap + (key, lsn, CHUNK_SIZE, BlobFlag::None), + (key, lsn, CHUNK_SIZE * 2 - 1, BlobFlag::None), + (key, lsn, CHUNK_SIZE * 2 + 1, BlobFlag::Ignore), // Gap + (key, lsn, CHUNK_SIZE * 3 + 1, BlobFlag::None), + (key, lsn, CHUNK_SIZE * 5 + 1, BlobFlag::None), + (key, lsn, CHUNK_SIZE * 6 + 1, BlobFlag::Ignore), // skipped chunk size, but not a chunk: should coalesce. 
+ (key, lsn, CHUNK_SIZE * 7 + 1, BlobFlag::None), + (key, lsn, CHUNK_SIZE * 8, BlobFlag::None), // Read 2 BEGIN (b/c max_read_size) + (key, lsn, CHUNK_SIZE * 9, BlobFlag::Ignore), // ==== skipped a chunk + (key, lsn, CHUNK_SIZE * 10, BlobFlag::None), // Read 3 BEGIN (cannot coalesce) ]; let ranges = [ @@ -780,19 +756,19 @@ mod tests { #[test] fn planner_replacement_test() { - let chunk_size = virtual_file::get_io_buffer_alignment() as u64; - let max_read_size = 128 * chunk_size as usize; + const CHUNK_SIZE: u64 = virtual_file::get_io_buffer_alignment() as u64; + let max_read_size = 128 * CHUNK_SIZE as usize; let first_key = Key::MIN; let second_key = first_key.next(); let lsn = Lsn(0); let blob_descriptions = vec![ (first_key, lsn, 0, BlobFlag::None), // First in read 1 - (first_key, lsn, chunk_size, BlobFlag::None), // Last in read 1 - (second_key, lsn, 2 * chunk_size, BlobFlag::ReplaceAll), - (second_key, lsn, 3 * chunk_size, BlobFlag::None), - (second_key, lsn, 4 * chunk_size, BlobFlag::ReplaceAll), // First in read 2 - (second_key, lsn, 5 * chunk_size, BlobFlag::None), // Last in read 2 + (first_key, lsn, CHUNK_SIZE, BlobFlag::None), // Last in read 1 + (second_key, lsn, 2 * CHUNK_SIZE, BlobFlag::ReplaceAll), + (second_key, lsn, 3 * CHUNK_SIZE, BlobFlag::None), + (second_key, lsn, 4 * CHUNK_SIZE, BlobFlag::ReplaceAll), // First in read 2 + (second_key, lsn, 5 * CHUNK_SIZE, BlobFlag::None), // Last in read 2 ]; let ranges = [&blob_descriptions[0..2], &blob_descriptions[4..]]; @@ -802,7 +778,7 @@ mod tests { planner.handle(key, lsn, offset, flag); } - planner.handle_range_end(6 * chunk_size); + planner.handle_range_end(6 * CHUNK_SIZE); let reads = planner.finish(); assert_eq!(reads.len(), 2); @@ -947,7 +923,6 @@ mod tests { let reserved_bytes = blobs.iter().map(|bl| bl.len()).max().unwrap() * 2 + 16; let mut buf = BytesMut::with_capacity(reserved_bytes); - let align = virtual_file::get_io_buffer_alignment(); let vectored_blob_reader = 
VectoredBlobReader::new(&file); let meta = BlobMeta { key: Key::MIN, @@ -959,8 +934,7 @@ mod tests { if idx + 1 == offsets.len() { continue; } - let read_builder = - ChunkedVectoredReadBuilder::new(*offset, *end, meta, 16 * 4096, align); + let read_builder = ChunkedVectoredReadBuilder::new(*offset, *end, meta, 16 * 4096); let read = read_builder.build(); let result = vectored_blob_reader.read_blobs(&read, buf, &ctx).await?; assert_eq!(result.blobs.len(), 1); diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index c84816ac5201..2cd11f8f0461 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -1330,15 +1330,10 @@ impl OpenFiles { /// server startup. /// #[cfg(not(test))] -pub fn init(num_slots: usize, engine: IoEngineKind, io_buffer_alignment: usize) { +pub fn init(num_slots: usize, engine: IoEngineKind) { if OPEN_FILES.set(OpenFiles::new(num_slots)).is_err() { panic!("virtual_file::init called twice"); } - if set_io_buffer_alignment(io_buffer_alignment).is_err() { - panic!( - "IO buffer alignment needs to be a power of two and greater than 512, got {io_buffer_alignment}" - ); - } io_engine::init(engine); crate::metrics::virtual_file_descriptor_cache::SIZE_MAX.set(num_slots as u64); } @@ -1362,45 +1357,9 @@ fn get_open_files() -> &'static OpenFiles { } } -static IO_BUFFER_ALIGNMENT: AtomicUsize = AtomicUsize::new(DEFAULT_IO_BUFFER_ALIGNMENT); - -/// Returns true if the alignment is a power of two and is greater or equal to 512. -fn is_valid_io_buffer_alignment(align: usize) -> bool { - align.is_power_of_two() && align >= 512 -} - -/// Sets IO buffer alignment requirement. Returns error if the alignment requirement is -/// not a power of two or less than 512 bytes. 
-#[allow(unused)] -pub(crate) fn set_io_buffer_alignment(align: usize) -> Result<(), usize> { - if is_valid_io_buffer_alignment(align) { - IO_BUFFER_ALIGNMENT.store(align, std::sync::atomic::Ordering::Relaxed); - Ok(()) - } else { - Err(align) - } -} - /// Gets the io buffer alignment. -/// -/// This function should be used for getting the actual alignment value to use. -pub(crate) fn get_io_buffer_alignment() -> usize { - let align = IO_BUFFER_ALIGNMENT.load(std::sync::atomic::Ordering::Relaxed); - - if cfg!(test) { - let env_var_name = "NEON_PAGESERVER_UNIT_TEST_IO_BUFFER_ALIGNMENT"; - if let Some(test_align) = utils::env::var(env_var_name) { - if is_valid_io_buffer_alignment(test_align) { - test_align - } else { - panic!("IO buffer alignment needs to be a power of two and greater than 512, got {test_align}"); - } - } else { - align - } - } else { - align - } +pub(crate) const fn get_io_buffer_alignment() -> usize { + DEFAULT_IO_BUFFER_ALIGNMENT } static IO_MODE: AtomicU8 = AtomicU8::new(IoMode::preferred() as u8); diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 11a62e258d49..ad8ac037b3f6 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -398,7 +398,6 @@ def __init__( pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]] = None, safekeeper_extra_opts: Optional[list[str]] = None, storage_controller_port_override: Optional[int] = None, - pageserver_io_buffer_alignment: Optional[int] = None, pageserver_virtual_file_io_mode: Optional[str] = None, ): self.repo_dir = repo_dir @@ -453,7 +452,6 @@ def __init__( self.storage_controller_port_override = storage_controller_port_override - self.pageserver_io_buffer_alignment = pageserver_io_buffer_alignment self.pageserver_virtual_file_io_mode = pageserver_virtual_file_io_mode assert test_name.startswith( @@ -1043,7 +1041,6 @@ def __init__(self, config: NeonEnvBuilder): 
self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine self.pageserver_aux_file_policy = config.pageserver_aux_file_policy - self.pageserver_io_buffer_alignment = config.pageserver_io_buffer_alignment self.pageserver_virtual_file_io_mode = config.pageserver_virtual_file_io_mode # Create the neon_local's `NeonLocalInitConf` @@ -1108,8 +1105,6 @@ def __init__(self, config: NeonEnvBuilder): for key, value in override.items(): ps_cfg[key] = value - if self.pageserver_io_buffer_alignment is not None: - ps_cfg["io_buffer_alignment"] = self.pageserver_io_buffer_alignment if self.pageserver_virtual_file_io_mode is not None: ps_cfg["virtual_file_io_mode"] = self.pageserver_virtual_file_io_mode @@ -1417,7 +1412,6 @@ def neon_simple_env( pageserver_virtual_file_io_engine: str, pageserver_aux_file_policy: Optional[AuxFileStore], pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]], - pageserver_io_buffer_alignment: Optional[int], pageserver_virtual_file_io_mode: Optional[str], ) -> Iterator[NeonEnv]: """ @@ -1444,7 +1438,6 @@ def neon_simple_env( pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine, pageserver_aux_file_policy=pageserver_aux_file_policy, pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm, - pageserver_io_buffer_alignment=pageserver_io_buffer_alignment, pageserver_virtual_file_io_mode=pageserver_virtual_file_io_mode, ) as builder: env = builder.init_start() @@ -1469,7 +1462,6 @@ def neon_env_builder( pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]], pageserver_aux_file_policy: Optional[AuxFileStore], record_property: Callable[[str, object], None], - pageserver_io_buffer_alignment: Optional[int], pageserver_virtual_file_io_mode: Optional[str], ) -> Iterator[NeonEnvBuilder]: """ @@ -1505,7 +1497,6 @@ def neon_env_builder( test_overlay_dir=test_overlay_dir, pageserver_aux_file_policy=pageserver_aux_file_policy, 
pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm, - pageserver_io_buffer_alignment=pageserver_io_buffer_alignment, pageserver_virtual_file_io_mode=pageserver_virtual_file_io_mode, ) as builder: yield builder diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py index 6c59e54c47c0..d636d343942b 100644 --- a/test_runner/fixtures/parametrize.py +++ b/test_runner/fixtures/parametrize.py @@ -34,11 +34,6 @@ def pageserver_virtual_file_io_engine() -> Optional[str]: return os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE") -@pytest.fixture(scope="function", autouse=True) -def pageserver_io_buffer_alignment() -> Optional[int]: - return None - - @pytest.fixture(scope="function", autouse=True) def pageserver_virtual_file_io_mode() -> Optional[str]: return os.getenv("PAGESERVER_VIRTUAL_FILE_IO_MODE") From 4e1309475c5761e5b72d017f8a709a4455000dfe Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Tue, 8 Oct 2024 12:57:58 -0400 Subject: [PATCH 07/24] review: clone open_options instead of taking mut Signed-off-by: Yuchen Liang --- pageserver/src/virtual_file.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 2cd11f8f0461..55d5e57cfc4b 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -140,7 +140,7 @@ impl VirtualFile { pub async fn open_with_options_v2>( path: P, - open_options: &mut OpenOptions, // Uses `&mut` here to add `O_DIRECT`. 
+ open_options: &OpenOptions, ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */ ) -> Result { let file = match get_io_mode() { @@ -152,7 +152,7 @@ impl VirtualFile { IoMode::Direct => { let file = VirtualFileInner::open_with_options( path, - open_options.custom_flags(nix::libc::O_DIRECT), + open_options.clone().custom_flags(nix::libc::O_DIRECT), ctx, ) .await?; From 6d03e2810d66b2a4cf2e8390cc5cb7037174335b Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Tue, 8 Oct 2024 17:21:58 +0000 Subject: [PATCH 08/24] review: use a inner and mode member for VirtualFile Signed-off-by: Yuchen Liang --- pageserver/src/virtual_file.rs | 85 ++++++++++++++++------------------ 1 file changed, 40 insertions(+), 45 deletions(-) diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 55d5e57cfc4b..d260116b3892 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -64,39 +64,22 @@ pub(crate) mod owned_buffers_io { } #[derive(Debug)] -pub enum VirtualFile { - Buffered(VirtualFileInner), - Direct(VirtualFileInner), +pub struct VirtualFile { + inner: VirtualFileInner, + _mode: IoMode, } impl VirtualFile { - fn inner(&self) -> &VirtualFileInner { - match self { - Self::Buffered(file) => file, - Self::Direct(file) => file, - } - } - - fn inner_mut(&mut self) -> &mut VirtualFileInner { - match self { - Self::Buffered(file) => file, - Self::Direct(file) => file, - } - } - - fn into_inner(self) -> VirtualFileInner { - match self { - Self::Buffered(file) => file, - Self::Direct(file) => file, - } - } /// Open a file in read-only mode. Like File::open. 
pub async fn open>( path: P, ctx: &RequestContext, ) -> Result { - let file = VirtualFileInner::open(path, ctx).await?; - Ok(Self::Buffered(file)) + let inner = VirtualFileInner::open(path, ctx).await?; + Ok(VirtualFile { + inner, + _mode: IoMode::Buffered, + }) } /// Open a file in read-only mode. Like File::open. @@ -113,8 +96,11 @@ impl VirtualFile { path: P, ctx: &RequestContext, ) -> Result { - let file = VirtualFileInner::create(path, ctx).await?; - Ok(Self::Buffered(file)) + let inner = VirtualFileInner::create(path, ctx).await?; + Ok(VirtualFile { + inner, + _mode: IoMode::Buffered, + }) } pub async fn create_v2>( @@ -134,8 +120,11 @@ impl VirtualFile { open_options: &OpenOptions, ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */ ) -> Result { - let file = VirtualFileInner::open_with_options(path, open_options, ctx).await?; - Ok(Self::Buffered(file)) + let inner = VirtualFileInner::open_with_options(path, open_options, ctx).await?; + Ok(VirtualFile { + inner, + _mode: IoMode::Buffered, + }) } pub async fn open_with_options_v2>( @@ -145,25 +134,31 @@ impl VirtualFile { ) -> Result { let file = match get_io_mode() { IoMode::Buffered => { - let file = VirtualFileInner::open_with_options(path, open_options, ctx).await?; - Self::Buffered(file) + let inner = VirtualFileInner::open_with_options(path, open_options, ctx).await?; + VirtualFile { + inner, + _mode: IoMode::Buffered, + } } #[cfg(target_os = "linux")] IoMode::Direct => { - let file = VirtualFileInner::open_with_options( + let inner = VirtualFileInner::open_with_options( path, open_options.clone().custom_flags(nix::libc::O_DIRECT), ctx, ) .await?; - Self::Direct(file) + VirtualFile { + inner, + _mode: IoMode::Direct, + } } }; Ok(file) } pub fn path(&self) -> &Utf8Path { - self.inner().path.as_path() + self.inner.path.as_path() } pub async fn crashsafe_overwrite + Send, Buf: IoBuf + Send>( @@ 
-175,23 +170,23 @@ impl VirtualFile { } pub async fn sync_all(&self) -> Result<(), Error> { - self.inner().sync_all().await + self.inner.sync_all().await } pub async fn sync_data(&self) -> Result<(), Error> { - self.inner().sync_data().await + self.inner.sync_data().await } pub async fn metadata(&self) -> Result { - self.inner().metadata().await + self.inner.metadata().await } pub fn remove(self) { - self.into_inner().remove(); + self.inner.remove(); } pub async fn seek(&mut self, pos: SeekFrom) -> Result { - self.inner_mut().seek(pos).await + self.inner.seek(pos).await } pub async fn read_exact_at( @@ -203,7 +198,7 @@ impl VirtualFile { where Buf: IoBufMut + Send, { - self.inner().read_exact_at(slice, offset, ctx).await + self.inner.read_exact_at(slice, offset, ctx).await } pub async fn read_exact_at_page( @@ -212,7 +207,7 @@ impl VirtualFile { offset: u64, ctx: &RequestContext, ) -> Result, Error> { - self.inner().read_exact_at_page(page, offset, ctx).await + self.inner.read_exact_at_page(page, offset, ctx).await } pub async fn write_all_at( @@ -221,7 +216,7 @@ impl VirtualFile { offset: u64, ctx: &RequestContext, ) -> (FullSlice, Result<(), Error>) { - self.inner().write_all_at(buf, offset, ctx).await + self.inner.write_all_at(buf, offset, ctx).await } pub async fn write_all( @@ -229,7 +224,7 @@ impl VirtualFile { buf: FullSlice, ctx: &RequestContext, ) -> (FullSlice, Result) { - self.inner_mut().write_all(buf, ctx).await + self.inner.write_all(buf, ctx).await } } @@ -1211,11 +1206,11 @@ impl VirtualFile { blknum: u32, ctx: &RequestContext, ) -> Result, std::io::Error> { - self.inner().read_blk(blknum, ctx).await + self.inner.read_blk(blknum, ctx).await } async fn read_to_end(&mut self, buf: &mut Vec, ctx: &RequestContext) -> Result<(), Error> { - self.inner_mut().read_to_end(buf, ctx).await + self.inner.read_to_end(buf, ctx).await } } From ee4600034e3f1691eb10497a01397aed6abe08b1 Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Mon, 7 Oct 2024 15:16:32 -0400 
Subject: [PATCH 09/24] pageserver: implement aligned io buffer Signed-off-by: Yuchen Liang --- pageserver/src/virtual_file.rs | 4 + pageserver/src/virtual_file/dio.rs | 405 ++++++++++++++++++ .../owned_buffers_io/io_buf_aligned.rs | 9 + .../owned_buffers_io/io_buf_ext.rs | 2 + 4 files changed, 420 insertions(+) create mode 100644 pageserver/src/virtual_file/dio.rs create mode 100644 pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index d260116b3892..75a355154e46 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -44,6 +44,7 @@ pub(crate) use api::IoMode; pub(crate) use io_engine::IoEngineKind; pub(crate) use metadata::Metadata; pub(crate) use open_options::*; +pub(crate) mod dio; pub(crate) mod owned_buffers_io { //! Abstractions for IO with owned buffers. @@ -55,6 +56,7 @@ pub(crate) mod owned_buffers_io { //! but for the time being we're proving out the primitives in the neon.git repo //! for faster iteration. 
+ pub(crate) mod io_buf_aligned; pub(crate) mod io_buf_ext; pub(crate) mod slice; pub(crate) mod write; @@ -1357,6 +1359,8 @@ pub(crate) const fn get_io_buffer_alignment() -> usize { DEFAULT_IO_BUFFER_ALIGNMENT } +pub(crate) type IoBufferMut = dio::AlignedBufferMut<{ get_io_buffer_alignment() }>; + static IO_MODE: AtomicU8 = AtomicU8::new(IoMode::preferred() as u8); pub(crate) fn set_io_mode(mode: IoMode) { diff --git a/pageserver/src/virtual_file/dio.rs b/pageserver/src/virtual_file/dio.rs new file mode 100644 index 000000000000..16497c8951f4 --- /dev/null +++ b/pageserver/src/virtual_file/dio.rs @@ -0,0 +1,405 @@ +#![allow(unused)] + +use core::slice; +use std::{ + alloc::{self, Layout}, + cmp, + mem::{ManuallyDrop, MaybeUninit}, + ops::{Deref, DerefMut}, + ptr::{addr_of_mut, NonNull}, +}; + +use bytes::buf::UninitSlice; + +struct IoBufferPtr(*mut u8); + +// SAFETY: We gurantees no one besides `IoBufferPtr` itself has the raw pointer. +unsafe impl Send for IoBufferPtr {} + +/// An aligned buffer type used for I/O. +pub struct AlignedBufferMut { + ptr: IoBufferPtr, + capacity: usize, + len: usize, +} + +impl AlignedBufferMut { + /// Constructs a new, empty `IoBufferMut` with at least the specified capacity and alignment. + /// + /// The buffer will be able to hold at most `capacity` elements and will never resize. + /// + /// + /// # Panics + /// + /// Panics if the new capacity exceeds `isize::MAX` _bytes_, or if the following alignment requirement is not met: + /// * `align` must not be zero, + /// + /// * `align` must be a power of two, + /// + /// * `capacity`, when rounded up to the nearest multiple of `align`, + /// must not overflow isize (i.e., the rounded value must be + /// less than or equal to `isize::MAX`). + pub fn with_capacity(capacity: usize) -> Self { + let layout = Layout::from_size_align(capacity, ALIGN).expect("Invalid layout"); + + // SAFETY: Making an allocation with a sized and aligned layout. 
The memory is manually freed with the same layout. + let ptr = unsafe { + let ptr = alloc::alloc(layout); + if ptr.is_null() { + alloc::handle_alloc_error(layout); + } + IoBufferPtr(ptr) + }; + + AlignedBufferMut { + ptr, + capacity, + len: 0, + } + } + + /// Constructs a new `IoBufferMut` with at least the specified capacity and alignment, filled with zeros. + pub fn with_capacity_zeroed(capacity: usize) -> Self { + use bytes::BufMut; + let mut buf = Self::with_capacity(capacity); + buf.put_bytes(0, capacity); + buf.len = capacity; + buf + } + + /// Returns the total number of bytes the buffer can hold. + #[inline] + pub fn capacity(&self) -> usize { + self.capacity + } + + /// Returns the alignment of the buffer. + #[inline] + pub const fn align(&self) -> usize { + ALIGN + } + + /// Returns the number of bytes in the buffer, also referred to as its 'length'. + #[inline] + pub fn len(&self) -> usize { + self.len + } + + /// Force the length of the buffer to `new_len`. + #[inline] + unsafe fn set_len(&mut self, new_len: usize) { + debug_assert!(new_len <= self.capacity()); + self.len = new_len; + } + + #[inline] + fn as_ptr(&self) -> *const u8 { + self.ptr.0 + } + + #[inline] + fn as_mut_ptr(&mut self) -> *mut u8 { + self.ptr.0 + } + + /// Extracts a slice containing the entire buffer. + /// + /// Equivalent to `&s[..]`. + #[inline] + fn as_slice(&self) -> &[u8] { + // SAFETY: The pointer is valid and `len` bytes are initialized. + unsafe { slice::from_raw_parts(self.as_ptr(), self.len) } + } + + /// Extracts a mutable slice of the entire buffer. + /// + /// Equivalent to `&mut s[..]`. + fn as_mut_slice(&mut self) -> &mut [u8] { + // SAFETY: The pointer is valid and `len` bytes are initialized. + unsafe { slice::from_raw_parts_mut(self.as_mut_ptr(), self.len) } + } + + /// Drops the all the contents of the buffer, setting its length to `0`. 
+ #[inline] + pub fn clear(&mut self) { + self.len = 0; + } + + /// Reserves capacity for at least `additional` more bytes to be inserted + /// in the given `IoBufferMut`. The collection may reserve more space to + /// speculatively avoid frequent reallocations. After calling `reserve`, + /// capacity will be greater than or equal to `self.len() + additional`. + /// Does nothing if capacity is already sufficient. + /// + /// # Panics + /// + /// Panics if the new capacity exceeds `isize::MAX` _bytes_. + pub fn reserve(&mut self, additional: usize) { + if additional > self.capacity() - self.len() { + self.reserve_inner(additional); + } + } + + fn reserve_inner(&mut self, additional: usize) { + let Some(required_cap) = self.len().checked_add(additional) else { + capacity_overflow() + }; + + let old_capacity = self.capacity(); + let align = self.align(); + // This guarantees exponential growth. The doubling cannot overflow + // because `cap <= isize::MAX` and the type of `cap` is `usize`. + let cap = cmp::max(old_capacity * 2, required_cap); + + if !is_valid_alloc(cap) { + capacity_overflow() + } + let new_layout = Layout::from_size_align(cap, self.align()).expect("Invalid layout"); + + let old_ptr = self.as_mut_ptr(); + + // SAFETY: old allocation was allocated with std::alloc::alloc with the same layout, + // and we panics on null pointer. + let (ptr, cap) = unsafe { + let old_layout = Layout::from_size_align_unchecked(old_capacity, align); + let ptr = alloc::realloc(old_ptr, old_layout, new_layout.size()); + if ptr.is_null() { + alloc::handle_alloc_error(new_layout); + } + (IoBufferPtr(ptr), cap) + }; + + self.ptr = ptr; + self.capacity = cap; + } + + /// Consumes and leaks the `IoBufferMut`, returning a mutable reference to the contents, &'a mut [u8]. + pub fn leak<'a>(self) -> &'a mut [u8] { + let mut buf = ManuallyDrop::new(self); + // SAFETY: leaking the buffer as intended. 
+ unsafe { slice::from_raw_parts_mut(buf.as_mut_ptr(), buf.len) } + } +} + +fn capacity_overflow() -> ! { + panic!("capacity overflow") +} + +// We need to guarantee the following: +// * We don't ever allocate `> isize::MAX` byte-size objects. +// * We don't overflow `usize::MAX` and actually allocate too little. +// +// On 64-bit we just need to check for overflow since trying to allocate +// `> isize::MAX` bytes will surely fail. On 32-bit and 16-bit we need to add +// an extra guard for this in case we're running on a platform which can use +// all 4GB in user-space, e.g., PAE or x32. +#[inline] +fn is_valid_alloc(alloc_size: usize) -> bool { + !(usize::BITS < 64 && alloc_size > isize::MAX as usize) +} + +impl Drop for AlignedBufferMut { + fn drop(&mut self) { + // SAFETY: memory was allocated with std::alloc::alloc with the same layout. + unsafe { + alloc::dealloc( + self.as_mut_ptr(), + Layout::from_size_align_unchecked(self.capacity, ALIGN), + ) + } + } +} + +impl Deref for AlignedBufferMut { + type Target = [u8]; + + fn deref(&self) -> &Self::Target { + self.as_slice() + } +} + +impl DerefMut for AlignedBufferMut { + fn deref_mut(&mut self) -> &mut Self::Target { + self.as_mut_slice() + } +} + +/// SAFETY: When advancing the internal cursor, the caller needs to make sure the bytes advcanced past have been initialized. +unsafe impl bytes::BufMut for AlignedBufferMut { + #[inline] + fn remaining_mut(&self) -> usize { + // Although a `Vec` can have at most isize::MAX bytes, we never want to grow `IoBufferMut`. + // Thus, it can have at most `self.capacity` bytes. + self.capacity() - self.len() + } + + // SAFETY: Caller needs to make sure the bytes being advanced past have been initialized. + #[inline] + unsafe fn advance_mut(&mut self, cnt: usize) { + let len = self.len(); + let remaining = self.remaining_mut(); + + if remaining < cnt { + panic_advance(cnt, remaining); + } + + // Addition will not overflow since the sum is at most the capacity. 
+ self.set_len(len + cnt); + } + + #[inline] + fn chunk_mut(&mut self) -> &mut bytes::buf::UninitSlice { + let cap = self.capacity(); + let len = self.len(); + + // SAFETY: Since `self.ptr` is valid for `cap` bytes, `self.ptr.add(len)` must be + // valid for `cap - len` bytes. The subtraction will not underflow since + // `len <= cap`. + unsafe { UninitSlice::from_raw_parts_mut(self.as_mut_ptr().add(len), cap - len) } + } +} + +/// Panic with a nice error message. +#[cold] +fn panic_advance(idx: usize, len: usize) -> ! { + panic!( + "advance out of bounds: the len is {} but advancing by {}", + len, idx + ); +} + +/// Safety: [`IoBufferMut`] has exclusive ownership of the io buffer, +/// and the location remains stable even if [`Self`] is moved. +unsafe impl tokio_epoll_uring::IoBuf for AlignedBufferMut { + fn stable_ptr(&self) -> *const u8 { + self.as_ptr() + } + + fn bytes_init(&self) -> usize { + self.len() + } + + fn bytes_total(&self) -> usize { + self.capacity() + } +} + +// SAFETY: See above. 
+unsafe impl tokio_epoll_uring::IoBufMut for AlignedBufferMut { + fn stable_mut_ptr(&mut self) -> *mut u8 { + self.as_mut_ptr() + } + + unsafe fn set_init(&mut self, init_len: usize) { + if self.len() < init_len { + self.set_len(init_len); + } + } +} + +#[cfg(test)] +mod tests { + + use super::*; + + const ALIGN: usize = 4 * 1024; + type TestIoBufferMut = AlignedBufferMut; + + #[test] + fn test_with_capacity() { + let v = TestIoBufferMut::with_capacity(ALIGN * 4); + assert_eq!(v.len(), 0); + assert_eq!(v.capacity(), ALIGN * 4); + assert_eq!(v.align(), ALIGN); + assert_eq!(v.as_ptr().align_offset(ALIGN), 0); + + let v = TestIoBufferMut::with_capacity(ALIGN / 2); + assert_eq!(v.len(), 0); + assert_eq!(v.capacity(), ALIGN / 2); + assert_eq!(v.align(), ALIGN); + assert_eq!(v.as_ptr().align_offset(ALIGN), 0); + } + + #[test] + fn test_with_capacity_zeroed() { + let v = TestIoBufferMut::with_capacity_zeroed(ALIGN); + assert_eq!(v.len(), ALIGN); + assert_eq!(v.capacity(), ALIGN); + assert_eq!(v.align(), ALIGN); + assert_eq!(v.as_ptr().align_offset(ALIGN), 0); + assert_eq!(&v[..], &[0; ALIGN]) + } + + #[test] + fn test_reserve() { + use bytes::BufMut; + let mut v = TestIoBufferMut::with_capacity(ALIGN); + let capacity = v.capacity(); + v.reserve(capacity); + assert_eq!(v.capacity(), capacity); + let data = [b'a'; ALIGN]; + v.put(&data[..]); + v.reserve(capacity); + assert!(v.capacity() >= capacity * 2); + assert_eq!(&v[..], &data[..]); + let capacity = v.capacity(); + v.clear(); + v.reserve(capacity); + assert_eq!(capacity, v.capacity()); + } + + #[test] + fn test_bytes_put() { + use bytes::BufMut; + let mut v = TestIoBufferMut::with_capacity(ALIGN * 4); + let x = [b'a'; ALIGN]; + + for _ in 0..2 { + for _ in 0..4 { + v.put(&x[..]); + } + assert_eq!(v.len(), ALIGN * 4); + assert_eq!(v.capacity(), ALIGN * 4); + assert_eq!(v.align(), ALIGN); + assert_eq!(v.as_ptr().align_offset(ALIGN), 0); + v.clear() + } + assert_eq!(v.len(), 0); + assert_eq!(v.capacity(), ALIGN * 4); + 
assert_eq!(v.align(), ALIGN); + assert_eq!(v.as_ptr().align_offset(ALIGN), 0); + } + + #[test] + #[should_panic] + fn test_bytes_put_panic() { + use bytes::BufMut; + const ALIGN: usize = 4 * 1024; + let mut v = TestIoBufferMut::with_capacity(ALIGN * 4); + let x = [b'a'; ALIGN]; + for _ in 0..5 { + v.put_slice(&x[..]); + } + } + + #[test] + fn test_io_buf_put_slice() { + use tokio_epoll_uring::BoundedBufMut; + const ALIGN: usize = 4 * 1024; + let mut v = TestIoBufferMut::with_capacity(ALIGN); + let x = [b'a'; ALIGN]; + + for _ in 0..2 { + v.put_slice(&x[..]); + assert_eq!(v.len(), ALIGN); + assert_eq!(v.capacity(), ALIGN); + assert_eq!(v.align(), ALIGN); + assert_eq!(v.as_ptr().align_offset(ALIGN), 0); + v.clear() + } + assert_eq!(v.len(), 0); + assert_eq!(v.capacity(), ALIGN); + assert_eq!(v.align(), ALIGN); + assert_eq!(v.as_ptr().align_offset(ALIGN), 0); + } +} diff --git a/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs b/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs new file mode 100644 index 000000000000..d79287fa69e7 --- /dev/null +++ b/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs @@ -0,0 +1,9 @@ +#![allow(unused)] + +use tokio_epoll_uring::IoBufMut; + +use crate::virtual_file::IoBufferMut; + +pub(crate) trait IoBufAlignedMut: IoBufMut {} + +impl IoBufAlignedMut for IoBufferMut {} diff --git a/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs b/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs index 7c773b6b2103..cd1917b580fa 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs @@ -1,5 +1,6 @@ //! See [`FullSlice`]. +use crate::virtual_file::IoBufferMut; use bytes::{Bytes, BytesMut}; use std::ops::{Deref, Range}; use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice}; @@ -76,3 +77,4 @@ macro_rules! 
impl_io_buf_ext { impl_io_buf_ext!(Bytes); impl_io_buf_ext!(BytesMut); impl_io_buf_ext!(Vec); +impl_io_buf_ext!(IoBufferMut); From f28dd95022f08fa8dbe15af58e29ca63e372bd8c Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Mon, 7 Oct 2024 16:05:43 -0400 Subject: [PATCH 10/24] use aligned buffer for image and delta layers Signed-off-by: Yuchen Liang --- .../src/tenant/storage_layer/delta_layer.rs | 15 +++++++-------- .../src/tenant/storage_layer/image_layer.rs | 19 +++++++++---------- pageserver/src/tenant/vectored_blob_io.rs | 9 +++++---- 3 files changed, 21 insertions(+), 22 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 8be7d7876f82..42436699a7ba 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -44,11 +44,11 @@ use crate::tenant::vectored_blob_io::{ }; use crate::tenant::PageReconstructError; use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt}; +use crate::virtual_file::IoBufferMut; use crate::virtual_file::{self, MaybeFatalIo, VirtualFile}; use crate::{walrecord, TEMP_FILE_SUFFIX}; use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; use anyhow::{anyhow, bail, ensure, Context, Result}; -use bytes::BytesMut; use camino::{Utf8Path, Utf8PathBuf}; use futures::StreamExt; use itertools::Itertools; @@ -1003,7 +1003,7 @@ impl DeltaLayerInner { .0 .into(); let buf_size = Self::get_min_read_buffer_size(&reads, max_vectored_read_bytes); - let mut buf = Some(BytesMut::with_capacity(buf_size)); + let mut buf = Some(IoBufferMut::with_capacity(buf_size)); // Note that reads are processed in reverse order (from highest key+lsn). // This is the order that `ReconstructState` requires such that it can @@ -1030,7 +1030,7 @@ impl DeltaLayerInner { // We have "lost" the buffer since the lower level IO api // doesn't return the buffer on error. Allocate a new one. 
- buf = Some(BytesMut::with_capacity(buf_size)); + buf = Some(IoBufferMut::with_capacity(buf_size)); continue; } @@ -1204,7 +1204,7 @@ impl DeltaLayerInner { .map(|x| x.0.get()) .unwrap_or(8192); - let mut buffer = Some(BytesMut::with_capacity(max_read_size)); + let mut buffer = Some(IoBufferMut::with_capacity(max_read_size)); // FIXME: buffering of DeltaLayerWriter let mut per_blob_copy = Vec::new(); @@ -1562,12 +1562,11 @@ impl<'a> DeltaLayerIterator<'a> { let vectored_blob_reader = VectoredBlobReader::new(&self.delta_layer.file); let mut next_batch = std::collections::VecDeque::new(); let buf_size = plan.size(); - let buf = BytesMut::with_capacity(buf_size); + let buf = IoBufferMut::with_capacity(buf_size); let blobs_buf = vectored_blob_reader .read_blobs(&plan, buf, self.ctx) .await?; - let frozen_buf = blobs_buf.buf.freeze(); - let view = BufView::new_bytes(frozen_buf); + let view = BufView::new_slice(&blobs_buf.buf); for meta in blobs_buf.blobs.iter() { let blob_read = meta.read(&view).await?; let value = Value::des(&blob_read)?; @@ -1942,7 +1941,7 @@ pub(crate) mod test { &vectored_reads, constants::MAX_VECTORED_READ_BYTES, ); - let mut buf = Some(BytesMut::with_capacity(buf_size)); + let mut buf = Some(IoBufferMut::with_capacity(buf_size)); for read in vectored_reads { let blobs_buf = vectored_blob_reader diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index de8155f455d1..5773ae467fcd 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -41,10 +41,11 @@ use crate::tenant::vectored_blob_io::{ }; use crate::tenant::PageReconstructError; use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; +use crate::virtual_file::IoBufferMut; use crate::virtual_file::{self, MaybeFatalIo, VirtualFile}; use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX}; use anyhow::{anyhow, bail, ensure, Context, Result}; 
-use bytes::{Bytes, BytesMut}; +use bytes::Bytes; use camino::{Utf8Path, Utf8PathBuf}; use hex; use itertools::Itertools; @@ -547,10 +548,10 @@ impl ImageLayerInner { for read in plan.into_iter() { let buf_size = read.size(); - let buf = BytesMut::with_capacity(buf_size); + let buf = IoBufferMut::with_capacity(buf_size); let blobs_buf = vectored_blob_reader.read_blobs(&read, buf, ctx).await?; - let frozen_buf = blobs_buf.buf.freeze(); - let view = BufView::new_bytes(frozen_buf); + + let view = BufView::new_slice(&blobs_buf.buf); for meta in blobs_buf.blobs.iter() { let img_buf = meta.read(&view).await?; @@ -609,13 +610,12 @@ impl ImageLayerInner { } } - let buf = BytesMut::with_capacity(buf_size); + let buf = IoBufferMut::with_capacity(buf_size); let res = vectored_blob_reader.read_blobs(&read, buf, ctx).await; match res { Ok(blobs_buf) => { - let frozen_buf = blobs_buf.buf.freeze(); - let view = BufView::new_bytes(frozen_buf); + let view = BufView::new_slice(&blobs_buf.buf); for meta in blobs_buf.blobs.iter() { let img_buf = meta.read(&view).await; @@ -1051,12 +1051,11 @@ impl<'a> ImageLayerIterator<'a> { let vectored_blob_reader = VectoredBlobReader::new(&self.image_layer.file); let mut next_batch = std::collections::VecDeque::new(); let buf_size = plan.size(); - let buf = BytesMut::with_capacity(buf_size); + let buf = IoBufferMut::with_capacity(buf_size); let blobs_buf = vectored_blob_reader .read_blobs(&plan, buf, self.ctx) .await?; - let frozen_buf = blobs_buf.buf.freeze(); - let view = BufView::new_bytes(frozen_buf); + let view = BufView::new_slice(&blobs_buf.buf); for meta in blobs_buf.blobs.iter() { let img_buf = meta.read(&view).await?; next_batch.push_back(( diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index 792c769b4fc0..62570359867c 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -18,7 +18,7 @@ use std::collections::BTreeMap; use 
std::ops::Deref; -use bytes::{Bytes, BytesMut}; +use bytes::Bytes; use pageserver_api::key::Key; use tokio::io::AsyncWriteExt; use tokio_epoll_uring::BoundedBuf; @@ -27,6 +27,7 @@ use utils::vec_map::VecMap; use crate::context::RequestContext; use crate::tenant::blob_io::{BYTE_UNCOMPRESSED, BYTE_ZSTD, LEN_COMPRESSION_BIT_MASK}; +use crate::virtual_file::IoBufferMut; use crate::virtual_file::{self, VirtualFile}; /// Metadata bundled with the start and end offset of a blob. @@ -158,7 +159,7 @@ impl std::fmt::Display for VectoredBlob { /// Return type of [`VectoredBlobReader::read_blobs`] pub struct VectoredBlobsBuf { /// Buffer for all blobs in this read - pub buf: BytesMut, + pub buf: IoBufferMut, /// Offsets into the buffer and metadata for all blobs in this read pub blobs: Vec, } @@ -446,7 +447,7 @@ impl<'a> VectoredBlobReader<'a> { pub async fn read_blobs( &self, read: &VectoredRead, - buf: BytesMut, + buf: IoBufferMut, ctx: &RequestContext, ) -> Result { assert!(read.size() > 0); @@ -921,7 +922,7 @@ mod tests { // Multiply by two (compressed data might need more space), and add a few bytes for the header let reserved_bytes = blobs.iter().map(|bl| bl.len()).max().unwrap() * 2 + 16; - let mut buf = BytesMut::with_capacity(reserved_bytes); + let mut buf = IoBufferMut::with_capacity(reserved_bytes); let vectored_blob_reader = VectoredBlobReader::new(&file); let meta = BlobMeta { From e9d9663fcddf28dea09e1d103ed4bd6d54a3831e Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Mon, 7 Oct 2024 17:22:25 -0400 Subject: [PATCH 11/24] use aligned buffer for page cache Signed-off-by: Yuchen Liang --- pageserver/src/page_cache.rs | 16 ++++++----- pageserver/src/virtual_file.rs | 6 ++-- .../{dio.rs => aligned_buffer.rs} | 28 +++++++++++++++++++ .../owned_buffers_io/io_buf_aligned.rs | 4 ++- 4 files changed, 44 insertions(+), 10 deletions(-) rename pageserver/src/virtual_file/{dio.rs => aligned_buffer.rs} (94%) diff --git a/pageserver/src/page_cache.rs 
b/pageserver/src/page_cache.rs index f386c825b848..79c4519dc28f 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -82,6 +82,7 @@ use once_cell::sync::OnceCell; use crate::{ context::RequestContext, metrics::{page_cache_eviction_metrics, PageCacheSizeMetrics}, + virtual_file::{IoBufferMut, IoPageSlice}, }; static PAGE_CACHE: OnceCell = OnceCell::new(); @@ -144,7 +145,7 @@ struct SlotInner { key: Option, // for `coalesce_readers_permit` permit: std::sync::Mutex>, - buf: &'static mut [u8; PAGE_SZ], + buf: IoPageSlice<'static>, } impl Slot { @@ -234,13 +235,13 @@ impl std::ops::Deref for PageReadGuard<'_> { type Target = [u8; PAGE_SZ]; fn deref(&self) -> &Self::Target { - self.slot_guard.buf + self.slot_guard.buf.deref() } } impl AsRef<[u8; PAGE_SZ]> for PageReadGuard<'_> { fn as_ref(&self) -> &[u8; PAGE_SZ] { - self.slot_guard.buf + self.slot_guard.buf.as_ref() } } @@ -266,7 +267,7 @@ enum PageWriteGuardState<'i> { impl std::ops::DerefMut for PageWriteGuard<'_> { fn deref_mut(&mut self) -> &mut Self::Target { match &mut self.state { - PageWriteGuardState::Invalid { inner, _permit } => inner.buf, + PageWriteGuardState::Invalid { inner, _permit } => inner.buf.deref_mut(), PageWriteGuardState::Downgraded => unreachable!(), } } @@ -277,7 +278,7 @@ impl std::ops::Deref for PageWriteGuard<'_> { fn deref(&self) -> &Self::Target { match &self.state { - PageWriteGuardState::Invalid { inner, _permit } => inner.buf, + PageWriteGuardState::Invalid { inner, _permit } => inner.buf.deref(), PageWriteGuardState::Downgraded => unreachable!(), } } @@ -643,16 +644,17 @@ impl PageCache { // We could use Vec::leak here, but that potentially also leaks // uninitialized reserved capacity. With into_boxed_slice and Box::leak // this is avoided. 
- let page_buffer = Box::leak(vec![0u8; num_pages * PAGE_SZ].into_boxed_slice()); + let page_buffer = IoBufferMut::with_capacity_zeroed(num_pages * PAGE_SZ).leak(); let size_metrics = &crate::metrics::PAGE_CACHE_SIZE; size_metrics.max_bytes.set_page_sz(num_pages); size_metrics.current_bytes_immutable.set_page_sz(0); let slots = page_buffer + // Each chunk has `PAGE_SZ` (8192) bytes, greater than 512, still aligned. .chunks_exact_mut(PAGE_SZ) .map(|chunk| { - let buf: &mut [u8; PAGE_SZ] = chunk.try_into().unwrap(); + let buf = unsafe { IoPageSlice::new_unchecked(chunk.try_into().unwrap()) }; Slot { inner: tokio::sync::RwLock::new(SlotInner { diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 75a355154e46..59f2e5afa8b5 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -44,7 +44,7 @@ pub(crate) use api::IoMode; pub(crate) use io_engine::IoEngineKind; pub(crate) use metadata::Metadata; pub(crate) use open_options::*; -pub(crate) mod dio; +pub(crate) mod aligned_buffer; pub(crate) mod owned_buffers_io { //! Abstractions for IO with owned buffers. 
@@ -1359,7 +1359,9 @@ pub(crate) const fn get_io_buffer_alignment() -> usize { DEFAULT_IO_BUFFER_ALIGNMENT } -pub(crate) type IoBufferMut = dio::AlignedBufferMut<{ get_io_buffer_alignment() }>; +pub(crate) type IoBufferMut = aligned_buffer::AlignedBufferMut<{ get_io_buffer_alignment() }>; +pub(crate) type IoPageSlice<'a> = + aligned_buffer::AlignedSlice<'a, { get_io_buffer_alignment() }, PAGE_SZ>; static IO_MODE: AtomicU8 = AtomicU8::new(IoMode::preferred() as u8); diff --git a/pageserver/src/virtual_file/dio.rs b/pageserver/src/virtual_file/aligned_buffer.rs similarity index 94% rename from pageserver/src/virtual_file/dio.rs rename to pageserver/src/virtual_file/aligned_buffer.rs index 16497c8951f4..18e1d2937be9 100644 --- a/pageserver/src/virtual_file/dio.rs +++ b/pageserver/src/virtual_file/aligned_buffer.rs @@ -23,6 +23,34 @@ pub struct AlignedBufferMut { len: usize, } +pub struct AlignedSlice<'a, const ALIGN: usize, const N: usize>(&'a mut [u8; N]); + +impl<'a, const ALIGN: usize, const N: usize> AlignedSlice<'a, ALIGN, N> { + pub unsafe fn new_unchecked(buf: &'a mut [u8; N]) -> Self { + AlignedSlice(buf) + } +} + +impl<'a, const ALIGN: usize, const N: usize> Deref for AlignedSlice<'a, ALIGN, N> { + type Target = [u8; N]; + + fn deref(&self) -> &Self::Target { + self.0 + } +} + +impl<'a, const ALIGN: usize, const N: usize> DerefMut for AlignedSlice<'a, ALIGN, N> { + fn deref_mut(&mut self) -> &mut Self::Target { + self.0 + } +} + +impl<'a, const ALIGN: usize, const N: usize> AsRef<[u8; N]> for AlignedSlice<'a, ALIGN, N> { + fn as_ref(&self) -> &[u8; N] { + &self.0 + } +} + impl AlignedBufferMut { /// Constructs a new, empty `IoBufferMut` with at least the specified capacity and alignment. 
/// diff --git a/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs b/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs index d79287fa69e7..cc3ad18a2109 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs @@ -2,8 +2,10 @@ use tokio_epoll_uring::IoBufMut; -use crate::virtual_file::IoBufferMut; +use crate::virtual_file::{IoBufferMut, PageWriteGuardBuf}; pub(crate) trait IoBufAlignedMut: IoBufMut {} impl IoBufAlignedMut for IoBufferMut {} + +impl<'a> IoBufAlignedMut for PageWriteGuardBuf {} From 84e2242673b94a7ab40f5328fa75d1b194470fa1 Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Mon, 7 Oct 2024 19:58:28 -0400 Subject: [PATCH 12/24] use aligned buffer for inmemory layer Signed-off-by: Yuchen Liang --- pageserver/src/tenant/ephemeral_file.rs | 21 +++++++++++-------- .../tenant/storage_layer/inmemory_layer.rs | 6 +++--- pageserver/src/virtual_file/aligned_buffer.rs | 20 ++++++++++++++++++ 3 files changed, 35 insertions(+), 12 deletions(-) diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index a62a47f9a760..1e2e1aaf46f8 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -9,7 +9,7 @@ use crate::tenant::storage_layer::inmemory_layer::vectored_dio_read::File; use crate::virtual_file::owned_buffers_io::slice::SliceMutExt; use crate::virtual_file::owned_buffers_io::util::size_tracking_writer; use crate::virtual_file::owned_buffers_io::write::Buffer; -use crate::virtual_file::{self, owned_buffers_io, VirtualFile}; +use crate::virtual_file::{self, owned_buffers_io, IoBufferMut, VirtualFile}; use bytes::BytesMut; use camino::Utf8PathBuf; use num_traits::Num; @@ -107,15 +107,18 @@ impl EphemeralFile { self.page_cache_file_id } - pub(crate) async fn load_to_vec(&self, ctx: &RequestContext) -> Result, io::Error> { + pub(crate) async fn load_to_io_buf( + &self, + 
ctx: &RequestContext, + ) -> Result { let size = self.len().into_usize(); - let vec = Vec::with_capacity(size); - let (slice, nread) = self.read_exact_at_eof_ok(0, vec.slice_full(), ctx).await?; + let buf = IoBufferMut::with_capacity(size); + let (slice, nread) = self.read_exact_at_eof_ok(0, buf.slice_full(), ctx).await?; assert_eq!(nread, size); - let vec = slice.into_inner(); - assert_eq!(vec.len(), nread); - assert_eq!(vec.capacity(), size, "we shouldn't be reallocating"); - Ok(vec) + let buf = slice.into_inner(); + assert_eq!(buf.len(), nread); + assert_eq!(buf.capacity(), size, "we shouldn't be reallocating"); + Ok(buf) } /// Returns the offset at which the first byte of the input was written, for use @@ -385,7 +388,7 @@ mod tests { // assert the state is as this test expects it to be assert_eq!( - &file.load_to_vec(&ctx).await.unwrap(), + &file.load_to_io_buf(&ctx).await.unwrap(), &content[0..cap + cap / 2] ); let md = file diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index e487bee1f2d9..0bce26182947 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -809,9 +809,9 @@ impl InMemoryLayer { match l0_flush_global_state { l0_flush::Inner::Direct { .. } => { - let file_contents: Vec = inner.file.load_to_vec(ctx).await?; - - let file_contents = Bytes::from(file_contents); + let file_contents = inner.file.load_to_io_buf(ctx).await?; + // TODO(yuchen): see ways to avoid this copy. 
+ let file_contents = Bytes::copy_from_slice(&file_contents); for (key, vec_map) in inner.index.iter() { // Write all page versions diff --git a/pageserver/src/virtual_file/aligned_buffer.rs b/pageserver/src/virtual_file/aligned_buffer.rs index 18e1d2937be9..7cbec0cb1fba 100644 --- a/pageserver/src/virtual_file/aligned_buffer.rs +++ b/pageserver/src/virtual_file/aligned_buffer.rs @@ -11,12 +11,14 @@ use std::{ use bytes::buf::UninitSlice; +#[derive(Debug)] struct IoBufferPtr(*mut u8); // SAFETY: We gurantees no one besides `IoBufferPtr` itself has the raw pointer. unsafe impl Send for IoBufferPtr {} /// An aligned buffer type used for I/O. +#[derive(Debug)] pub struct AlignedBufferMut { ptr: IoBufferPtr, capacity: usize, @@ -252,6 +254,24 @@ impl DerefMut for AlignedBufferMut { } } +impl AsRef<[u8]> for AlignedBufferMut { + fn as_ref(&self) -> &[u8] { + self.as_slice() + } +} + +impl AsMut<[u8]> for AlignedBufferMut { + fn as_mut(&mut self) -> &mut [u8] { + self.as_mut_slice() + } +} + +impl PartialEq<[u8]> for AlignedBufferMut { + fn eq(&self, other: &[u8]) -> bool { + self.as_slice().eq(other) + } +} + /// SAFETY: When advancing the internal cursor, the caller needs to make sure the bytes advcanced past have been initialized. 
unsafe impl bytes::BufMut for AlignedBufferMut { #[inline] From 1c61b68c3ba37c67347582c2226fe386d405216d Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Tue, 8 Oct 2024 17:25:42 -0400 Subject: [PATCH 13/24] use aligned buffer marker trait Signed-off-by: Yuchen Liang --- pageserver/src/tenant/block_io.rs | 6 ++-- pageserver/src/tenant/ephemeral_file.rs | 7 +++-- .../inmemory_layer/vectored_dio_read.rs | 23 +++++++-------- pageserver/src/virtual_file.rs | 28 +++++++++++-------- pageserver/src/virtual_file/aligned_buffer.rs | 8 ++++++ .../owned_buffers_io/io_buf_aligned.rs | 2 +- 6 files changed, 45 insertions(+), 29 deletions(-) diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs index 3afa3a86b948..68a27fbc2065 100644 --- a/pageserver/src/tenant/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -5,6 +5,8 @@ use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner}; use crate::context::RequestContext; use crate::page_cache::{self, FileId, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ}; +#[cfg(test)] +use crate::virtual_file::IoBufferMut; use crate::virtual_file::VirtualFile; use bytes::Bytes; use std::ops::Deref; @@ -40,7 +42,7 @@ pub enum BlockLease<'a> { #[cfg(test)] Arc(std::sync::Arc<[u8; PAGE_SZ]>), #[cfg(test)] - Vec(Vec), + IoBufferMut(IoBufferMut), } impl From> for BlockLease<'static> { @@ -67,7 +69,7 @@ impl<'a> Deref for BlockLease<'a> { #[cfg(test)] BlockLease::Arc(v) => v.deref(), #[cfg(test)] - BlockLease::Vec(v) => { + BlockLease::IoBufferMut(v) => { TryFrom::try_from(&v[..]).expect("caller must ensure that v has PAGE_SZ") } } diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 1e2e1aaf46f8..de0abab4c0c7 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -6,6 +6,7 @@ use crate::config::PageServerConf; use crate::context::RequestContext; use crate::page_cache; use 
crate::tenant::storage_layer::inmemory_layer::vectored_dio_read::File; +use crate::virtual_file::owned_buffers_io::io_buf_aligned::IoBufAlignedMut; use crate::virtual_file::owned_buffers_io::slice::SliceMutExt; use crate::virtual_file::owned_buffers_io::util::size_tracking_writer; use crate::virtual_file::owned_buffers_io::write::Buffer; @@ -161,7 +162,7 @@ impl EphemeralFile { } impl super::storage_layer::inmemory_layer::vectored_dio_read::File for EphemeralFile { - async fn read_exact_at_eof_ok<'a, 'b, B: tokio_epoll_uring::IoBufMut + Send>( + async fn read_exact_at_eof_ok<'a, 'b, B: IoBufAlignedMut + Send>( &'b self, start: u64, dst: tokio_epoll_uring::Slice, @@ -348,7 +349,7 @@ mod tests { assert!(file.len() as usize == write_nbytes); for i in 0..write_nbytes { assert_eq!(value_offsets[i], i.into_u64()); - let buf = Vec::with_capacity(1); + let buf = IoBufferMut::with_capacity(1); let (buf_slice, nread) = file .read_exact_at_eof_ok(i.into_u64(), buf.slice_full(), &ctx) .await @@ -443,7 +444,7 @@ mod tests { let (buf, nread) = file .read_exact_at_eof_ok( start.into_u64(), - Vec::with_capacity(len).slice_full(), + IoBufferMut::with_capacity(len).slice_full(), ctx, ) .await diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs b/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs index 0683e15659dc..a4bb3a6bfc5d 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs @@ -9,6 +9,7 @@ use tokio_epoll_uring::{BoundedBuf, IoBufMut, Slice}; use crate::{ assert_u64_eq_usize::{U64IsUsize, UsizeIsU64}, context::RequestContext, + virtual_file::{owned_buffers_io::io_buf_aligned::IoBufAlignedMut, IoBufferMut}, }; /// The file interface we require. At runtime, this is a [`crate::tenant::ephemeral_file::EphemeralFile`]. 
@@ -24,7 +25,7 @@ pub trait File: Send { /// [`std::io::ErrorKind::UnexpectedEof`] error if the file is shorter than `start+dst.len()`. /// /// No guarantees are made about the remaining bytes in `dst` in case of a short read. - async fn read_exact_at_eof_ok<'a, 'b, B: IoBufMut + Send>( + async fn read_exact_at_eof_ok<'a, 'b, B: IoBufAlignedMut + Send>( &'b self, start: u64, dst: Slice, @@ -227,7 +228,7 @@ where // Execute physical reads and fill the logical read buffers // TODO: pipelined reads; prefetch; - let get_io_buffer = |nchunks| Vec::with_capacity(nchunks * DIO_CHUNK_SIZE); + let get_io_buffer = |nchunks| IoBufferMut::with_capacity(nchunks * DIO_CHUNK_SIZE); for PhysicalRead { start_chunk_no, nchunks, @@ -459,7 +460,7 @@ mod tests { let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); let file = InMemoryFile::new_random(10); let test_read = |pos, len| { - let buf = vec![0; len]; + let buf = IoBufferMut::with_capacity_zeroed(len); let fut = file.read_exact_at_eof_ok(pos, buf.slice_full(), &ctx); use futures::FutureExt; let (slice, nread) = fut @@ -470,9 +471,9 @@ mod tests { buf.truncate(nread); buf }; - assert_eq!(test_read(0, 1), &file.content[0..1]); - assert_eq!(test_read(1, 2), &file.content[1..3]); - assert_eq!(test_read(9, 2), &file.content[9..]); + assert_eq!(&test_read(0, 1), &file.content[0..1]); + assert_eq!(&test_read(1, 2), &file.content[1..3]); + assert_eq!(&test_read(9, 2), &file.content[9..]); assert!(test_read(10, 2).is_empty()); assert!(test_read(11, 2).is_empty()); } @@ -609,7 +610,7 @@ mod tests { } impl<'x> File for RecorderFile<'x> { - async fn read_exact_at_eof_ok<'a, 'b, B: IoBufMut + Send>( + async fn read_exact_at_eof_ok<'a, 'b, B: IoBufAlignedMut + Send>( &'b self, start: u64, dst: Slice, @@ -782,7 +783,7 @@ mod tests { 2048, 1024 => Err("foo".to_owned()), }; - let buf = Vec::with_capacity(512); + let buf = IoBufferMut::with_capacity(512); let (buf, nread) = mock_file .read_exact_at_eof_ok(0, 
buf.slice_full(), &ctx) .await @@ -790,7 +791,7 @@ mod tests { assert_eq!(nread, 512); assert_eq!(&buf.into_inner()[..nread], &[0; 512]); - let buf = Vec::with_capacity(512); + let buf = IoBufferMut::with_capacity(512); let (buf, nread) = mock_file .read_exact_at_eof_ok(512, buf.slice_full(), &ctx) .await @@ -798,7 +799,7 @@ mod tests { assert_eq!(nread, 512); assert_eq!(&buf.into_inner()[..nread], &[1; 512]); - let buf = Vec::with_capacity(512); + let buf = IoBufferMut::with_capacity(512); let (buf, nread) = mock_file .read_exact_at_eof_ok(1024, buf.slice_full(), &ctx) .await @@ -806,7 +807,7 @@ mod tests { assert_eq!(nread, 10); assert_eq!(&buf.into_inner()[..nread], &[2; 10]); - let buf = Vec::with_capacity(1024); + let buf = IoBufferMut::with_capacity(1024); let err = mock_file .read_exact_at_eof_ok(2048, buf.slice_full(), &ctx) .await diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 59f2e5afa8b5..11e3ae9c3f65 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -18,6 +18,7 @@ use crate::page_cache::{PageWriteGuard, PAGE_SZ}; use crate::tenant::TENANTS_SEGMENT_NAME; use camino::{Utf8Path, Utf8PathBuf}; use once_cell::sync::OnceCell; +use owned_buffers_io::io_buf_aligned::IoBufAlignedMut; use owned_buffers_io::io_buf_ext::FullSlice; use pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT; use pageserver_api::shard::TenantShardId; @@ -198,7 +199,7 @@ impl VirtualFile { ctx: &RequestContext, ) -> Result, Error> where - Buf: IoBufMut + Send, + Buf: IoBufAlignedMut + Send, { self.inner.read_exact_at(slice, offset, ctx).await } @@ -773,7 +774,7 @@ impl VirtualFileInner { ctx: &RequestContext, ) -> Result, Error> where - Buf: IoBufMut + Send, + Buf: IoBufAlignedMut + Send, { let assert_we_return_original_bounds = if cfg!(debug_assertions) { Some((slice.stable_ptr() as usize, slice.bytes_total())) @@ -1224,12 +1225,14 @@ impl VirtualFileInner { ctx: &RequestContext, ) -> Result, std::io::Error> 
{ use crate::page_cache::PAGE_SZ; - let slice = Vec::with_capacity(PAGE_SZ).slice_full(); + let slice = IoBufferMut::with_capacity(PAGE_SZ).slice_full(); assert_eq!(slice.bytes_total(), PAGE_SZ); let slice = self .read_exact_at(slice, blknum as u64 * (PAGE_SZ as u64), ctx) .await?; - Ok(crate::tenant::block_io::BlockLease::Vec(slice.into_inner())) + Ok(crate::tenant::block_io::BlockLease::IoBufferMut( + slice.into_inner(), + )) } async fn read_to_end(&mut self, buf: &mut Vec, ctx: &RequestContext) -> Result<(), Error> { @@ -1401,10 +1404,10 @@ mod tests { impl MaybeVirtualFile { async fn read_exact_at( &self, - mut slice: tokio_epoll_uring::Slice>, + mut slice: tokio_epoll_uring::Slice, offset: u64, ctx: &RequestContext, - ) -> Result>, Error> { + ) -> Result, Error> { match self { MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(slice, offset, ctx).await, MaybeVirtualFile::File(file) => { @@ -1472,12 +1475,13 @@ mod tests { len: usize, ctx: &RequestContext, ) -> Result { - let slice = Vec::with_capacity(len).slice_full(); + let slice = IoBufferMut::with_capacity(len).slice_full(); assert_eq!(slice.bytes_total(), len); let slice = self.read_exact_at(slice, pos, ctx).await?; - let vec = slice.into_inner(); - assert_eq!(vec.len(), len); - Ok(String::from_utf8(vec).unwrap()) + let buf = slice.into_inner(); + assert_eq!(buf.len(), len); + + Ok(String::from_utf8(buf.to_vec()).unwrap()) } } @@ -1701,7 +1705,7 @@ mod tests { let files = files.clone(); let ctx = ctx.detached_child(TaskKind::UnitTest, DownloadBehavior::Error); let hdl = rt.spawn(async move { - let mut buf = vec![0u8; SIZE]; + let mut buf = IoBufferMut::with_capacity_zeroed(SIZE); let mut rng = rand::rngs::OsRng; for _ in 1..1000 { let f = &files[rng.gen_range(0..files.len())]; @@ -1710,7 +1714,7 @@ mod tests { .await .unwrap() .into_inner(); - assert!(buf == SAMPLE); + assert!(&buf[..] 
== SAMPLE); } }); hdls.push(hdl); diff --git a/pageserver/src/virtual_file/aligned_buffer.rs b/pageserver/src/virtual_file/aligned_buffer.rs index 7cbec0cb1fba..6d84c2c36a73 100644 --- a/pageserver/src/virtual_file/aligned_buffer.rs +++ b/pageserver/src/virtual_file/aligned_buffer.rs @@ -203,6 +203,14 @@ impl AlignedBufferMut { self.capacity = cap; } + /// Shortens the buffer, keeping the first len bytes. + pub fn truncate(&mut self, len: usize) { + if len > self.len { + return; + } + self.len = len; + } + /// Consumes and leaks the `IoBufferMut`, returning a mutable reference to the contents, &'a mut [u8]. pub fn leak<'a>(self) -> &'a mut [u8] { let mut buf = ManuallyDrop::new(self); diff --git a/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs b/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs index cc3ad18a2109..7c17cbeb56f7 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs @@ -4,7 +4,7 @@ use tokio_epoll_uring::IoBufMut; use crate::virtual_file::{IoBufferMut, PageWriteGuardBuf}; -pub(crate) trait IoBufAlignedMut: IoBufMut {} +pub trait IoBufAlignedMut: IoBufMut {} impl IoBufAlignedMut for IoBufferMut {} From 62722e8559818ba4e31427c497b6505c771b1ec0 Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Wed, 9 Oct 2024 09:39:24 -0400 Subject: [PATCH 14/24] fix clippy Signed-off-by: Yuchen Liang --- pageserver/src/page_cache.rs | 2 +- pageserver/src/virtual_file.rs | 3 +-- pageserver/src/virtual_file/aligned_buffer.rs | 2 +- pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs | 2 +- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index 79c4519dc28f..45bf02362aa9 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -651,9 +651,9 @@ impl PageCache { size_metrics.current_bytes_immutable.set_page_sz(0); let slots = page_buffer - // Each 
chunk has `PAGE_SZ` (8192) bytes, greater than 512, still aligned. .chunks_exact_mut(PAGE_SZ) .map(|chunk| { + // SAFETY: Each chunk has `PAGE_SZ` (8192) bytes, greater than 512, still aligned. let buf = unsafe { IoPageSlice::new_unchecked(chunk.try_into().unwrap()) }; Slot { diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 3dd4ea4b96e3..9243b1905452 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -1366,7 +1366,6 @@ pub(crate) type IoBufferMut = aligned_buffer::AlignedBufferMut<{ get_io_buffer_a pub(crate) type IoPageSlice<'a> = aligned_buffer::AlignedSlice<'a, { get_io_buffer_alignment() }, PAGE_SZ>; - static IO_MODE: AtomicU8 = AtomicU8::new(IoMode::preferred() as u8); pub(crate) fn set_io_mode(mode: IoMode) { @@ -1715,7 +1714,7 @@ mod tests { .await .unwrap() .into_inner(); - assert!(&buf[..] == SAMPLE); + assert!(buf[..] == SAMPLE); } }); hdls.push(hdl); diff --git a/pageserver/src/virtual_file/aligned_buffer.rs b/pageserver/src/virtual_file/aligned_buffer.rs index 6d84c2c36a73..99cb4a4c8db9 100644 --- a/pageserver/src/virtual_file/aligned_buffer.rs +++ b/pageserver/src/virtual_file/aligned_buffer.rs @@ -49,7 +49,7 @@ impl<'a, const ALIGN: usize, const N: usize> DerefMut for AlignedSlice<'a, ALIGN impl<'a, const ALIGN: usize, const N: usize> AsRef<[u8; N]> for AlignedSlice<'a, ALIGN, N> { fn as_ref(&self) -> &[u8; N] { - &self.0 + self.0 } } diff --git a/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs b/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs index 7c17cbeb56f7..d26cbbf948dc 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs @@ -8,4 +8,4 @@ pub trait IoBufAlignedMut: IoBufMut {} impl IoBufAlignedMut for IoBufferMut {} -impl<'a> IoBufAlignedMut for PageWriteGuardBuf {} +impl IoBufAlignedMut for PageWriteGuardBuf {} From 8f9679ce9527f61c42d5548cf46fffd00066f201 Mon 
Sep 17 00:00:00 2001 From: Yuchen Liang Date: Wed, 9 Oct 2024 10:30:25 -0400 Subject: [PATCH 15/24] refactor aligned buffer Signed-off-by: Yuchen Liang --- pageserver/src/virtual_file.rs | 8 +- .../owned_buffers_io/aligned_buffer.rs | 9 + .../aligned_buffer/alignment.rs | 26 +++ .../owned_buffers_io/aligned_buffer/buffer.rs | 108 +++++++++ .../aligned_buffer/buffer_mut.rs} | 201 ++++------------- .../owned_buffers_io/aligned_buffer/raw.rs | 213 ++++++++++++++++++ .../owned_buffers_io/aligned_buffer/slice.rs | 40 ++++ 7 files changed, 443 insertions(+), 162 deletions(-) create mode 100644 pageserver/src/virtual_file/owned_buffers_io/aligned_buffer.rs create mode 100644 pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/alignment.rs create mode 100644 pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs rename pageserver/src/virtual_file/{aligned_buffer.rs => owned_buffers_io/aligned_buffer/buffer_mut.rs} (59%) create mode 100644 pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs create mode 100644 pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/slice.rs diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 9243b1905452..2d9f4218e740 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -18,6 +18,7 @@ use crate::page_cache::{PageWriteGuard, PAGE_SZ}; use crate::tenant::TENANTS_SEGMENT_NAME; use camino::{Utf8Path, Utf8PathBuf}; use once_cell::sync::OnceCell; +use owned_buffers_io::aligned_buffer::{AlignedBufferMut, AlignedSlice, ConstAlign}; use owned_buffers_io::io_buf_aligned::IoBufAlignedMut; use owned_buffers_io::io_buf_ext::FullSlice; use pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT; @@ -45,7 +46,6 @@ pub(crate) use api::IoMode; pub(crate) use io_engine::IoEngineKind; pub(crate) use metadata::Metadata; pub(crate) use open_options::*; -pub(crate) mod aligned_buffer; pub(crate) mod owned_buffers_io { //! 
Abstractions for IO with owned buffers. @@ -57,6 +57,7 @@ pub(crate) mod owned_buffers_io { //! but for the time being we're proving out the primitives in the neon.git repo //! for faster iteration. + pub(crate) mod aligned_buffer; pub(crate) mod io_buf_aligned; pub(crate) mod io_buf_ext; pub(crate) mod slice; @@ -1362,9 +1363,10 @@ pub(crate) const fn get_io_buffer_alignment() -> usize { DEFAULT_IO_BUFFER_ALIGNMENT } -pub(crate) type IoBufferMut = aligned_buffer::AlignedBufferMut<{ get_io_buffer_alignment() }>; +pub(crate) type IoBufferMut = AlignedBufferMut>; +// pub(crate) type IoBuffer = AlignedBuffer>; pub(crate) type IoPageSlice<'a> = - aligned_buffer::AlignedSlice<'a, { get_io_buffer_alignment() }, PAGE_SZ>; + AlignedSlice<'a, PAGE_SZ, ConstAlign<{ get_io_buffer_alignment() }>>; static IO_MODE: AtomicU8 = AtomicU8::new(IoMode::preferred() as u8); diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer.rs new file mode 100644 index 000000000000..8ffc29b93d33 --- /dev/null +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer.rs @@ -0,0 +1,9 @@ +pub mod alignment; +pub mod buffer; +pub mod buffer_mut; +pub mod raw; +pub mod slice; + +pub use alignment::*; +pub use buffer_mut::AlignedBufferMut; +pub use slice::AlignedSlice; diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/alignment.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/alignment.rs new file mode 100644 index 000000000000..933b78a13b70 --- /dev/null +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/alignment.rs @@ -0,0 +1,26 @@ +pub trait Alignment: std::marker::Unpin + 'static { + /// Returns the required alignments. + fn align(&self) -> usize; +} + +/// Alignment at compile time. +#[derive(Debug)] +pub struct ConstAlign; + +impl Alignment for ConstAlign { + fn align(&self) -> usize { + A + } +} + +/// Alignment at run time. 
+#[derive(Debug)] +pub struct RuntimeAlign { + align: usize, +} + +impl Alignment for RuntimeAlign { + fn align(&self) -> usize { + self.align + } +} diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs new file mode 100644 index 000000000000..4669a1b353df --- /dev/null +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs @@ -0,0 +1,108 @@ +use std::{ + ops::{Deref, Range}, + sync::Arc, +}; + +use super::{alignment::Alignment, raw::RawAlignedBuffer}; + +pub struct AlignedBuffer { + /// Shared raw buffer. + raw: Arc>, + /// Range that specifies the current slice. + range: Range, +} + +impl AlignedBuffer { + /// Creates an immutable `IoBuffer` from the raw buffer + pub(super) fn from_raw(raw: RawAlignedBuffer, range: Range) -> Self { + AlignedBuffer { + raw: Arc::new(raw), + range, + } + } + + /// Returns the number of bytes in the buffer, also referred to as its 'length'. + #[inline] + pub fn len(&self) -> usize { + self.range.len() + } + + /// Returns the alignment of the buffer. + #[inline] + pub fn align(&self) -> usize { + self.raw.align() + } + + #[inline] + fn as_ptr(&self) -> *const u8 { + unsafe { self.raw.as_ptr().add(self.range.start) } + } + + /// Extracts a slice containing the entire buffer. + /// + /// Equivalent to `&s[..]`. + #[inline] + fn as_slice(&self) -> &[u8] { + &self.raw.as_slice()[self.range.start..self.range.end] + } + + /// Returns a slice of self for the index range `[begin..end)`. 
+ pub fn slice(&self, begin: usize, end: usize) -> Self { + let len = self.len(); + + assert!( + begin <= end, + "range start must not be greater than end: {:?} <= {:?}", + begin, + end, + ); + assert!( + end <= len, + "range end out of bounds: {:?} <= {:?}", + end, + len, + ); + + let begin = self.range.start + begin; + let end = self.range.start + end; + + AlignedBuffer { + raw: Arc::clone(&self.raw), + range: begin..end, + } + } +} + +impl Deref for AlignedBuffer { + type Target = [u8]; + + fn deref(&self) -> &Self::Target { + self.as_slice() + } +} + +impl AsRef<[u8]> for AlignedBuffer { + fn as_ref(&self) -> &[u8] { + self.as_slice() + } +} + +impl PartialEq<[u8]> for AlignedBuffer { + fn eq(&self, other: &[u8]) -> bool { + self.as_slice().eq(other) + } +} + +unsafe impl tokio_epoll_uring::IoBuf for AlignedBuffer { + fn stable_ptr(&self) -> *const u8 { + self.as_ptr() + } + + fn bytes_init(&self) -> usize { + self.len() + } + + fn bytes_total(&self) -> usize { + self.len() + } +} diff --git a/pageserver/src/virtual_file/aligned_buffer.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs similarity index 59% rename from pageserver/src/virtual_file/aligned_buffer.rs rename to pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs index 99cb4a4c8db9..ab617ad32280 100644 --- a/pageserver/src/virtual_file/aligned_buffer.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs @@ -1,59 +1,17 @@ -#![allow(unused)] - -use core::slice; -use std::{ - alloc::{self, Layout}, - cmp, - mem::{ManuallyDrop, MaybeUninit}, - ops::{Deref, DerefMut}, - ptr::{addr_of_mut, NonNull}, -}; - -use bytes::buf::UninitSlice; - -#[derive(Debug)] -struct IoBufferPtr(*mut u8); +use std::ops::{Deref, DerefMut}; -// SAFETY: We gurantees no one besides `IoBufferPtr` itself has the raw pointer. 
-unsafe impl Send for IoBufferPtr {} +use super::{ + alignment::{Alignment, ConstAlign}, + buffer::AlignedBuffer, + raw::RawAlignedBuffer, +}; -/// An aligned buffer type used for I/O. #[derive(Debug)] -pub struct AlignedBufferMut { - ptr: IoBufferPtr, - capacity: usize, - len: usize, -} - -pub struct AlignedSlice<'a, const ALIGN: usize, const N: usize>(&'a mut [u8; N]); - -impl<'a, const ALIGN: usize, const N: usize> AlignedSlice<'a, ALIGN, N> { - pub unsafe fn new_unchecked(buf: &'a mut [u8; N]) -> Self { - AlignedSlice(buf) - } +pub struct AlignedBufferMut { + raw: RawAlignedBuffer, } -impl<'a, const ALIGN: usize, const N: usize> Deref for AlignedSlice<'a, ALIGN, N> { - type Target = [u8; N]; - - fn deref(&self) -> &Self::Target { - self.0 - } -} - -impl<'a, const ALIGN: usize, const N: usize> DerefMut for AlignedSlice<'a, ALIGN, N> { - fn deref_mut(&mut self) -> &mut Self::Target { - self.0 - } -} - -impl<'a, const ALIGN: usize, const N: usize> AsRef<[u8; N]> for AlignedSlice<'a, ALIGN, N> { - fn as_ref(&self) -> &[u8; N] { - self.0 - } -} - -impl AlignedBufferMut { +impl AlignedBufferMut> { /// Constructs a new, empty `IoBufferMut` with at least the specified capacity and alignment. /// /// The buffer will be able to hold at most `capacity` elements and will never resize. @@ -70,21 +28,8 @@ impl AlignedBufferMut { /// must not overflow isize (i.e., the rounded value must be /// less than or equal to `isize::MAX`). pub fn with_capacity(capacity: usize) -> Self { - let layout = Layout::from_size_align(capacity, ALIGN).expect("Invalid layout"); - - // SAFETY: Making an allocation with a sized and aligned layout. The memory is manually freed with the same layout. 
- let ptr = unsafe { - let ptr = alloc::alloc(layout); - if ptr.is_null() { - alloc::handle_alloc_error(layout); - } - IoBufferPtr(ptr) - }; - AlignedBufferMut { - ptr, - capacity, - len: 0, + raw: RawAlignedBuffer::with_capacity(capacity), } } @@ -93,43 +38,45 @@ impl AlignedBufferMut { use bytes::BufMut; let mut buf = Self::with_capacity(capacity); buf.put_bytes(0, capacity); - buf.len = capacity; + // `put_bytes` filled the entire buffer. + unsafe { buf.set_len(capacity) }; buf } +} +impl AlignedBufferMut { /// Returns the total number of bytes the buffer can hold. #[inline] pub fn capacity(&self) -> usize { - self.capacity + self.raw.capacity() } /// Returns the alignment of the buffer. #[inline] - pub const fn align(&self) -> usize { - ALIGN + pub fn align(&self) -> usize { + self.raw.align() } /// Returns the number of bytes in the buffer, also referred to as its 'length'. #[inline] pub fn len(&self) -> usize { - self.len + self.raw.len() } /// Force the length of the buffer to `new_len`. #[inline] unsafe fn set_len(&mut self, new_len: usize) { - debug_assert!(new_len <= self.capacity()); - self.len = new_len; + self.raw.set_len(new_len) } #[inline] fn as_ptr(&self) -> *const u8 { - self.ptr.0 + self.raw.as_ptr() } #[inline] fn as_mut_ptr(&mut self) -> *mut u8 { - self.ptr.0 + self.raw.as_mut_ptr() } /// Extracts a slice containing the entire buffer. @@ -137,22 +84,20 @@ impl AlignedBufferMut { /// Equivalent to `&s[..]`. #[inline] fn as_slice(&self) -> &[u8] { - // SAFETY: The pointer is valid and `len` bytes are initialized. - unsafe { slice::from_raw_parts(self.as_ptr(), self.len) } + self.raw.as_slice() } /// Extracts a mutable slice of the entire buffer. /// /// Equivalent to `&mut s[..]`. fn as_mut_slice(&mut self) -> &mut [u8] { - // SAFETY: The pointer is valid and `len` bytes are initialized. 
- unsafe { slice::from_raw_parts_mut(self.as_mut_ptr(), self.len) } + self.raw.as_mut_slice() } /// Drops the all the contents of the buffer, setting its length to `0`. #[inline] pub fn clear(&mut self) { - self.len = 0; + self.raw.clear() } /// Reserves capacity for at least `additional` more bytes to be inserted @@ -165,90 +110,26 @@ impl AlignedBufferMut { /// /// Panics if the new capacity exceeds `isize::MAX` _bytes_. pub fn reserve(&mut self, additional: usize) { - if additional > self.capacity() - self.len() { - self.reserve_inner(additional); - } - } - - fn reserve_inner(&mut self, additional: usize) { - let Some(required_cap) = self.len().checked_add(additional) else { - capacity_overflow() - }; - - let old_capacity = self.capacity(); - let align = self.align(); - // This guarantees exponential growth. The doubling cannot overflow - // because `cap <= isize::MAX` and the type of `cap` is `usize`. - let cap = cmp::max(old_capacity * 2, required_cap); - - if !is_valid_alloc(cap) { - capacity_overflow() - } - let new_layout = Layout::from_size_align(cap, self.align()).expect("Invalid layout"); - - let old_ptr = self.as_mut_ptr(); - - // SAFETY: old allocation was allocated with std::alloc::alloc with the same layout, - // and we panics on null pointer. - let (ptr, cap) = unsafe { - let old_layout = Layout::from_size_align_unchecked(old_capacity, align); - let ptr = alloc::realloc(old_ptr, old_layout, new_layout.size()); - if ptr.is_null() { - alloc::handle_alloc_error(new_layout); - } - (IoBufferPtr(ptr), cap) - }; - - self.ptr = ptr; - self.capacity = cap; + self.raw.reserve(additional); } /// Shortens the buffer, keeping the first len bytes. pub fn truncate(&mut self, len: usize) { - if len > self.len { - return; - } - self.len = len; + self.raw.truncate(len); } /// Consumes and leaks the `IoBufferMut`, returning a mutable reference to the contents, &'a mut [u8]. 
pub fn leak<'a>(self) -> &'a mut [u8] { - let mut buf = ManuallyDrop::new(self); - // SAFETY: leaking the buffer as intended. - unsafe { slice::from_raw_parts_mut(buf.as_mut_ptr(), buf.len) } + self.raw.leak() } -} - -fn capacity_overflow() -> ! { - panic!("capacity overflow") -} - -// We need to guarantee the following: -// * We don't ever allocate `> isize::MAX` byte-size objects. -// * We don't overflow `usize::MAX` and actually allocate too little. -// -// On 64-bit we just need to check for overflow since trying to allocate -// `> isize::MAX` bytes will surely fail. On 32-bit and 16-bit we need to add -// an extra guard for this in case we're running on a platform which can use -// all 4GB in user-space, e.g., PAE or x32. -#[inline] -fn is_valid_alloc(alloc_size: usize) -> bool { - !(usize::BITS < 64 && alloc_size > isize::MAX as usize) -} -impl Drop for AlignedBufferMut { - fn drop(&mut self) { - // SAFETY: memory was allocated with std::alloc::alloc with the same layout. - unsafe { - alloc::dealloc( - self.as_mut_ptr(), - Layout::from_size_align_unchecked(self.capacity, ALIGN), - ) - } + pub fn freeze(self) -> AlignedBuffer { + let len = self.len(); + AlignedBuffer::from_raw(self.raw, 0..len) } } -impl Deref for AlignedBufferMut { +impl Deref for AlignedBufferMut { type Target = [u8]; fn deref(&self) -> &Self::Target { @@ -256,32 +137,32 @@ impl Deref for AlignedBufferMut { } } -impl DerefMut for AlignedBufferMut { +impl DerefMut for AlignedBufferMut { fn deref_mut(&mut self) -> &mut Self::Target { self.as_mut_slice() } } -impl AsRef<[u8]> for AlignedBufferMut { +impl AsRef<[u8]> for AlignedBufferMut { fn as_ref(&self) -> &[u8] { self.as_slice() } } -impl AsMut<[u8]> for AlignedBufferMut { +impl AsMut<[u8]> for AlignedBufferMut { fn as_mut(&mut self) -> &mut [u8] { self.as_mut_slice() } } -impl PartialEq<[u8]> for AlignedBufferMut { +impl PartialEq<[u8]> for AlignedBufferMut { fn eq(&self, other: &[u8]) -> bool { self.as_slice().eq(other) } } /// SAFETY: 
When advancing the internal cursor, the caller needs to make sure the bytes advcanced past have been initialized. -unsafe impl bytes::BufMut for AlignedBufferMut { +unsafe impl bytes::BufMut for AlignedBufferMut { #[inline] fn remaining_mut(&self) -> usize { // Although a `Vec` can have at most isize::MAX bytes, we never want to grow `IoBufferMut`. @@ -311,7 +192,9 @@ unsafe impl bytes::BufMut for AlignedBufferMut { // SAFETY: Since `self.ptr` is valid for `cap` bytes, `self.ptr.add(len)` must be // valid for `cap - len` bytes. The subtraction will not underflow since // `len <= cap`. - unsafe { UninitSlice::from_raw_parts_mut(self.as_mut_ptr().add(len), cap - len) } + unsafe { + bytes::buf::UninitSlice::from_raw_parts_mut(self.as_mut_ptr().add(len), cap - len) + } } } @@ -326,7 +209,7 @@ fn panic_advance(idx: usize, len: usize) -> ! { /// Safety: [`IoBufferMut`] has exclusive ownership of the io buffer, /// and the location remains stable even if [`Self`] is moved. -unsafe impl tokio_epoll_uring::IoBuf for AlignedBufferMut { +unsafe impl tokio_epoll_uring::IoBuf for AlignedBufferMut { fn stable_ptr(&self) -> *const u8 { self.as_ptr() } @@ -341,7 +224,7 @@ unsafe impl tokio_epoll_uring::IoBuf for AlignedBufferMut tokio_epoll_uring::IoBufMut for AlignedBufferMut { +unsafe impl tokio_epoll_uring::IoBufMut for AlignedBufferMut { fn stable_mut_ptr(&mut self) -> *mut u8 { self.as_mut_ptr() } @@ -359,7 +242,7 @@ mod tests { use super::*; const ALIGN: usize = 4 * 1024; - type TestIoBufferMut = AlignedBufferMut; + type TestIoBufferMut = AlignedBufferMut>; #[test] fn test_with_capacity() { diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs new file mode 100644 index 000000000000..b979606424d2 --- /dev/null +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs @@ -0,0 +1,213 @@ +use core::slice; +use std::{ + alloc::{self, Layout}, + cmp, + mem::ManuallyDrop, 
+}; + +use super::alignment::{Alignment, ConstAlign}; + +#[derive(Debug)] +struct AlignedBufferPtr(*mut u8); + +// SAFETY: We gurantees no one besides `IoBufferPtr` itself has the raw pointer. +unsafe impl Send for AlignedBufferPtr {} + +/// An aligned buffer type used for I/O. +#[derive(Debug)] +pub struct RawAlignedBuffer { + ptr: AlignedBufferPtr, + capacity: usize, + len: usize, + align: A, +} + +impl RawAlignedBuffer> { + /// Constructs a new, empty `IoBufferMut` with at least the specified capacity and alignment. + /// + /// The buffer will be able to hold at most `capacity` elements and will never resize. + /// + /// + /// # Panics + /// + /// Panics if the new capacity exceeds `isize::MAX` _bytes_, or if the following alignment requirement is not met: + /// * `align` must not be zero, + /// + /// * `align` must be a power of two, + /// + /// * `capacity`, when rounded up to the nearest multiple of `align`, + /// must not overflow isize (i.e., the rounded value must be + /// less than or equal to `isize::MAX`). + pub fn with_capacity(capacity: usize) -> Self { + let align = ConstAlign::; + let layout = Layout::from_size_align(capacity, align.align()).expect("Invalid layout"); + + // SAFETY: Making an allocation with a sized and aligned layout. The memory is manually freed with the same layout. + let ptr = unsafe { + let ptr = alloc::alloc(layout); + if ptr.is_null() { + alloc::handle_alloc_error(layout); + } + AlignedBufferPtr(ptr) + }; + + RawAlignedBuffer { + ptr, + capacity, + len: 0, + align, + } + } +} + +impl RawAlignedBuffer { + /// Returns the total number of bytes the buffer can hold. + #[inline] + pub fn capacity(&self) -> usize { + self.capacity + } + + /// Returns the alignment of the buffer. + #[inline] + pub fn align(&self) -> usize { + self.align.align() + } + + /// Returns the number of bytes in the buffer, also referred to as its 'length'. 
+    #[inline]
+    pub fn len(&self) -> usize {
+        self.len
+    }
+
+    /// Force the length of the buffer to `new_len`.
+    #[inline]
+    pub unsafe fn set_len(&mut self, new_len: usize) {
+        debug_assert!(new_len <= self.capacity());
+        self.len = new_len;
+    }
+
+    #[inline]
+    pub fn as_ptr(&self) -> *const u8 {
+        self.ptr.0
+    }
+
+    #[inline]
+    pub fn as_mut_ptr(&mut self) -> *mut u8 {
+        self.ptr.0
+    }
+
+    /// Extracts a slice containing the entire buffer.
+    ///
+    /// Equivalent to `&s[..]`.
+    #[inline]
+    pub fn as_slice(&self) -> &[u8] {
+        // SAFETY: The pointer is valid and `len` bytes are initialized.
+        unsafe { slice::from_raw_parts(self.as_ptr(), self.len) }
+    }
+
+    /// Extracts a mutable slice of the entire buffer.
+    ///
+    /// Equivalent to `&mut s[..]`.
+    pub fn as_mut_slice(&mut self) -> &mut [u8] {
+        // SAFETY: The pointer is valid and `len` bytes are initialized.
+        unsafe { slice::from_raw_parts_mut(self.as_mut_ptr(), self.len) }
+    }
+
+    /// Drops all the contents of the buffer, setting its length to `0`.
+    #[inline]
+    pub fn clear(&mut self) {
+        self.len = 0;
+    }
+
+    /// Reserves capacity for at least `additional` more bytes to be inserted
+    /// in the given `IoBufferMut`. The collection may reserve more space to
+    /// speculatively avoid frequent reallocations. After calling `reserve`,
+    /// capacity will be greater than or equal to `self.len() + additional`.
+    /// Does nothing if capacity is already sufficient.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the new capacity exceeds `isize::MAX` _bytes_.
+    pub fn reserve(&mut self, additional: usize) {
+        if additional > self.capacity() - self.len() {
+            self.reserve_inner(additional);
+        }
+    }
+
+    fn reserve_inner(&mut self, additional: usize) {
+        let Some(required_cap) = self.len().checked_add(additional) else {
+            capacity_overflow()
+        };
+
+        let old_capacity = self.capacity();
+        let align = self.align();
+        // This guarantees exponential growth. The doubling cannot overflow
+        // because `cap <= isize::MAX` and the type of `cap` is `usize`.
+        let cap = cmp::max(old_capacity * 2, required_cap);
+
+        if !is_valid_alloc(cap) {
+            capacity_overflow()
+        }
+        let new_layout = Layout::from_size_align(cap, self.align()).expect("Invalid layout");
+
+        let old_ptr = self.as_mut_ptr();
+
+        // SAFETY: old allocation was allocated with std::alloc::alloc with the same layout,
+        // and we panic on null pointer.
+        let (ptr, cap) = unsafe {
+            let old_layout = Layout::from_size_align_unchecked(old_capacity, align);
+            let ptr = alloc::realloc(old_ptr, old_layout, new_layout.size());
+            if ptr.is_null() {
+                alloc::handle_alloc_error(new_layout);
+            }
+            (AlignedBufferPtr(ptr), cap)
+        };
+
+        self.ptr = ptr;
+        self.capacity = cap;
+    }
+
+    /// Shortens the buffer, keeping the first len bytes.
+    pub fn truncate(&mut self, len: usize) {
+        if len > self.len {
+            return;
+        }
+        self.len = len;
+    }
+
+    /// Consumes and leaks the `IoBufferMut`, returning a mutable reference to the contents, &'a mut [u8].
+    pub fn leak<'a>(self) -> &'a mut [u8] {
+        let mut buf = ManuallyDrop::new(self);
+        // SAFETY: leaking the buffer as intended.
+        unsafe { slice::from_raw_parts_mut(buf.as_mut_ptr(), buf.len) }
+    }
+}
+
+fn capacity_overflow() -> ! {
+    panic!("capacity overflow")
+}
+
+// We need to guarantee the following:
+// * We don't ever allocate `> isize::MAX` byte-size objects.
+// * We don't overflow `usize::MAX` and actually allocate too little.
+//
+// On 64-bit we just need to check for overflow since trying to allocate
+// `> isize::MAX` bytes will surely fail. On 32-bit and 16-bit we need to add
+// an extra guard for this in case we're running on a platform which can use
+// all 4GB in user-space, e.g., PAE or x32.
+#[inline]
+fn is_valid_alloc(alloc_size: usize) -> bool {
+    !(usize::BITS < 64 && alloc_size > isize::MAX as usize)
+}
+
+impl Drop for RawAlignedBuffer {
+    fn drop(&mut self) {
+        // SAFETY: memory was allocated with std::alloc::alloc with the same layout.
+        unsafe {
+            alloc::dealloc(
+                self.as_mut_ptr(),
+                Layout::from_size_align_unchecked(self.capacity, self.align.align()),
+            )
+        }
+    }
+}
diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/slice.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/slice.rs
new file mode 100644
index 000000000000..ee90746f87fe
--- /dev/null
+++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/slice.rs
@@ -0,0 +1,40 @@
+use std::ops::{Deref, DerefMut};
+
+use super::alignment::{Alignment, ConstAlign};
+
+/// Newtype for an aligned slice.
+pub struct AlignedSlice<'a, const N: usize, A: Alignment> {
+    /// underlying byte slice
+    buf: &'a mut [u8; N],
+    /// alignment marker
+    _align: A,
+}
+
+impl<'a, const N: usize, const A: usize> AlignedSlice<'a, N, ConstAlign> {
+    /// Create a new aligned slice from a mutable byte slice. The input must already satisfy the alignment.
+ pub unsafe fn new_unchecked(buf: &'a mut [u8; N]) -> Self { + let _align = ConstAlign::; + assert_eq!(buf.as_ptr().align_offset(_align.align()), 0); + AlignedSlice { buf, _align } + } +} + +impl<'a, const N: usize, A: Alignment> Deref for AlignedSlice<'a, N, A> { + type Target = [u8; N]; + + fn deref(&self) -> &Self::Target { + self.buf + } +} + +impl<'a, const N: usize, A: Alignment> DerefMut for AlignedSlice<'a, N, A> { + fn deref_mut(&mut self) -> &mut Self::Target { + self.buf + } +} + +impl<'a, const N: usize, A: Alignment> AsRef<[u8; N]> for AlignedSlice<'a, N, A> { + fn as_ref(&self) -> &[u8; N] { + &self.buf + } +} From 84b0902e6a6834434418146937d1c842cc346640 Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Wed, 9 Oct 2024 12:32:52 -0400 Subject: [PATCH 16/24] fix clippy Signed-off-by: Yuchen Liang --- .../src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs | 2 ++ .../virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs | 2 +- .../src/virtual_file/owned_buffers_io/aligned_buffer/slice.rs | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs index 4669a1b353df..20eceebe8738 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs @@ -35,6 +35,7 @@ impl AlignedBuffer { #[inline] fn as_ptr(&self) -> *const u8 { + // SAFETY: `self.range.start` is guaranteed to be within [0, self.len()). unsafe { self.raw.as_ptr().add(self.range.start) } } @@ -93,6 +94,7 @@ impl PartialEq<[u8]> for AlignedBuffer { } } +/// SAFETY: the underlying buffer references a stable memory region. 
unsafe impl tokio_epoll_uring::IoBuf for AlignedBuffer { fn stable_ptr(&self) -> *const u8 { self.as_ptr() diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs index ab617ad32280..283d816ed5fa 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs @@ -38,7 +38,7 @@ impl AlignedBufferMut> { use bytes::BufMut; let mut buf = Self::with_capacity(capacity); buf.put_bytes(0, capacity); - // `put_bytes` filled the entire buffer. + // SAFETY: `put_bytes` filled the entire buffer. unsafe { buf.set_len(capacity) }; buf } diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/slice.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/slice.rs index ee90746f87fe..6cecf34c1cd5 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/slice.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/slice.rs @@ -35,6 +35,6 @@ impl<'a, const N: usize, A: Alignment> DerefMut for AlignedSlice<'a, N, A> { impl<'a, const N: usize, A: Alignment> AsRef<[u8; N]> for AlignedSlice<'a, N, A> { fn as_ref(&self) -> &[u8; N] { - &self.buf + self.buf } } From e3771776802885449e3049fc8a30f34b637de976 Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Wed, 9 Oct 2024 12:39:19 -0400 Subject: [PATCH 17/24] use IoBuffer instead of Bytes for inmemory_layer put_bytes Signed-off-by: Yuchen Liang --- pageserver/src/tenant/storage_layer/inmemory_layer.rs | 5 ++--- pageserver/src/virtual_file.rs | 3 ++- .../src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs | 3 +++ pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs | 3 ++- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 
0bce26182947..2cb1304f4b59 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -14,7 +14,6 @@ use crate::tenant::PageReconstructError; use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; use crate::{l0_flush, page_cache}; use anyhow::{anyhow, Context, Result}; -use bytes::Bytes; use camino::Utf8PathBuf; use pageserver_api::key::CompactKey; use pageserver_api::keyspace::KeySpace; @@ -811,7 +810,7 @@ impl InMemoryLayer { l0_flush::Inner::Direct { .. } => { let file_contents = inner.file.load_to_io_buf(ctx).await?; // TODO(yuchen): see ways to avoid this copy. - let file_contents = Bytes::copy_from_slice(&file_contents); + let file_contents = file_contents.freeze(); for (key, vec_map) in inner.index.iter() { // Write all page versions @@ -825,7 +824,7 @@ impl InMemoryLayer { len, will_init, } = entry; - let buf = Bytes::slice(&file_contents, pos as usize..(pos + len) as usize); + let buf = file_contents.slice(pos as usize, (pos + len) as usize); let (_buf, res) = delta_layer_writer .put_value_bytes( Key::from_compact(*key), diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 2d9f4218e740..1a0f96d0f54b 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -18,6 +18,7 @@ use crate::page_cache::{PageWriteGuard, PAGE_SZ}; use crate::tenant::TENANTS_SEGMENT_NAME; use camino::{Utf8Path, Utf8PathBuf}; use once_cell::sync::OnceCell; +use owned_buffers_io::aligned_buffer::buffer::AlignedBuffer; use owned_buffers_io::aligned_buffer::{AlignedBufferMut, AlignedSlice, ConstAlign}; use owned_buffers_io::io_buf_aligned::IoBufAlignedMut; use owned_buffers_io::io_buf_ext::FullSlice; @@ -1364,7 +1365,7 @@ pub(crate) const fn get_io_buffer_alignment() -> usize { } pub(crate) type IoBufferMut = AlignedBufferMut>; -// pub(crate) type IoBuffer = AlignedBuffer>; +pub(crate) type IoBuffer = AlignedBuffer>; pub(crate) type IoPageSlice<'a> = 
AlignedSlice<'a, PAGE_SZ, ConstAlign<{ get_io_buffer_alignment() }>>;
diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs
index b979606424d2..d9ef8d72f529 100644
--- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs
+++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs
@@ -13,6 +13,9 @@ struct AlignedBufferPtr(*mut u8);
 // SAFETY: We guarantee no one besides `IoBufferPtr` itself has the raw pointer.
 unsafe impl Send for AlignedBufferPtr {}
 
+// SAFETY: We guarantee no one besides `IoBufferPtr` itself has the raw pointer.
+unsafe impl Sync for AlignedBufferPtr {}
+
 /// An aligned buffer type used for I/O.
 #[derive(Debug)]
 pub struct RawAlignedBuffer {
diff --git a/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs b/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs
index cd1917b580fa..c3940cf6cea2 100644
--- a/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs
+++ b/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs
@@ -1,6 +1,6 @@
 //! See [`FullSlice`].
-use crate::virtual_file::IoBufferMut; +use crate::virtual_file::{IoBuffer, IoBufferMut}; use bytes::{Bytes, BytesMut}; use std::ops::{Deref, Range}; use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice}; @@ -78,3 +78,4 @@ impl_io_buf_ext!(Bytes); impl_io_buf_ext!(BytesMut); impl_io_buf_ext!(Vec); impl_io_buf_ext!(IoBufferMut); +impl_io_buf_ext!(IoBuffer); From 929c8d42d39cede55e9f52f78452c461c18efd5b Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Wed, 9 Oct 2024 17:36:30 +0000 Subject: [PATCH 18/24] use io mode from config Signed-off-by: Yuchen Liang --- pageserver/benches/bench_ingest.rs | 6 +++++- pageserver/ctl/src/layer_map_analyzer.rs | 7 ++++++- pageserver/ctl/src/layers.rs | 13 +++++++++++-- pageserver/ctl/src/main.rs | 8 ++++++-- pageserver/src/bin/pageserver.rs | 6 +++++- pageserver/src/virtual_file.rs | 3 ++- 6 files changed, 35 insertions(+), 8 deletions(-) diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs index 821c8008a950..d98b23acced5 100644 --- a/pageserver/benches/bench_ingest.rs +++ b/pageserver/benches/bench_ingest.rs @@ -164,7 +164,11 @@ fn criterion_benchmark(c: &mut Criterion) { let conf: &'static PageServerConf = Box::leak(Box::new( pageserver::config::PageServerConf::dummy_conf(temp_dir.path().to_path_buf()), )); - virtual_file::init(16384, virtual_file::io_engine_for_bench()); + virtual_file::init( + 16384, + virtual_file::io_engine_for_bench(), + conf.virtual_file_io_mode, + ); page_cache::init(conf.page_cache_size); { diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs index 151b94cf62d3..7dd2a5d05cbf 100644 --- a/pageserver/ctl/src/layer_map_analyzer.rs +++ b/pageserver/ctl/src/layer_map_analyzer.rs @@ -7,6 +7,7 @@ use camino::{Utf8Path, Utf8PathBuf}; use pageserver::context::{DownloadBehavior, RequestContext}; use pageserver::task_mgr::TaskKind; use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; +use 
pageserver::virtual_file::api::IoMode; use std::cmp::Ordering; use std::collections::BinaryHeap; use std::ops::Range; @@ -152,7 +153,11 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> { let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); // Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree. - pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs); + pageserver::virtual_file::init( + 10, + virtual_file::api::IoEngineKind::StdFs, + IoMode::preferred(), + ); pageserver::page_cache::init(100); let mut total_delta_layers = 0usize; diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs index fd948bf2efed..c0b2b6ae890a 100644 --- a/pageserver/ctl/src/layers.rs +++ b/pageserver/ctl/src/layers.rs @@ -11,6 +11,7 @@ use pageserver::tenant::storage_layer::delta_layer::{BlobRef, Summary}; use pageserver::tenant::storage_layer::{delta_layer, image_layer}; use pageserver::tenant::storage_layer::{DeltaLayer, ImageLayer}; use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; +use pageserver::virtual_file::api::IoMode; use pageserver::{page_cache, virtual_file}; use pageserver::{ repository::{Key, KEY_SIZE}, @@ -59,7 +60,11 @@ pub(crate) enum LayerCmd { async fn read_delta_file(path: impl AsRef, ctx: &RequestContext) -> Result<()> { let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path"); - virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs); + virtual_file::init( + 10, + virtual_file::api::IoEngineKind::StdFs, + IoMode::preferred(), + ); page_cache::init(100); let file = VirtualFile::open(path, ctx).await?; let file_id = page_cache::next_file_id(); @@ -190,7 +195,11 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> { new_tenant_id, new_timeline_id, } => { - pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs); + pageserver::virtual_file::init( + 10, + 
virtual_file::api::IoEngineKind::StdFs, + IoMode::preferred(), + ); pageserver::page_cache::init(100); let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index c96664d346cb..f506caec5b06 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -24,7 +24,7 @@ use pageserver::{ page_cache, task_mgr::TaskKind, tenant::{dump_layerfile_from_path, metadata::TimelineMetadata}, - virtual_file, + virtual_file::{self, api::IoMode}, }; use pageserver_api::shard::TenantShardId; use postgres_ffi::ControlFileData; @@ -205,7 +205,11 @@ fn read_pg_control_file(control_file_path: &Utf8Path) -> anyhow::Result<()> { async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> { // Basic initialization of things that don't change after startup - virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs); + virtual_file::init( + 10, + virtual_file::api::IoEngineKind::StdFs, + IoMode::preferred(), + ); page_cache::init(100); let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); dump_layerfile_from_path(path, true, &ctx).await diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index f71a3d26531c..c6659345f94c 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -167,7 +167,11 @@ fn main() -> anyhow::Result<()> { let scenario = failpoint_support::init(); // Basic initialization of things that don't change after startup - virtual_file::init(conf.max_file_descriptors, conf.virtual_file_io_engine); + virtual_file::init( + conf.max_file_descriptors, + conf.virtual_file_io_engine, + conf.virtual_file_io_mode, + ); page_cache::init(conf.page_cache_size); start_pageserver(launch_ts, conf).context("Failed to start pageserver")?; diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 1a0f96d0f54b..be644fb4cd17 100644 --- a/pageserver/src/virtual_file.rs +++ 
b/pageserver/src/virtual_file.rs
@@ -1332,10 +1332,11 @@ impl OpenFiles {
 /// server startup.
 ///
 #[cfg(not(test))]
-pub fn init(num_slots: usize, engine: IoEngineKind) {
+pub fn init(num_slots: usize, engine: IoEngineKind, mode: IoMode) {
     if OPEN_FILES.set(OpenFiles::new(num_slots)).is_err() {
         panic!("virtual_file::init called twice");
     }
+    set_io_mode(mode);
     io_engine::init(engine);
     crate::metrics::virtual_file_descriptor_cache::SIZE_MAX.set(num_slots as u64);
 }

From 156237d55332a9a5567e99716e9d78e4e743a5d0 Mon Sep 17 00:00:00 2001
From: Yuchen Liang
Date: Fri, 11 Oct 2024 17:30:49 -0400
Subject: [PATCH 19/24] add more comments

Signed-off-by: Yuchen Liang
---
 .../src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs | 1 +
 .../virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs | 3 ++-
 .../src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs | 2 +-
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs
index 20eceebe8738..6b5535091033 100644
--- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs
+++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs
@@ -5,6 +5,7 @@ use std::{
 
 use super::{alignment::Alignment, raw::RawAlignedBuffer};
 
+/// A shared, immutable aligned buffer type.
 pub struct AlignedBuffer {
     /// Shared raw buffer.
     raw: Arc>,
diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs
index 283d816ed5fa..c878ab14f878 100644
--- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs
+++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs
@@ -6,6 +6,7 @@ use super::{
     raw::RawAlignedBuffer,
 };
 
+/// A mutable aligned buffer type.
#[derive(Debug)]
 pub struct AlignedBufferMut {
     raw: RawAlignedBuffer,
@@ -207,7 +208,7 @@ fn panic_advance(idx: usize, len: usize) -> ! {
     );
 }
 
-/// Safety: [`IoBufferMut`] has exclusive ownership of the io buffer,
+/// Safety: [`AlignedBufferMut`] has exclusive ownership of the io buffer,
 /// and the location remains stable even if [`Self`] is moved.
 unsafe impl tokio_epoll_uring::IoBuf for AlignedBufferMut {
     fn stable_ptr(&self) -> *const u8 {
         self.as_ptr()
diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs
index d9ef8d72f529..6c26dec0db22 100644
--- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs
+++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs
@@ -16,7 +16,7 @@ unsafe impl Send for AlignedBufferPtr {}
 // SAFETY: We guarantee no one besides `IoBufferPtr` itself has the raw pointer.
 unsafe impl Sync for AlignedBufferPtr {}
 
-/// An aligned buffer type used for I/O.
+/// An aligned buffer type.
#[derive(Debug)] pub struct RawAlignedBuffer { ptr: AlignedBufferPtr, From 4b4a3edb804d265347a5ead9acbed4f72e499ed4 Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Fri, 18 Oct 2024 17:53:11 +0000 Subject: [PATCH 20/24] review: remove allow(unused) Signed-off-by: Yuchen Liang --- pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs b/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs index d26cbbf948dc..dba695196ebb 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs @@ -1,5 +1,3 @@ -#![allow(unused)] - use tokio_epoll_uring::IoBufMut; use crate::virtual_file::{IoBufferMut, PageWriteGuardBuf}; From 3b88998f8eea8c44aabac8795542280ede74e243 Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Fri, 18 Oct 2024 17:53:49 +0000 Subject: [PATCH 21/24] review: clarify IoBuf safety comments Signed-off-by: Yuchen Liang --- .../owned_buffers_io/aligned_buffer/buffer_mut.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs index c878ab14f878..00dbf28bd9ed 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs @@ -209,7 +209,9 @@ fn panic_advance(idx: usize, len: usize) -> ! { } /// Safety: [`AlignedBufferMut`] has exclusive ownership of the io buffer, -/// and the location remains stable even if [`Self`] is moved. +/// and the underlying pointer remains stable while io-uring is owning the buffer. +/// The tokio-epoll-uring crate itself will not resize the buffer and will respect +/// [`IoBuf::bytes_total`]. 
unsafe impl tokio_epoll_uring::IoBuf for AlignedBufferMut { fn stable_ptr(&self) -> *const u8 { self.as_ptr() From d99a61bd75183bd3f3591cae3d65d5c55b8429ef Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Fri, 18 Oct 2024 17:54:59 +0000 Subject: [PATCH 22/24] review: remove outdated todo Signed-off-by: Yuchen Liang --- pageserver/src/tenant/storage_layer/inmemory_layer.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 2cb1304f4b59..927b46d8483c 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -809,7 +809,6 @@ impl InMemoryLayer { match l0_flush_global_state { l0_flush::Inner::Direct { .. } => { let file_contents = inner.file.load_to_io_buf(ctx).await?; - // TODO(yuchen): see ways to avoid this copy. let file_contents = file_contents.freeze(); for (key, vec_map) in inner.index.iter() { From ad44e11a6937659d7dc7bd1785570a0d0a894534 Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Fri, 18 Oct 2024 18:06:16 +0000 Subject: [PATCH 23/24] follow bytes::Bytes convention for AlignedBuffer Signed-off-by: Yuchen Liang --- .../src/tenant/storage_layer/inmemory_layer.rs | 2 +- .../owned_buffers_io/aligned_buffer/buffer.rs | 17 +++++++++++++++-- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 927b46d8483c..7573ddb5ccc0 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -823,7 +823,7 @@ impl InMemoryLayer { len, will_init, } = entry; - let buf = file_contents.slice(pos as usize, (pos + len) as usize); + let buf = file_contents.slice(pos as usize..(pos + len) as usize); let (_buf, res) = delta_layer_writer .put_value_bytes( Key::from_compact(*key), diff --git 
a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs index 6b5535091033..2fba6d699b28 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs @@ -1,5 +1,5 @@ use std::{ - ops::{Deref, Range}, + ops::{Deref, Range, RangeBounds}, sync::Arc, }; @@ -49,9 +49,22 @@ impl AlignedBuffer { } /// Returns a slice of self for the index range `[begin..end)`. - pub fn slice(&self, begin: usize, end: usize) -> Self { + pub fn slice(&self, range: impl RangeBounds) -> Self { + use core::ops::Bound; let len = self.len(); + let begin = match range.start_bound() { + Bound::Included(&n) => n, + Bound::Excluded(&n) => n.checked_add(1).expect("out of range"), + Bound::Unbounded => 0, + }; + + let end = match range.end_bound() { + Bound::Included(&n) => n.checked_add(1).expect("out of range"), + Bound::Excluded(&n) => n, + Bound::Unbounded => len, + }; + assert!( begin <= end, "range start must not be greater than end: {:?} <= {:?}", From a5a86bedb25e4f65376c73e6925a16e928931cf6 Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Fri, 18 Oct 2024 18:22:02 +0000 Subject: [PATCH 24/24] fix clippy Signed-off-by: Yuchen Liang --- .../virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs index 00dbf28bd9ed..b3675d1aeabb 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs @@ -211,7 +211,7 @@ fn panic_advance(idx: usize, len: usize) -> ! 
{ /// Safety: [`AlignedBufferMut`] has exclusive ownership of the io buffer, /// and the underlying pointer remains stable while io-uring is owning the buffer. /// The tokio-epoll-uring crate itself will not resize the buffer and will respect -/// [`IoBuf::bytes_total`]. +/// [`tokio_epoll_uring::IoBuf::bytes_total`]. unsafe impl tokio_epoll_uring::IoBuf for AlignedBufferMut { fn stable_ptr(&self) -> *const u8 { self.as_ptr()