From dd1c45e8961eead106a7b76e58de1777011bff1c Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Thu, 7 Nov 2024 20:44:21 +0000 Subject: [PATCH 01/35] eliminate size_tracking_writer Signed-off-by: Yuchen Liang --- pageserver/src/tenant/ephemeral_file.rs | 26 +++------- .../tenant/remote_timeline_client/download.rs | 9 ++-- pageserver/src/virtual_file.rs | 3 -- .../util/size_tracking_writer.rs | 50 ------------------- .../virtual_file/owned_buffers_io/write.rs | 26 +++++++--- 5 files changed, 30 insertions(+), 84 deletions(-) delete mode 100644 pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index de0abab4c0c7..f1ce768c397d 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -8,7 +8,6 @@ use crate::page_cache; use crate::tenant::storage_layer::inmemory_layer::vectored_dio_read::File; use crate::virtual_file::owned_buffers_io::io_buf_aligned::IoBufAlignedMut; use crate::virtual_file::owned_buffers_io::slice::SliceMutExt; -use crate::virtual_file::owned_buffers_io::util::size_tracking_writer; use crate::virtual_file::owned_buffers_io::write::Buffer; use crate::virtual_file::{self, owned_buffers_io, IoBufferMut, VirtualFile}; use bytes::BytesMut; @@ -27,10 +26,7 @@ pub struct EphemeralFile { _timeline_id: TimelineId, page_cache_file_id: page_cache::FileId, bytes_written: u64, - buffered_writer: owned_buffers_io::write::BufferedWriter< - BytesMut, - size_tracking_writer::Writer, - >, + buffered_writer: owned_buffers_io::write::BufferedWriter, /// Gate guard is held on as long as we need to do operations in the path (delete on drop) _gate_guard: utils::sync::gate::GateGuard, } @@ -73,7 +69,7 @@ impl EphemeralFile { page_cache_file_id, bytes_written: 0, buffered_writer: owned_buffers_io::write::BufferedWriter::new( - size_tracking_writer::Writer::new(file), + file, BytesMut::with_capacity(TAIL_SZ), ), _gate_guard: gate_guard, @@ -85,7 +81,7 @@ impl Drop for EphemeralFile { fn drop(&mut self) { // unlink the file // we are clear to do this, because we have entered a gate - let path = self.buffered_writer.as_inner().as_inner().path(); + let path = self.buffered_writer.as_inner().path(); let res = std::fs::remove_file(path); if let Err(e) = res { if e.kind() != std::io::ErrorKind::NotFound { @@ -168,8 +164,7 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral dst: tokio_epoll_uring::Slice, ctx: &'a RequestContext, ) -> std::io::Result<(tokio_epoll_uring::Slice, usize)> { - let file_size_tracking_writer = self.buffered_writer.as_inner(); - let flushed_offset = file_size_tracking_writer.bytes_written(); + let flushed_offset = self.buffered_writer.bytes_written(); let buffer = self.buffered_writer.inspect_buffer(); let buffered = &buffer[0..buffer.pending()]; @@ -201,7 +196,7 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral let buffered_range = Range(std::cmp::max(start, flushed_offset), end); let dst = if written_range.len() > 0 { - let file: &VirtualFile = file_size_tracking_writer.as_inner(); + let file: &VirtualFile = self.buffered_writer.as_inner(); let bounds = dst.bounds(); let slice = file .read_exact_at(dst.slice(0..written_range.len().into_usize()), start, ctx) @@ -359,8 +354,7 @@ mod tests { assert_eq!(&buf, &content[i..i + 1]); } - let file_contents = - std::fs::read(file.buffered_writer.as_inner().as_inner().path()).unwrap(); + let file_contents = std::fs::read(file.buffered_writer.as_inner().path()).unwrap(); assert_eq!(file_contents, &content[0..cap]); let buffer_contents = file.buffered_writer.inspect_buffer(); @@ -392,13 +386,7 @@ mod tests { &file.load_to_io_buf(&ctx).await.unwrap(), &content[0..cap + cap / 2] ); - let md = file - .buffered_writer - .as_inner() - .as_inner() - .path() - .metadata() - .unwrap(); + let md = file.buffered_writer.as_inner().path().metadata().unwrap(); assert_eq!( md.len(), cap.into_u64(), diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index efcd20d1bf5c..05275ef8521a 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -203,7 +203,7 @@ async fn download_object<'a>( } #[cfg(target_os = "linux")] crate::virtual_file::io_engine::IoEngine::TokioEpollUring => { - use crate::virtual_file::owned_buffers_io::{self, util::size_tracking_writer}; + use crate::virtual_file::owned_buffers_io; use bytes::BytesMut; async { let destination_file = VirtualFile::create(dst_path, ctx) @@ -220,9 +220,8 @@ async fn download_object<'a>( // TODO: use vectored write (writev) once supported by tokio-epoll-uring. // There's chunks_vectored() on the stream. let (bytes_amount, destination_file) = async { - let size_tracking = size_tracking_writer::Writer::new(destination_file); let mut buffered = owned_buffers_io::write::BufferedWriter::::new( - size_tracking, + destination_file, BytesMut::with_capacity(super::BUFFER_SIZE), ); while let Some(res) = @@ -234,8 +233,8 @@ async fn download_object<'a>( }; buffered.write_buffered(chunk.slice_len(), ctx).await?; } - let size_tracking = buffered.flush_and_into_inner(ctx).await?; - Ok(size_tracking.into_inner()) + let inner = buffered.flush_and_into_inner(ctx).await?; + Ok(inner) } .await?; diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index daa8b99ab0f1..f21b2bae1537 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -63,9 +63,6 @@ pub(crate) mod owned_buffers_io { pub(crate) mod io_buf_ext; pub(crate) mod slice; pub(crate) mod write; - pub(crate) mod util { - pub(crate) mod size_tracking_writer; - } } #[derive(Debug)] diff --git a/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs b/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs deleted file mode 100644 index efcb61ba6532..000000000000 --- a/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs +++ /dev/null @@ -1,50 +0,0 @@ -use crate::{ - context::RequestContext, - virtual_file::owned_buffers_io::{io_buf_ext::FullSlice, write::OwnedAsyncWriter}, -}; -use tokio_epoll_uring::IoBuf; - -pub struct Writer { - dst: W, - bytes_amount: u64, -} - -impl Writer { - pub fn new(dst: W) -> Self { - Self { - dst, - bytes_amount: 0, - } - } - - pub fn bytes_written(&self) -> u64 { - self.bytes_amount - } - - pub fn as_inner(&self) -> &W { - &self.dst - } - - /// Returns the wrapped `VirtualFile` object as well as the number - /// of bytes that were written to it through this object. - #[cfg_attr(target_os = "macos", allow(dead_code))] - pub fn into_inner(self) -> (u64, W) { - (self.bytes_amount, self.dst) - } -} - -impl OwnedAsyncWriter for Writer -where - W: OwnedAsyncWriter, -{ - #[inline(always)] - async fn write_all( - &mut self, - buf: FullSlice, - ctx: &RequestContext, - ) -> std::io::Result<(usize, FullSlice)> { - let (nwritten, buf) = self.dst.write_all(buf, ctx).await?; - self.bytes_amount += u64::try_from(nwritten).unwrap(); - Ok((nwritten, buf)) - } -} diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index 568cf62e5617..c4fb209dd98d 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -38,6 +38,7 @@ pub struct BufferedWriter { /// /// In these exceptional cases, it's `None`. buf: Option, + bytes_amount: u64, } impl BufferedWriter @@ -50,6 +51,7 @@ where Self { writer, buf: Some(buf), + bytes_amount: 0, } } @@ -57,18 +59,26 @@ where &self.writer } + pub fn bytes_written(&self) -> u64 { + self.bytes_amount + } + /// Panics if used after any of the write paths returned an error pub fn inspect_buffer(&self) -> &B { self.buf() } #[cfg_attr(target_os = "macos", allow(dead_code))] - pub async fn flush_and_into_inner(mut self, ctx: &RequestContext) -> std::io::Result { + pub async fn flush_and_into_inner(mut self, ctx: &RequestContext) -> std::io::Result<(u64, W)> { self.flush(ctx).await?; - let Self { buf, writer } = self; + let Self { + buf, + writer, + bytes_amount, + } = self; assert!(buf.is_some()); - Ok(writer) + Ok((bytes_amount, writer)) } #[inline(always)] @@ -103,6 +113,7 @@ where .writer .write_all(FullSlice::must_new(chunk), ctx) .await?; + self.bytes_amount += u64::try_from(nwritten).unwrap(); assert_eq!(nwritten, chunk_len); return Ok((nwritten, chunk)); } @@ -160,6 +171,7 @@ where } let slice = buf.flush(); let (nwritten, slice) = self.writer.write_all(slice, ctx).await?; + self.bytes_amount += u64::try_from(nwritten).unwrap(); assert_eq!(nwritten, buf_len); self.buf = Some(Buffer::reuse_after_flush( slice.into_raw_slice().into_inner(), @@ -274,7 +286,7 @@ mod tests { write!(writer, b"c"); write!(writer, b"d"); write!(writer, b"e"); - let recorder = writer.flush_and_into_inner(&test_ctx()).await?; + let (_, recorder) = writer.flush_and_into_inner(&test_ctx()).await?; assert_eq!( recorder.writes, vec![Vec::from(b"ab"), Vec::from(b"cd"), Vec::from(b"e")] @@ -290,7 +302,7 @@ mod tests { write!(writer, b"de"); write!(writer, b""); write!(writer, b"fghijk"); - let recorder = writer.flush_and_into_inner(&test_ctx()).await?; + let (_, recorder) = writer.flush_and_into_inner(&test_ctx()).await?; assert_eq!( recorder.writes, vec![Vec::from(b"abc"), Vec::from(b"de"), Vec::from(b"fghijk")] @@ -306,7 +318,7 @@ mod tests { write!(writer, b"bc"); write!(writer, b"d"); write!(writer, b"e"); - let recorder = writer.flush_and_into_inner(&test_ctx()).await?; + let (_, recorder) = writer.flush_and_into_inner(&test_ctx()).await?; assert_eq!( recorder.writes, vec![Vec::from(b"a"), Vec::from(b"bc"), Vec::from(b"de")] @@ -329,7 +341,7 @@ mod tests { writer.write_buffered_borrowed(b"j", ctx).await?; writer.write_buffered_borrowed(b"klmno", ctx).await?; - let recorder = writer.flush_and_into_inner(ctx).await?; + let (_, recorder) = writer.flush_and_into_inner(ctx).await?; assert_eq!( recorder.writes, { From 224cbb40254edfce4735e2fd5f5dc59c09d7cbc6 Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Fri, 8 Nov 2024 15:09:33 +0000 Subject: [PATCH 02/35] change OwnedAsyncWriter trait to use write_all_at Signed-off-by: Yuchen Liang --- pageserver/src/virtual_file.rs | 13 ++-- .../virtual_file/owned_buffers_io/write.rs | 75 +++++++++++-------- 2 files changed, 50 insertions(+), 38 deletions(-) diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index f21b2bae1537..40e0e97b0ca3 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -1295,14 +1295,14 @@ impl Drop for VirtualFileInner { } impl OwnedAsyncWriter for VirtualFile { - #[inline(always)] - async fn write_all( - &mut self, + async fn write_all_at( + &self, buf: FullSlice, + offset: u64, ctx: &RequestContext, - ) -> std::io::Result<(usize, FullSlice)> { - let (buf, res) = VirtualFile::write_all(self, buf, ctx).await; - res.map(move |v| (v, buf)) + ) -> std::io::Result> { + let (buf, res) = VirtualFile::write_all_at(self, buf, offset, ctx).await; + res.map(|_| buf) } } @@ -1560,6 +1560,7 @@ mod tests { &ctx, ) .await?; + file_a .write_all(b"foobar".to_vec().slice_len(), &ctx) .await?; diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index c4fb209dd98d..44f40bf846cc 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -8,11 +8,12 @@ use super::io_buf_ext::{FullSlice, IoBufExt}; /// A trait for doing owned-buffer write IO. /// Think [`tokio::io::AsyncWrite`] but with owned buffers. pub trait OwnedAsyncWriter { - async fn write_all( - &mut self, + async fn write_all_at( + &self, buf: FullSlice, + offset: u64, ctx: &RequestContext, - ) -> std::io::Result<(usize, FullSlice)>; + ) -> std::io::Result>; } /// A wrapper aorund an [`OwnedAsyncWriter`] that uses a [`Buffer`] to batch @@ -109,13 +110,12 @@ where .pending(), 0 ); - let (nwritten, chunk) = self + let chunk = self .writer - .write_all(FullSlice::must_new(chunk), ctx) + .write_all_at(FullSlice::must_new(chunk), self.bytes_amount, ctx) .await?; - self.bytes_amount += u64::try_from(nwritten).unwrap(); - assert_eq!(nwritten, chunk_len); - return Ok((nwritten, chunk)); + self.bytes_amount += u64::try_from(chunk_len).unwrap(); + return Ok((chunk_len, chunk)); } // in-memory copy the < BUFFER_SIZED tail of the chunk assert!(chunk.len() < self.buf().cap()); @@ -170,9 +170,11 @@ where return Ok(()); } let slice = buf.flush(); - let (nwritten, slice) = self.writer.write_all(slice, ctx).await?; - self.bytes_amount += u64::try_from(nwritten).unwrap(); - assert_eq!(nwritten, buf_len); + let slice = self + .writer + .write_all_at(slice, self.bytes_amount, ctx) + .await?; + self.bytes_amount += u64::try_from(buf_len).unwrap(); self.buf = Some(Buffer::reuse_after_flush( slice.into_raw_slice().into_inner(), )); @@ -231,19 +233,10 @@ impl Buffer for BytesMut { } } -impl OwnedAsyncWriter for Vec { - async fn write_all( - &mut self, - buf: FullSlice, - _: &RequestContext, - ) -> std::io::Result<(usize, FullSlice)> { - self.extend_from_slice(&buf[..]); - Ok((buf.len(), buf)) - } -} - #[cfg(test)] mod tests { + use std::sync::Mutex; + use bytes::BytesMut; use super::*; @@ -252,16 +245,34 @@ mod tests { #[derive(Default)] struct RecorderWriter { - writes: Vec>, + /// record bytes and write offsets. + writes: Mutex, u64)>>, } + + impl RecorderWriter { + /// Gets recorded bytes and write offsets. + fn get_writes(&self) -> Vec> { + self.writes + .lock() + .unwrap() + .iter() + .map(|(buf, _)| buf.clone()) + .collect() + } + } + impl OwnedAsyncWriter for RecorderWriter { - async fn write_all( - &mut self, + async fn write_all_at( + &self, buf: FullSlice, + offset: u64, _: &RequestContext, - ) -> std::io::Result<(usize, FullSlice)> { - self.writes.push(Vec::from(&buf[..])); - Ok((buf.len(), buf)) + ) -> std::io::Result> { + self.writes + .lock() + .unwrap() + .push((Vec::from(&buf[..]), offset)); + Ok(buf) } } @@ -288,7 +299,7 @@ mod tests { write!(writer, b"e"); let (_, recorder) = writer.flush_and_into_inner(&test_ctx()).await?; assert_eq!( - recorder.writes, + recorder.get_writes(), vec![Vec::from(b"ab"), Vec::from(b"cd"), Vec::from(b"e")] ); Ok(()) @@ -304,7 +315,7 @@ mod tests { write!(writer, b"fghijk"); let (_, recorder) = writer.flush_and_into_inner(&test_ctx()).await?; assert_eq!( - recorder.writes, + recorder.get_writes(), vec![Vec::from(b"abc"), Vec::from(b"de"), Vec::from(b"fghijk")] ); Ok(()) @@ -320,7 +331,7 @@ mod tests { write!(writer, b"e"); let (_, recorder) = writer.flush_and_into_inner(&test_ctx()).await?; assert_eq!( - recorder.writes, + recorder.get_writes(), vec![Vec::from(b"a"), Vec::from(b"bc"), Vec::from(b"de")] ); Ok(()) @@ -343,7 +354,7 @@ mod tests { let (_, recorder) = writer.flush_and_into_inner(ctx).await?; assert_eq!( - recorder.writes, + recorder.get_writes(), { let expect: &[&[u8]] = &[b"ab", b"cd", b"ef", b"gh", b"ij", b"kl", b"mn", b"o"]; expect From f0efc908d7793b8f52150831b7a72310c42fc38e Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Fri, 8 Nov 2024 15:19:36 +0000 Subject: [PATCH 03/35] use Arc around W: OwnedAsyncWriter Signed-off-by: Yuchen Liang --- pageserver/src/tenant/ephemeral_file.rs | 21 +++++++++------- .../tenant/remote_timeline_client/download.rs | 13 +++++++--- .../virtual_file/owned_buffers_io/write.rs | 25 ++++++++++++------- 3 files changed, 37 insertions(+), 22 deletions(-) diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index f1ce768c397d..9a82c131f220 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -19,6 +19,7 @@ use tracing::error; use std::io; use std::sync::atomic::AtomicU64; +use std::sync::Arc; use utils::id::TimelineId; pub struct EphemeralFile { @@ -51,15 +52,17 @@ impl EphemeralFile { "ephemeral-{filename_disambiguator}" ))); - let file = VirtualFile::open_with_options( - &filename, - virtual_file::OpenOptions::new() - .read(true) - .write(true) - .create(true), - ctx, - ) - .await?; + let file = Arc::new( + VirtualFile::open_with_options( + &filename, + virtual_file::OpenOptions::new() + .read(true) + .write(true) + .create(true), + ctx, + ) + .await?, + ); let page_cache_file_id = page_cache::next_file_id(); // XXX get rid, we're not page-caching anymore diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 05275ef8521a..d291f4302d17 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -6,6 +6,7 @@ use std::collections::HashSet; use std::future::Future; use std::str::FromStr; +use std::sync::Arc; use std::time::SystemTime; use anyhow::{anyhow, Context}; @@ -206,10 +207,14 @@ async fn download_object<'a>( use crate::virtual_file::owned_buffers_io; use bytes::BytesMut; async { - let destination_file = VirtualFile::create(dst_path, ctx) - .await - .with_context(|| format!("create a destination file for layer '{dst_path}'")) - .map_err(DownloadError::Other)?; + let destination_file = Arc::new( + VirtualFile::create(dst_path, ctx) + .await + .with_context(|| { + format!("create a destination file for layer '{dst_path}'") + }) + .map_err(DownloadError::Other)?, + ); let mut download = storage .download(src_path, &DownloadOpts::default(), cancel) diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index 44f40bf846cc..3bd3cac09dd6 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -1,3 +1,5 @@ +use std::sync::Arc; + use bytes::BytesMut; use tokio_epoll_uring::IoBuf; @@ -32,7 +34,7 @@ pub trait OwnedAsyncWriter { /// In such cases, a different implementation that always buffers in memory /// may be preferable. pub struct BufferedWriter { - writer: W, + writer: Arc, /// invariant: always remains Some(buf) except /// - while IO is ongoing => goes back to Some() once the IO completed successfully /// - after an IO error => stays `None` forever @@ -48,7 +50,7 @@ where Buf: IoBuf + Send, W: OwnedAsyncWriter, { - pub fn new(writer: W, buf: B) -> Self { + pub fn new(writer: Arc, buf: B) -> Self { Self { writer, buf: Some(buf), @@ -70,7 +72,10 @@ where } #[cfg_attr(target_os = "macos", allow(dead_code))] - pub async fn flush_and_into_inner(mut self, ctx: &RequestContext) -> std::io::Result<(u64, W)> { + pub async fn flush_and_into_inner( + mut self, + ctx: &RequestContext, + ) -> std::io::Result<(u64, Arc)> { self.flush(ctx).await?; let Self { @@ -290,7 +295,7 @@ mod tests { #[tokio::test] async fn test_buffered_writes_only() -> std::io::Result<()> { - let recorder = RecorderWriter::default(); + let recorder = Arc::new(RecorderWriter::default()); let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2)); write!(writer, b"a"); write!(writer, b"b"); @@ -307,7 +312,7 @@ mod tests { #[tokio::test] async fn test_passthrough_writes_only() -> std::io::Result<()> { - let recorder = RecorderWriter::default(); + let recorder = Arc::new(RecorderWriter::default()); let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2)); write!(writer, b"abc"); write!(writer, b"de"); @@ -323,8 +328,9 @@ mod tests { #[tokio::test] async fn test_passthrough_write_with_nonempty_buffer() -> std::io::Result<()> { - let recorder = RecorderWriter::default(); - let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2)); + let recorder = Arc::new(RecorderWriter::default()); + let mut writer = + BufferedWriter::<_, RecorderWriter>::new(recorder, BytesMut::with_capacity(2)); write!(writer, b"a"); write!(writer, b"bc"); write!(writer, b"d"); @@ -341,8 +347,9 @@ mod tests { async fn test_write_all_borrowed_always_goes_through_buffer() -> std::io::Result<()> { let ctx = test_ctx(); let ctx = &ctx; - let recorder = RecorderWriter::default(); - let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2)); + let recorder = Arc::new(RecorderWriter::default()); + let mut writer = + BufferedWriter::<_, RecorderWriter>::new(recorder, BytesMut::with_capacity(2)); writer.write_buffered_borrowed(b"abc", ctx).await?; writer.write_buffered_borrowed(b"d", ctx).await?; From 26c8b504514a4567341882e4410c91afd74b5b55 Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Sat, 9 Nov 2024 18:41:41 +0000 Subject: [PATCH 04/35] implement non-generic flush handle & bg task Signed-off-by: Yuchen Liang --- .../aligned_buffer/alignment.rs | 4 +- .../owned_buffers_io/aligned_buffer/buffer.rs | 8 +- .../aligned_buffer/buffer_mut.rs | 4 + .../virtual_file/owned_buffers_io/write.rs | 53 ++++++-- .../owned_buffers_io/write/flush.rs | 116 ++++++++++++++++++ 5 files changed, 169 insertions(+), 16 deletions(-) create mode 100644 pageserver/src/virtual_file/owned_buffers_io/write/flush.rs diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/alignment.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/alignment.rs index 933b78a13b70..6b9992643f2a 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/alignment.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/alignment.rs @@ -4,7 +4,7 @@ pub trait Alignment: std::marker::Unpin + 'static { } /// Alignment at compile time. -#[derive(Debug)] +#[derive(Debug, Clone, Copy)] pub struct ConstAlign; impl Alignment for ConstAlign { @@ -14,7 +14,7 @@ impl Alignment for ConstAlign { } /// Alignment at run time. -#[derive(Debug)] +#[derive(Debug, Clone, Copy)] pub struct RuntimeAlign { align: usize, } diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs index 2fba6d699b28..eab36e816309 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs @@ -3,9 +3,10 @@ use std::{ sync::Arc, }; -use super::{alignment::Alignment, raw::RawAlignedBuffer}; +use super::{alignment::Alignment, raw::RawAlignedBuffer, AlignedBufferMut}; /// An shared, immutable aligned buffer type. +#[derive(Clone)] pub struct AlignedBuffer { /// Shared raw buffer. raw: Arc>, @@ -86,6 +87,11 @@ impl AlignedBuffer { range: begin..end, } } + + pub fn into_mut(self) -> Option> { + let raw = Arc::into_inner(self.raw)?; + Some(AlignedBufferMut::from_raw(raw)) + } } impl Deref for AlignedBuffer { diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs index b3675d1aeabb..849bf2bb07f2 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs @@ -46,6 +46,10 @@ impl AlignedBufferMut> { } impl AlignedBufferMut { + pub(super) fn from_raw(raw: RawAlignedBuffer) -> Self { + AlignedBufferMut { raw } + } + /// Returns the total number of bytes the buffer can hold. #[inline] pub fn capacity(&self) -> usize { diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index 3bd3cac09dd6..c3cade596641 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -1,9 +1,10 @@ +mod flush; use std::sync::Arc; -use bytes::BytesMut; -use tokio_epoll_uring::IoBuf; +use bytes::{BufMut, BytesMut}; +use tokio_epoll_uring::{BoundedBufMut, IoBuf}; -use crate::context::RequestContext; +use crate::{context::RequestContext, virtual_file::IoBufferMut}; use super::io_buf_ext::{FullSlice, IoBufExt}; @@ -40,7 +41,7 @@ pub struct BufferedWriter { /// - after an IO error => stays `None` forever /// /// In these exceptional cases, it's `None`. - buf: Option, + mutable: Option, bytes_amount: u64, } @@ -53,7 +54,7 @@ where pub fn new(writer: Arc, buf: B) -> Self { Self { writer, - buf: Some(buf), + mutable: Some(buf), bytes_amount: 0, } } @@ -79,7 +80,7 @@ where self.flush(ctx).await?; let Self { - buf, + mutable: buf, writer, bytes_amount, } = self; @@ -89,7 +90,7 @@ where #[inline(always)] fn buf(&self) -> &B { - self.buf + self.mutable .as_ref() .expect("must not use after we returned an error") } @@ -109,7 +110,7 @@ where self.flush(ctx).await?; // do a big write, bypassing `buf` assert_eq!( - self.buf + self.mutable .as_ref() .expect("must not use after an error") .pending(), @@ -126,7 +127,7 @@ where assert!(chunk.len() < self.buf().cap()); let mut slice = &chunk[..]; while !slice.is_empty() { - let buf = self.buf.as_mut().expect("must not use after an error"); + let buf = self.mutable.as_mut().expect("must not use after an error"); let need = buf.cap() - buf.pending(); let have = slice.len(); let n = std::cmp::min(need, have); @@ -153,7 +154,7 @@ where ) -> std::io::Result { let chunk_len = chunk.len(); while !chunk.is_empty() { - let buf = self.buf.as_mut().expect("must not use after an error"); + let buf = self.mutable.as_mut().expect("must not use after an error"); let need = buf.cap() - buf.pending(); let have = chunk.len(); let n = std::cmp::min(need, have); @@ -168,10 +169,10 @@ where } async fn flush(&mut self, ctx: &RequestContext) -> std::io::Result<()> { - let buf = self.buf.take().expect("must not use after an error"); + let buf = self.mutable.take().expect("must not use after an error"); let buf_len = buf.pending(); if buf_len == 0 { - self.buf = Some(buf); + self.mutable = Some(buf); return Ok(()); } let slice = buf.flush(); @@ -180,7 +181,7 @@ where .write_all_at(slice, self.bytes_amount, ctx) .await?; self.bytes_amount += u64::try_from(buf_len).unwrap(); - self.buf = Some(Buffer::reuse_after_flush( + self.mutable = Some(Buffer::reuse_after_flush( slice.into_raw_slice().into_inner(), )); Ok(()) @@ -238,6 +239,32 @@ impl Buffer for BytesMut { } } +impl Buffer for IoBufferMut { + type IoBuf = IoBufferMut; + + fn cap(&self) -> usize { + self.capacity() + } + + fn extend_from_slice(&mut self, other: &[u8]) { + self.reserve(other.len()); + BoundedBufMut::put_slice(self, other); + } + + fn pending(&self) -> usize { + self.len() + } + + fn flush(self) -> FullSlice { + self.slice_len() + } + + fn reuse_after_flush(mut iobuf: Self::IoBuf) -> Self { + iobuf.clear(); + iobuf + } +} + #[cfg(test)] mod tests { use std::sync::Mutex; diff --git a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs new file mode 100644 index 000000000000..86cb2099e6c6 --- /dev/null +++ b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs @@ -0,0 +1,116 @@ +use std::sync::Arc; + +use tokio::sync::mpsc; + +use crate::{ + context::RequestContext, + virtual_file::{IoBuffer, IoBufferMut, VirtualFile}, +}; + +use super::{IoBufExt, OwnedAsyncWriter}; + +pub struct Duplex { + pub tx: mpsc::Sender, + pub rx: mpsc::Receiver, +} + +pub fn duplex_channel(buffer: usize) -> (Duplex, Duplex) { + let (tx_a, rx_a) = mpsc::channel::(buffer); + let (tx_b, rx_b) = mpsc::channel::(buffer); + + (Duplex { tx: tx_a, rx: rx_b }, Duplex { tx: tx_b, rx: rx_a }) +} + +impl Duplex { + pub async fn send(&self, x: S) -> Result<(), mpsc::error::SendError> { + self.tx.send(x).await + } + + pub async fn recv(&mut self) -> Option { + self.rx.recv().await + } +} + +/// A handle to the flush task. +pub struct FlushHandle { + /// A bi-directional channel that sends (buffer, offset) for writes, + /// and receives recyled buffer. + channel: Duplex<(IoBuffer, u64), IoBuffer>, + /// + maybe_flushed: Option, + join_handle: tokio::task::JoinHandle>, +} + +pub struct FlushBackgroundTask { + channel: Duplex, + file: Arc, + ctx: RequestContext, +} + +impl FlushBackgroundTask { + fn new( + channel: Duplex, + file: Arc, + ctx: RequestContext, + ) -> Self { + FlushBackgroundTask { channel, file, ctx } + } + + async fn run(mut self, buf: IoBufferMut) -> anyhow::Result<()> { + self.channel.send(buf.freeze()).await?; + + while let Some((mut buf, offset)) = self.channel.recv().await { + let slice = OwnedAsyncWriter::write_all_at( + self.file.as_ref(), + buf.slice_len(), + offset, + &self.ctx, + ) + .await?; + buf = slice.into_raw_slice().into_inner(); + + if self.channel.send(buf).await.is_err() { + // Channel closed, exit task. + break; + } + } + + Ok(()) + } +} + +impl FlushHandle { + pub fn spawn_new(file: Arc, ctx: RequestContext) -> Self { + let (front, back) = duplex_channel(1); + + let bg = FlushBackgroundTask::new(back, file, ctx); + let buf = IoBufferMut::with_capacity(4096); + let join_handle = tokio::spawn(async move { bg.run(buf).await }); + + FlushHandle { + channel: front, + maybe_flushed: None, + join_handle, + } + } + /// Submits a buffer to be flushed in the background task. + /// Returns a buffer that completed flushing for re-use, length reset to 0, capacity unchanged. + async fn flush(&mut self, buf: IoBufferMut, offset: u64) -> IoBufferMut { + // Send + let freezed = buf.freeze(); + self.maybe_flushed.replace(freezed.clone()); + self.channel.send((freezed, offset)).await.unwrap(); + + // Wait for an available buffer from the background flush task. + let recycled = self.channel.recv().await.unwrap(); + + // The only other place that could hold a reference to the recycled buffer + // is in `Self::maybe_flushed`, but we have already replace it with the new buffer. + let mut recycled = recycled + .into_mut() + .expect("buffer should only have one strong reference"); + + recycled.clear(); + recycled + } +} From 45998046f3d606be662f8161f6286b59a51cdeea Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Sun, 10 Nov 2024 19:07:45 +0000 Subject: [PATCH 05/35] make flush handle & task generic Signed-off-by: Yuchen Liang --- .../owned_buffers_io/io_buf_ext.rs | 5 + .../virtual_file/owned_buffers_io/write.rs | 26 ++- .../owned_buffers_io/write/flush.rs | 158 ++++++++++++------ 3 files changed, 127 insertions(+), 62 deletions(-) diff --git a/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs b/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs index c3940cf6cea2..ede04d49570e 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs @@ -27,6 +27,11 @@ where let FullSlice { slice: s } = self; s } + + pub(crate) fn as_raw_slice(&self) -> &Slice { + let FullSlice { slice: s } = &self; + s + } } impl Deref for FullSlice diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index c3cade596641..f383ad572362 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -1,22 +1,25 @@ mod flush; use std::sync::Arc; -use bytes::{BufMut, BytesMut}; +use bytes::BytesMut; use tokio_epoll_uring::{BoundedBufMut, IoBuf}; -use crate::{context::RequestContext, virtual_file::IoBufferMut}; +use crate::{ + context::RequestContext, + virtual_file::{IoBuffer, IoBufferMut}, +}; use super::io_buf_ext::{FullSlice, IoBufExt}; /// A trait for doing owned-buffer write IO. /// Think [`tokio::io::AsyncWrite`] but with owned buffers. pub trait OwnedAsyncWriter { - async fn write_all_at( + fn write_all_at( &self, buf: FullSlice, offset: u64, ctx: &RequestContext, - ) -> std::io::Result>; + ) -> impl std::future::Future>> + Send; } /// A wrapper aorund an [`OwnedAsyncWriter`] that uses a [`Buffer`] to batch @@ -212,6 +215,8 @@ pub trait Buffer { fn reuse_after_flush(iobuf: Self::IoBuf) -> Self; } +pub trait BufferMut: Buffer {} + impl Buffer for BytesMut { type IoBuf = BytesMut; @@ -240,7 +245,7 @@ impl Buffer for BytesMut { } impl Buffer for IoBufferMut { - type IoBuf = IoBufferMut; + type IoBuf = IoBuffer; fn cap(&self) -> usize { self.capacity() @@ -256,12 +261,15 @@ impl Buffer for IoBufferMut { } fn flush(self) -> FullSlice { - self.slice_len() + self.freeze().slice_len() } - fn reuse_after_flush(mut iobuf: Self::IoBuf) -> Self { - iobuf.clear(); - iobuf + fn reuse_after_flush(iobuf: Self::IoBuf) -> Self { + let mut recycled = iobuf + .into_mut() + .expect("buffer should only have one strong reference"); + recycled.clear(); + recycled } } diff --git a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs index 86cb2099e6c6..97b9f280092d 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs @@ -1,19 +1,23 @@ use std::sync::Arc; use tokio::sync::mpsc; +use tokio_epoll_uring::IoBuf; -use crate::{ - context::RequestContext, - virtual_file::{IoBuffer, IoBufferMut, VirtualFile}, -}; +use crate::{context::RequestContext, virtual_file::owned_buffers_io::io_buf_ext::FullSlice}; -use super::{IoBufExt, OwnedAsyncWriter}; +use super::{Buffer, OwnedAsyncWriter}; +/// A bi-directional channel. pub struct Duplex { pub tx: mpsc::Sender, pub rx: mpsc::Receiver, } +/// Creates a bi-directional channel. +/// +/// The channel will buffer up to the provided number of messages. Once the buffer is full, +/// attempts to send new messages will wait until a message is received from the channel. +/// The provided buffer capacity must be at least 1. pub fn duplex_channel(buffer: usize) -> (Duplex, Duplex) { let (tx_a, rx_a) = mpsc::channel::(buffer); let (tx_b, rx_b) = mpsc::channel::(buffer); @@ -22,95 +26,143 @@ pub fn duplex_channel(buffer: usize) -> (Duplex, Duplex< } impl Duplex { + /// Sends a value, waiting until there is capacity. + /// + /// A successful send occurs when it is determined that the other end of the channel has not hung up already. pub async fn send(&self, x: S) -> Result<(), mpsc::error::SendError> { self.tx.send(x).await } + /// Receives the next value for this receiver. + /// + /// This method returns `None` if the channel has been closed and there are + /// no remaining messages in the channel's buffer. pub async fn recv(&mut self) -> Option { self.rx.recv().await } } -/// A handle to the flush task. -pub struct FlushHandle { +pub struct FlushHandleInner { /// A bi-directional channel that sends (buffer, offset) for writes, /// and receives recyled buffer. - channel: Duplex<(IoBuffer, u64), IoBuffer>, - /// - maybe_flushed: Option, - join_handle: tokio::task::JoinHandle>, + channel: Duplex<(FullSlice, u64), FullSlice>, + /// Join handle for the background flush task. + join_handle: tokio::task::JoinHandle>>, } -pub struct FlushBackgroundTask { - channel: Duplex, - file: Arc, +/// A handle to the flush task. +pub struct FlushHandle { + inner: Option>, + /// Buffer for serving tail reads. + maybe_flushed: Option, +} + +pub struct FlushBackgroundTask { + /// A bi-directional channel that receives (buffer, offset) for writes, + /// and send back recycled buffer. + channel: Duplex, (FullSlice, u64)>, + writer: Arc, ctx: RequestContext, } -impl FlushBackgroundTask { +impl FlushBackgroundTask +where + Buf: IoBuf + Send + Sync, + W: OwnedAsyncWriter + Sync + 'static, +{ fn new( - channel: Duplex, - file: Arc, + channel: Duplex, (FullSlice, u64)>, + file: Arc, ctx: RequestContext, ) -> Self { - FlushBackgroundTask { channel, file, ctx } + FlushBackgroundTask { + channel, + writer: file, + ctx, + } } - async fn run(mut self, buf: IoBufferMut) -> anyhow::Result<()> { - self.channel.send(buf.freeze()).await?; - - while let Some((mut buf, offset)) = self.channel.recv().await { - let slice = OwnedAsyncWriter::write_all_at( - self.file.as_ref(), - buf.slice_len(), - offset, - &self.ctx, - ) - .await?; - buf = slice.into_raw_slice().into_inner(); - - if self.channel.send(buf).await.is_err() { - // Channel closed, exit task. - break; + /// Runs the background flush task. + async fn run(mut self, buf: FullSlice) -> std::io::Result> { + self.channel.send(buf).await.map_err(|_| { + std::io::Error::new(std::io::ErrorKind::BrokenPipe, "flush handle closed early") + })?; + + // Exit condition: channel is closed and there is no remaining buffer to be flushed + while let Some((slice, offset)) = self.channel.recv().await { + let slice = self.writer.write_all_at(slice, offset, &self.ctx).await?; + + if self.channel.send(slice).await.is_err() { + // Although channel is closed. Still need to finish flushing the remaining buffers. + continue; } } - Ok(()) + Ok(self.writer) } } -impl FlushHandle { - pub fn spawn_new(file: Arc, ctx: RequestContext) -> Self { - let (front, back) = duplex_channel(1); +impl FlushHandle +where + Buf: IoBuf + Send + Sync + Clone, + W: OwnedAsyncWriter + Send + Sync + 'static + std::fmt::Debug, +{ + /// Spawns a new background flush task and obtains a handle. + pub fn spawn_new(file: Arc, buf: B, ctx: RequestContext) -> Self + where + B: Buffer + Send + 'static, + { + let (front, back) = duplex_channel(2); let bg = FlushBackgroundTask::new(back, file, ctx); - let buf = IoBufferMut::with_capacity(4096); - let join_handle = tokio::spawn(async move { bg.run(buf).await }); + let join_handle = tokio::spawn(async move { bg.run(buf.flush()).await }); FlushHandle { - channel: front, + inner: Some(FlushHandleInner { + channel: front, + join_handle, + }), maybe_flushed: None, - join_handle, } } + /// Submits a buffer to be flushed in the background task. /// Returns a buffer that completed flushing for re-use, length reset to 0, capacity unchanged. - async fn flush(&mut self, buf: IoBufferMut, offset: u64) -> IoBufferMut { - // Send - let freezed = buf.freeze(); - self.maybe_flushed.replace(freezed.clone()); - self.channel.send((freezed, offset)).await.unwrap(); + async fn flush(&mut self, buf: B, offset: u64) -> std::io::Result + where + B: Buffer + Send + 'static, + { + let freezed = buf.flush(); + + self.maybe_flushed + .replace(freezed.as_raw_slice().get_ref().clone()); + + let submit = self.inner_mut().channel.send((freezed, offset)).await; + + if submit.is_err() { + return self.handle_error().await; + } // Wait for an available buffer from the background flush task. - let recycled = self.channel.recv().await.unwrap(); + let Some(recycled) = self.inner_mut().channel.recv().await else { + return self.handle_error().await; + }; // The only other place that could hold a reference to the recycled buffer // is in `Self::maybe_flushed`, but we have already replace it with the new buffer. - let mut recycled = recycled - .into_mut() - .expect("buffer should only have one strong reference"); + Ok(Buffer::reuse_after_flush( + recycled.into_raw_slice().into_inner(), + )) + } + + fn inner_mut(&mut self) -> &mut FlushHandleInner { + self.inner.as_mut().unwrap() + } - recycled.clear(); - recycled + async fn handle_error(&mut self) -> std::io::Result { + let handle = self.inner.take().unwrap(); + drop(handle.channel.tx); + let e = handle.join_handle.await.unwrap().unwrap_err(); + return Err(e); } } From bdffc352e7583c76126648dd1ff1c3e5e32fefa5 Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Sun, 10 Nov 2024 20:01:52 +0000 Subject: [PATCH 06/35] use background flush for write path; read path broken Signed-off-by: Yuchen Liang --- pageserver/src/tenant/ephemeral_file.rs | 3 +- .../tenant/remote_timeline_client/download.rs | 3 +- .../virtual_file/owned_buffers_io/write.rs | 48 +++++++++---------- .../owned_buffers_io/write/flush.rs | 21 +++++--- 4 files changed, 43 insertions(+), 32 deletions(-) diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 9a82c131f220..adebf6758653 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -73,7 +73,8 @@ impl EphemeralFile { bytes_written: 0, buffered_writer: owned_buffers_io::write::BufferedWriter::new( file, - BytesMut::with_capacity(TAIL_SZ), + || BytesMut::with_capacity(TAIL_SZ), + ctx, ), _gate_guard: gate_guard, }) diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index d291f4302d17..c39a95d45278 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -227,7 +227,8 @@ async fn download_object<'a>( let (bytes_amount, destination_file) = async { let mut buffered = owned_buffers_io::write::BufferedWriter::::new( destination_file, - BytesMut::with_capacity(super::BUFFER_SIZE), + || BytesMut::with_capacity(super::BUFFER_SIZE), + ctx, ); while let Some(res) = futures::StreamExt::next(&mut download.download_stream).await diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index f383ad572362..6e478d840bf2 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -2,6 +2,7 @@ mod flush; use std::sync::Arc; use bytes::BytesMut; +use flush::FlushHandle; use tokio_epoll_uring::{BoundedBufMut, IoBuf}; use crate::{ @@ -37,7 +38,7 @@ pub trait OwnedAsyncWriter { /// /// In such cases, a different implementation that always buffers in memory /// may be preferable. -pub struct BufferedWriter { +pub struct BufferedWriter { writer: Arc, /// invariant: always remains Some(buf) except /// - while IO is ongoing => goes back to Some() once the IO completed successfully @@ -45,19 +46,21 @@ pub struct BufferedWriter { /// /// In these exceptional cases, it's `None`. mutable: Option, + flush_handle: FlushHandle, bytes_amount: u64, } impl BufferedWriter where - B: Buffer + Send, - Buf: IoBuf + Send, - W: OwnedAsyncWriter, + B: Buffer + Send + 'static, + Buf: IoBuf + Send + Sync + Clone, + W: OwnedAsyncWriter + Send + Sync + 'static + std::fmt::Debug, { - pub fn new(writer: Arc, buf: B) -> Self { + pub fn new(writer: Arc, buf_new: impl Fn() -> B, ctx: &RequestContext) -> Self { Self { - writer, - mutable: Some(buf), + writer: writer.clone(), + mutable: Some(buf_new()), + flush_handle: FlushHandle::spawn_new(writer, buf_new(), ctx.attached_child()), bytes_amount: 0, } } @@ -85,8 +88,10 @@ where let Self { mutable: buf, writer, + mut flush_handle, bytes_amount, } = self; + flush_handle.shutdown().await?; assert!(buf.is_some()); Ok((bytes_amount, writer)) } @@ -110,6 +115,7 @@ where let chunk_len = chunk.len(); // avoid memcpy for the middle of the chunk if chunk.len() >= self.buf().cap() { + // TODO(yuchen): do we still want to keep this? self.flush(ctx).await?; // do a big write, bypassing `buf` assert_eq!( @@ -171,22 +177,16 @@ where Ok(chunk_len) } - async fn flush(&mut self, ctx: &RequestContext) -> std::io::Result<()> { + async fn flush(&mut self, _ctx: &RequestContext) -> std::io::Result<()> { let buf = self.mutable.take().expect("must not use after an error"); let buf_len = buf.pending(); if buf_len == 0 { self.mutable = Some(buf); return Ok(()); } - let slice = buf.flush(); - let slice = self - .writer - .write_all_at(slice, self.bytes_amount, ctx) - .await?; + let recycled = self.flush_handle.flush(buf, self.bytes_amount).await?; self.bytes_amount += u64::try_from(buf_len).unwrap(); - self.mutable = Some(Buffer::reuse_after_flush( - slice.into_raw_slice().into_inner(), - )); + self.mutable = Some(recycled); Ok(()) } } @@ -215,8 +215,6 @@ pub trait Buffer { fn reuse_after_flush(iobuf: Self::IoBuf) -> Self; } -pub trait BufferMut: Buffer {} - impl Buffer for BytesMut { type IoBuf = BytesMut; @@ -283,7 +281,7 @@ mod tests { use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::TaskKind; - #[derive(Default)] + #[derive(Default, Debug)] struct RecorderWriter { /// record bytes and write offsets. writes: Mutex, u64)>>, @@ -331,7 +329,8 @@ mod tests { #[tokio::test] async fn test_buffered_writes_only() -> std::io::Result<()> { let recorder = Arc::new(RecorderWriter::default()); - let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2)); + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + let mut writer = BufferedWriter::new(recorder, || BytesMut::with_capacity(2), &ctx); write!(writer, b"a"); write!(writer, b"b"); write!(writer, b"c"); @@ -348,7 +347,8 @@ mod tests { #[tokio::test] async fn test_passthrough_writes_only() -> std::io::Result<()> { let recorder = Arc::new(RecorderWriter::default()); - let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2)); + let ctx = test_ctx(); + let mut writer = BufferedWriter::new(recorder, || BytesMut::with_capacity(2), &ctx); write!(writer, b"abc"); write!(writer, b"de"); write!(writer, b""); @@ -364,8 +364,8 @@ mod tests { #[tokio::test] async fn test_passthrough_write_with_nonempty_buffer() -> std::io::Result<()> { let recorder = Arc::new(RecorderWriter::default()); - let mut writer = - BufferedWriter::<_, RecorderWriter>::new(recorder, BytesMut::with_capacity(2)); + let ctx = test_ctx(); + let mut writer = BufferedWriter::new(recorder, || BytesMut::with_capacity(2), &ctx); write!(writer, b"a"); write!(writer, b"bc"); write!(writer, b"d"); @@ -384,7 +384,7 @@ mod tests { let ctx = &ctx; let recorder = Arc::new(RecorderWriter::default()); let mut writer = - BufferedWriter::<_, RecorderWriter>::new(recorder, BytesMut::with_capacity(2)); + BufferedWriter::<_, RecorderWriter>::new(recorder, || BytesMut::with_capacity(2), ctx); writer.write_buffered_borrowed(b"abc", ctx).await?; writer.write_buffered_borrowed(b"d", ctx).await?; diff --git a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs index 97b9f280092d..35057ba4bfca 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs @@ -128,7 +128,7 @@ where /// Submits a buffer to be flushed in the background task. /// Returns a buffer that completed flushing for re-use, length reset to 0, capacity unchanged. - async fn flush(&mut self, buf: B, offset: u64) -> std::io::Result + pub async fn flush(&mut self, buf: B, offset: u64) -> std::io::Result where B: Buffer + Send + 'static, { @@ -155,14 +155,23 @@ where )) } + /// Cleans up the channel, join the flush task. + pub async fn shutdown(&mut self) -> std::io::Result> { + let handle = self + .inner + .take() + .expect("must not use after we returned an error"); + drop(handle.channel.tx); + handle.join_handle.await.unwrap() + } + fn inner_mut(&mut self) -> &mut FlushHandleInner { - self.inner.as_mut().unwrap() + self.inner + .as_mut() + .expect("must not use after we returned an error") } async fn handle_error(&mut self) -> std::io::Result { - let handle = self.inner.take().unwrap(); - drop(handle.channel.tx); - let e = handle.join_handle.await.unwrap().unwrap_err(); - return Err(e); + Err(self.shutdown().await.unwrap_err()) } } From e0848c28d958f708d523442a10be915e648bdc31 Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Mon, 11 Nov 2024 03:22:01 +0000 Subject: [PATCH 07/35] make InMemory read aware of mutable & maybe_flushed Signed-off-by: Yuchen Liang --- pageserver/src/tenant/ephemeral_file.rs | 84 +++++++++++++++---- .../virtual_file/owned_buffers_io/write.rs | 16 ++-- .../owned_buffers_io/write/flush.rs | 6 +- 3 files changed, 82 insertions(+), 24 deletions(-) diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index adebf6758653..7b67570bf4bb 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -17,9 +17,9 @@ use pageserver_api::shard::TenantShardId; use tokio_epoll_uring::{BoundedBuf, Slice}; use tracing::error; -use std::io; use std::sync::atomic::AtomicU64; use std::sync::Arc; +use std::{io, u64}; use utils::id::TimelineId; pub struct EphemeralFile { @@ -170,8 +170,10 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral ) -> std::io::Result<(tokio_epoll_uring::Slice, usize)> { let flushed_offset = self.buffered_writer.bytes_written(); - let buffer = self.buffered_writer.inspect_buffer(); - let buffered = &buffer[0..buffer.pending()]; + let mutable = self.buffered_writer.inspect_mutable(); + let mutable = &mutable[0..mutable.pending()]; + + let maybe_flushed = self.buffered_writer.inspect_maybe_flushed(); let dst_cap = dst.bytes_total().into_u64(); let end = { @@ -196,8 +198,34 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral } } } - let written_range = Range(start, std::cmp::min(end, flushed_offset)); - let buffered_range = Range(std::cmp::max(start, flushed_offset), end); + + // [ written ][ maybe_flushed ][ mutable ] + // |- TAIL_SZ -||- TAIL_SZ -| + // ^ + // `flushed_offset` + // + let (written_range, maybe_flushed_range) = { + if maybe_flushed.is_some() { + ( + Range( + start, + std::cmp::min(end, flushed_offset.saturating_sub(TAIL_SZ as u64)), + ), + Range( + std::cmp::max(start, flushed_offset.saturating_sub(TAIL_SZ as u64)), + std::cmp::min(end, flushed_offset), + ), + ) + } else { + ( + Range(start, std::cmp::min(end, flushed_offset)), + // zero len + Range(flushed_offset, u64::MIN), + ) + } + }; + + let mutable_range = Range(std::cmp::max(start, flushed_offset), end); let dst = if written_range.len() > 0 { let file: &VirtualFile = self.buffered_writer.as_inner(); @@ -210,19 +238,21 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral dst }; - let dst = if buffered_range.len() > 0 { - let offset_in_buffer = buffered_range + let dst = if maybe_flushed_range.len() > 0 { + let offset_in_buffer = maybe_flushed_range .0 - .checked_sub(flushed_offset) + .checked_sub(flushed_offset.saturating_sub(TAIL_SZ as u64)) .unwrap() .into_usize(); - let to_copy = - &buffered[offset_in_buffer..(offset_in_buffer + buffered_range.len().into_usize())]; + // Checked previously the buffer is Some. + let maybe_flushed = maybe_flushed.unwrap(); + let to_copy = &maybe_flushed + [offset_in_buffer..(offset_in_buffer + maybe_flushed_range.len().into_usize())]; let bounds = dst.bounds(); let mut view = dst.slice({ let start = written_range.len().into_usize(); let end = start - .checked_add(buffered_range.len().into_usize()) + .checked_add(maybe_flushed_range.len().into_usize()) .unwrap(); start..end }); @@ -233,6 +263,28 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral dst }; + let dst = if mutable_range.len() > 0 { + let offset_in_buffer = mutable_range + .0 + .checked_sub(flushed_offset) + .unwrap() + .into_usize(); + let to_copy = + &mutable[offset_in_buffer..(offset_in_buffer + mutable_range.len().into_usize())]; + let bounds = dst.bounds(); + let mut view = dst.slice({ + let start = + written_range.len().into_usize() + maybe_flushed_range.len().into_usize(); + let end = start.checked_add(mutable_range.len().into_usize()).unwrap(); + start..end + }); + view.as_mut_rust_slice_full_zeroed() + .copy_from_slice(to_copy); + Slice::from_buf_bounds(Slice::into_inner(view), bounds) + } else { + dst + }; + // TODO: in debug mode, randomize the remaining bytes in `dst` to catch bugs Ok((dst, (end - start).into_usize())) @@ -330,7 +382,7 @@ mod tests { .await .unwrap(); - let cap = file.buffered_writer.inspect_buffer().capacity(); + let cap = file.buffered_writer.inspect_mutable().capacity(); let write_nbytes = cap + cap / 2; @@ -361,7 +413,7 @@ mod tests { let file_contents = std::fs::read(file.buffered_writer.as_inner().path()).unwrap(); assert_eq!(file_contents, &content[0..cap]); - let buffer_contents = file.buffered_writer.inspect_buffer(); + let buffer_contents = file.buffered_writer.inspect_mutable(); assert_eq!(buffer_contents, &content[cap..write_nbytes]); } @@ -376,7 +428,7 @@ mod tests { .await .unwrap(); - let cap = file.buffered_writer.inspect_buffer().capacity(); + let cap = file.buffered_writer.inspect_mutable().capacity(); let content: Vec = rand::thread_rng() .sample_iter(rand::distributions::Standard) @@ -397,7 +449,7 @@ mod tests { "buffered writer does one write if we write 1.5x buffer capacity" ); assert_eq!( - &file.buffered_writer.inspect_buffer()[0..cap / 2], + &file.buffered_writer.inspect_mutable()[0..cap / 2], &content[cap..cap + cap / 2] ); } @@ -419,7 +471,7 @@ mod tests { .await .unwrap(); - let cap = file.buffered_writer.inspect_buffer().capacity(); + let cap = file.buffered_writer.inspect_mutable().capacity(); let content: Vec = rand::thread_rng() .sample_iter(rand::distributions::Standard) diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index 6e478d840bf2..cf0157bd758d 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -74,8 +74,13 @@ where } /// Panics if used after any of the write paths returned an error - pub fn inspect_buffer(&self) -> &B { - self.buf() + pub fn inspect_mutable(&self) -> &B { + self.mutable() + } + + /// Gets a reference to the maybe flushed read-only buffer. + pub fn inspect_maybe_flushed(&self) -> Option<&Buf> { + self.flush_handle.maybe_flushed.as_ref() } #[cfg_attr(target_os = "macos", allow(dead_code))] @@ -96,8 +101,9 @@ where Ok((bytes_amount, writer)) } + /// Gets a reference to the mutable in-memory buffer. #[inline(always)] - fn buf(&self) -> &B { + fn mutable(&self) -> &B { self.mutable .as_ref() .expect("must not use after we returned an error") @@ -114,7 +120,7 @@ where let chunk_len = chunk.len(); // avoid memcpy for the middle of the chunk - if chunk.len() >= self.buf().cap() { + if chunk.len() >= self.mutable().cap() { // TODO(yuchen): do we still want to keep this? self.flush(ctx).await?; // do a big write, bypassing `buf` @@ -133,7 +139,7 @@ where return Ok((chunk_len, chunk)); } // in-memory copy the < BUFFER_SIZED tail of the chunk - assert!(chunk.len() < self.buf().cap()); + assert!(chunk.len() < self.mutable().cap()); let mut slice = &chunk[..]; while !slice.is_empty() { let buf = self.mutable.as_mut().expect("must not use after an error"); diff --git a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs index 35057ba4bfca..9b5f36a5d334 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs @@ -54,7 +54,7 @@ pub struct FlushHandleInner { pub struct FlushHandle { inner: Option>, /// Buffer for serving tail reads. - maybe_flushed: Option, + pub(super) maybe_flushed: Option, } pub struct FlushBackgroundTask { @@ -83,8 +83,8 @@ where } /// Runs the background flush task. - async fn run(mut self, buf: FullSlice) -> std::io::Result> { - self.channel.send(buf).await.map_err(|_| { + async fn run(mut self, slice: FullSlice) -> std::io::Result> { + self.channel.send(slice).await.map_err(|_| { std::io::Error::new(std::io::ErrorKind::BrokenPipe, "flush handle closed early") })?; From e5bb85d407cd9906878ee7b08ce0bb1ce4cd8efd Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Mon, 11 Nov 2024 21:33:33 +0000 Subject: [PATCH 08/35] fix clippy Signed-off-by: Yuchen Liang --- pageserver/src/tenant/ephemeral_file.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 7b67570bf4bb..6e36b9756052 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -17,9 +17,9 @@ use pageserver_api::shard::TenantShardId; use tokio_epoll_uring::{BoundedBuf, Slice}; use tracing::error; +use std::io; use std::sync::atomic::AtomicU64; use std::sync::Arc; -use std::{io, u64}; use utils::id::TimelineId; pub struct EphemeralFile { From 7b34e73c15fa50243b48cd480e8ae5d956754041 Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Mon, 11 Nov 2024 22:35:40 +0000 Subject: [PATCH 09/35] fix tests Signed-off-by: Yuchen Liang --- pageserver/src/tenant/ephemeral_file.rs | 41 +++++++++++++------ .../virtual_file/owned_buffers_io/write.rs | 2 +- .../owned_buffers_io/write/flush.rs | 1 + 3 files changed, 30 insertions(+), 14 deletions(-) diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 6e36b9756052..7f4bf6b9aa6d 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -384,7 +384,7 @@ mod tests { let cap = file.buffered_writer.inspect_mutable().capacity(); - let write_nbytes = cap + cap / 2; + let write_nbytes = cap * 2 + cap / 2; let content: Vec = rand::thread_rng() .sample_iter(rand::distributions::Standard) @@ -411,10 +411,13 @@ mod tests { } let file_contents = std::fs::read(file.buffered_writer.as_inner().path()).unwrap(); - assert_eq!(file_contents, &content[0..cap]); + assert!(file_contents == &content[0..cap] || file_contents == &content[0..cap * 2]); - let buffer_contents = file.buffered_writer.inspect_mutable(); - assert_eq!(buffer_contents, &content[cap..write_nbytes]); + let maybe_flushed_buffer_contents = file.buffered_writer.inspect_maybe_flushed().unwrap(); + assert_eq!(maybe_flushed_buffer_contents, &content[cap..cap * 2]); + + let mutable_buffer_contents = file.buffered_writer.inspect_mutable(); + assert_eq!(mutable_buffer_contents, &content[cap * 2..write_nbytes]); } #[tokio::test] @@ -428,11 +431,12 @@ mod tests { .await .unwrap(); + // mutable buffer and maybe_flushed buffer each has cap. let cap = file.buffered_writer.inspect_mutable().capacity(); let content: Vec = rand::thread_rng() .sample_iter(rand::distributions::Standard) - .take(cap + cap / 2) + .take(cap * 2 + cap / 2) .collect(); file.write_raw(&content, &ctx).await.unwrap(); @@ -440,17 +444,20 @@ mod tests { // assert the state is as this test expects it to be assert_eq!( &file.load_to_io_buf(&ctx).await.unwrap(), - &content[0..cap + cap / 2] + &content[0..cap * 2 + cap / 2] ); let md = file.buffered_writer.as_inner().path().metadata().unwrap(); + assert!( + md.len() == cap.into_u64() || md.len() == 2 * cap.into_u64(), + "buffered writer requires one write to be flushed if we write 2.5x buffer capacity" + ); assert_eq!( - md.len(), - cap.into_u64(), - "buffered writer does one write if we write 1.5x buffer capacity" + &file.buffered_writer.inspect_maybe_flushed().unwrap()[0..cap], + &content[cap..cap * 2] ); assert_eq!( &file.buffered_writer.inspect_mutable()[0..cap / 2], - &content[cap..cap + cap / 2] + &content[cap * 2..cap * 2 + cap / 2] ); } @@ -475,7 +482,7 @@ mod tests { let content: Vec = rand::thread_rng() .sample_iter(rand::distributions::Standard) - .take(cap + cap / 2) + .take(cap * 2 + cap / 2) .collect(); file.write_raw(&content, &ctx).await.unwrap(); @@ -505,9 +512,17 @@ mod tests { test_read(cap - 10, 10).await; // read across file and buffer test_read(cap - 10, 20).await; - // stay from start of buffer + // stay from start of maybe flushed buffer test_read(cap, 10).await; - // completely within buffer + // completely within maybe flushed buffer test_read(cap + 10, 10).await; + // border onto edge of maybe flushed buffer. + test_read(cap * 2 - 10, 10).await; + // read across maybe flushed and mutable buffer + test_read(cap * 2 - 10, 20).await; + // read across three segments + test_read(cap - 10, cap + 20).await; + // completely within mutable buffer + test_read(cap * 2 + 10, 10).await; } } diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index cf0157bd758d..f0399681f1cb 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -121,7 +121,7 @@ where let chunk_len = chunk.len(); // avoid memcpy for the middle of the chunk if chunk.len() >= self.mutable().cap() { - // TODO(yuchen): do we still want to keep this? + // TODO(yuchen): do we still want to keep the bypass path? self.flush(ctx).await?; // do a big write, bypassing `buf` assert_eq!( diff --git a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs index 9b5f36a5d334..a9c1404712fc 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs @@ -42,6 +42,7 @@ impl Duplex { } } +// TODO(yuchen): special actions in drop to clean up the join handle? pub struct FlushHandleInner { /// A bi-directional channel that sends (buffer, offset) for writes, /// and receives recyled buffer. From b0d7fc75641ffc4571a055818864cb26dd64f869 Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Mon, 11 Nov 2024 23:49:52 +0000 Subject: [PATCH 10/35] fix IoBufferMut::extend_from_slice Signed-off-by: Yuchen Liang --- pageserver/src/tenant/ephemeral_file.rs | 5 ++- .../owned_buffers_io/aligned_buffer/buffer.rs | 2 +- .../aligned_buffer/buffer_mut.rs | 33 ++++++++++++++++++- .../virtual_file/owned_buffers_io/write.rs | 30 +++++++++-------- 4 files changed, 52 insertions(+), 18 deletions(-) diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 7f4bf6b9aa6d..9979d1980fce 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -10,7 +10,6 @@ use crate::virtual_file::owned_buffers_io::io_buf_aligned::IoBufAlignedMut; use crate::virtual_file::owned_buffers_io::slice::SliceMutExt; use crate::virtual_file::owned_buffers_io::write::Buffer; use crate::virtual_file::{self, owned_buffers_io, IoBufferMut, VirtualFile}; -use bytes::BytesMut; use camino::Utf8PathBuf; use num_traits::Num; use pageserver_api::shard::TenantShardId; @@ -27,7 +26,7 @@ pub struct EphemeralFile { _timeline_id: TimelineId, page_cache_file_id: page_cache::FileId, bytes_written: u64, - buffered_writer: owned_buffers_io::write::BufferedWriter, + buffered_writer: owned_buffers_io::write::BufferedWriter, /// Gate guard is held on as long as we need to do operations in the path (delete on drop) _gate_guard: utils::sync::gate::GateGuard, } @@ -73,7 +72,7 @@ impl EphemeralFile { bytes_written: 0, buffered_writer: owned_buffers_io::write::BufferedWriter::new( file, - || BytesMut::with_capacity(TAIL_SZ), + || IoBufferMut::with_capacity(TAIL_SZ), ctx, ), _gate_guard: gate_guard, diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs index eab36e816309..4474303b4ff5 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs @@ -6,7 +6,7 @@ use std::{ use super::{alignment::Alignment, raw::RawAlignedBuffer, AlignedBufferMut}; /// An shared, immutable aligned buffer type. -#[derive(Clone)] +#[derive(Clone, Debug)] pub struct AlignedBuffer { /// Shared raw buffer. raw: Arc>, diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs index 849bf2bb07f2..08eafcb07726 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs @@ -1,4 +1,7 @@ -use std::ops::{Deref, DerefMut}; +use std::{ + mem::MaybeUninit, + ops::{Deref, DerefMut}, +}; use super::{ alignment::{Alignment, ConstAlign}, @@ -132,6 +135,34 @@ impl AlignedBufferMut { let len = self.len(); AlignedBuffer::from_raw(self.raw, 0..len) } + + #[inline] + pub fn extend_from_slice(&mut self, extend: &[u8]) { + let cnt = extend.len(); + self.reserve(cnt); + + unsafe { + let dst = self.spare_capacity_mut(); + // Reserved above + debug_assert!(dst.len() >= cnt); + + core::ptr::copy_nonoverlapping(extend.as_ptr(), dst.as_mut_ptr().cast(), cnt); + } + + unsafe { + bytes::BufMut::advance_mut(self, cnt); + } + } + + #[inline] + fn spare_capacity_mut(&mut self) -> &mut [MaybeUninit] { + unsafe { + let ptr = self.as_mut_ptr().add(self.len()); + let len = self.capacity() - self.len(); + + core::slice::from_raw_parts_mut(ptr.cast(), len) + } + } } impl Deref for AlignedBufferMut { diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index f0399681f1cb..429138bc765d 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -3,7 +3,7 @@ use std::sync::Arc; use bytes::BytesMut; use flush::FlushHandle; -use tokio_epoll_uring::{BoundedBufMut, IoBuf}; +use tokio_epoll_uring::IoBuf; use crate::{ context::RequestContext, @@ -131,10 +131,13 @@ where .pending(), 0 ); - let chunk = self - .writer - .write_all_at(FullSlice::must_new(chunk), self.bytes_amount, ctx) - .await?; + let chunk = OwnedAsyncWriter::write_all_at( + self.writer.as_ref(), + FullSlice::must_new(chunk), + self.bytes_amount, + ctx, + ) + .await?; self.bytes_amount += u64::try_from(chunk_len).unwrap(); return Ok((chunk_len, chunk)); } @@ -257,7 +260,7 @@ impl Buffer for IoBufferMut { fn extend_from_slice(&mut self, other: &[u8]) { self.reserve(other.len()); - BoundedBufMut::put_slice(self, other); + IoBufferMut::extend_from_slice(self, other); } fn pending(&self) -> usize { @@ -281,8 +284,6 @@ impl Buffer for IoBufferMut { mod tests { use std::sync::Mutex; - use bytes::BytesMut; - use super::*; use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::TaskKind; @@ -336,7 +337,7 @@ mod tests { async fn test_buffered_writes_only() -> std::io::Result<()> { let recorder = Arc::new(RecorderWriter::default()); let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); - let mut writer = BufferedWriter::new(recorder, || BytesMut::with_capacity(2), &ctx); + let mut writer = BufferedWriter::new(recorder, || IoBufferMut::with_capacity(2), &ctx); write!(writer, b"a"); write!(writer, b"b"); write!(writer, b"c"); @@ -354,7 +355,7 @@ mod tests { async fn test_passthrough_writes_only() -> std::io::Result<()> { let recorder = Arc::new(RecorderWriter::default()); let ctx = test_ctx(); - let mut writer = BufferedWriter::new(recorder, || BytesMut::with_capacity(2), &ctx); + let mut writer = BufferedWriter::new(recorder, || IoBufferMut::with_capacity(2), &ctx); write!(writer, b"abc"); write!(writer, b"de"); write!(writer, b""); @@ -371,7 +372,7 @@ mod tests { async fn test_passthrough_write_with_nonempty_buffer() -> std::io::Result<()> { let recorder = Arc::new(RecorderWriter::default()); let ctx = test_ctx(); - let mut writer = BufferedWriter::new(recorder, || BytesMut::with_capacity(2), &ctx); + let mut writer = BufferedWriter::new(recorder, || IoBufferMut::with_capacity(2), &ctx); write!(writer, b"a"); write!(writer, b"bc"); write!(writer, b"d"); @@ -389,8 +390,11 @@ mod tests { let ctx = test_ctx(); let ctx = &ctx; let recorder = Arc::new(RecorderWriter::default()); - let mut writer = - BufferedWriter::<_, RecorderWriter>::new(recorder, || BytesMut::with_capacity(2), ctx); + let mut writer = BufferedWriter::<_, RecorderWriter>::new( + recorder, + || IoBufferMut::with_capacity(2), + ctx, + ); writer.write_buffered_borrowed(b"abc", ctx).await?; writer.write_buffered_borrowed(b"d", ctx).await?; From ce7cd361003b3769131e67191c482f7bb74edf49 Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Tue, 12 Nov 2024 01:17:52 +0000 Subject: [PATCH 11/35] add IoBufAligned marker Signed-off-by: Yuchen Liang --- .../tenant/remote_timeline_client/download.rs | 20 +++++++++---------- pageserver/src/virtual_file.rs | 12 +++++------ .../owned_buffers_io/aligned_buffer/buffer.rs | 10 +++++++++- .../owned_buffers_io/io_buf_aligned.rs | 8 ++++++-- .../virtual_file/owned_buffers_io/write.rs | 20 +++++++++++-------- .../owned_buffers_io/write/flush.rs | 10 ++++++---- 6 files changed, 49 insertions(+), 31 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index c39a95d45278..e9230c58b627 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -27,9 +27,7 @@ use crate::span::{ use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path}; use crate::tenant::storage_layer::LayerName; use crate::tenant::Generation; -#[cfg_attr(target_os = "macos", allow(unused_imports))] -use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; -use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}; +use crate::virtual_file::{on_fatal_io_error, IoBufferMut, MaybeFatalIo, VirtualFile}; use crate::TEMP_FILE_SUFFIX; use remote_storage::{DownloadError, DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath}; use utils::crashsafe::path_with_suffix_extension; @@ -205,7 +203,6 @@ async fn download_object<'a>( #[cfg(target_os = "linux")] crate::virtual_file::io_engine::IoEngine::TokioEpollUring => { use crate::virtual_file::owned_buffers_io; - use bytes::BytesMut; async { let destination_file = Arc::new( VirtualFile::create(dst_path, ctx) @@ -225,11 +222,12 @@ async fn download_object<'a>( // TODO: use vectored write (writev) once supported by tokio-epoll-uring. // There's chunks_vectored() on the stream. let (bytes_amount, destination_file) = async { - let mut buffered = owned_buffers_io::write::BufferedWriter::::new( - destination_file, - || BytesMut::with_capacity(super::BUFFER_SIZE), - ctx, - ); + let mut buffered = + owned_buffers_io::write::BufferedWriter::::new( + destination_file, + || IoBufferMut::with_capacity(super::BUFFER_SIZE), + ctx, + ); while let Some(res) = futures::StreamExt::next(&mut download.download_stream).await { @@ -237,7 +235,9 @@ async fn download_object<'a>( Ok(chunk) => chunk, Err(e) => return Err(e), }; - buffered.write_buffered(chunk.slice_len(), ctx).await?; + // TODO(yuchen): might have performance issue when using borrowed version? + // Problem: input is Bytes, does not satisify IO alignment requirement. + buffered.write_buffered_borrowed(&chunk, ctx).await?; } let inner = buffered.flush_and_into_inner(ctx).await?; Ok(inner) diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 40e0e97b0ca3..5a4f387a33eb 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -20,7 +20,7 @@ use camino::{Utf8Path, Utf8PathBuf}; use once_cell::sync::OnceCell; use owned_buffers_io::aligned_buffer::buffer::AlignedBuffer; use owned_buffers_io::aligned_buffer::{AlignedBufferMut, AlignedSlice, ConstAlign}; -use owned_buffers_io::io_buf_aligned::IoBufAlignedMut; +use owned_buffers_io::io_buf_aligned::{IoBufAligned, IoBufAlignedMut}; use owned_buffers_io::io_buf_ext::FullSlice; use pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT; use pageserver_api::shard::TenantShardId; @@ -212,7 +212,7 @@ impl VirtualFile { self.inner.read_exact_at_page(page, offset, ctx).await } - pub async fn write_all_at( + pub async fn write_all_at( &self, buf: FullSlice, offset: u64, @@ -1295,7 +1295,7 @@ impl Drop for VirtualFileInner { } impl OwnedAsyncWriter for VirtualFile { - async fn write_all_at( + async fn write_all_at( &self, buf: FullSlice, offset: u64, @@ -1417,7 +1417,7 @@ mod tests { } } } - async fn write_all_at( + async fn write_all_at( &self, buf: FullSlice, offset: u64, @@ -1619,10 +1619,10 @@ mod tests { ) .await?; file_b - .write_all_at(b"BAR".to_vec().slice_len(), 3, &ctx) + .write_all_at(IoBuffer::from(b"BAR").slice_len(), 3, &ctx) .await?; file_b - .write_all_at(b"FOO".to_vec().slice_len(), 0, &ctx) + .write_all_at(IoBuffer::from(b"FOO").slice_len(), 0, &ctx) .await?; assert_eq!(file_b.read_string_at(2, 3, &ctx).await?, "OBA"); diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs index 4474303b4ff5..ff5f2cd16336 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs @@ -3,7 +3,7 @@ use std::{ sync::Arc, }; -use super::{alignment::Alignment, raw::RawAlignedBuffer, AlignedBufferMut}; +use super::{alignment::Alignment, raw::RawAlignedBuffer, AlignedBufferMut, ConstAlign}; /// An shared, immutable aligned buffer type. #[derive(Clone, Debug)] @@ -114,6 +114,14 @@ impl PartialEq<[u8]> for AlignedBuffer { } } +impl From<&[u8; N]> for AlignedBuffer> { + fn from(value: &[u8; N]) -> Self { + let mut buf = AlignedBufferMut::with_capacity(N); + buf.extend_from_slice(value); + buf.freeze() + } +} + /// SAFETY: the underlying buffer references a stable memory region. unsafe impl tokio_epoll_uring::IoBuf for AlignedBuffer { fn stable_ptr(&self) -> *const u8 { diff --git a/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs b/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs index dba695196ebb..6ef016854852 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs @@ -1,9 +1,13 @@ -use tokio_epoll_uring::IoBufMut; +use tokio_epoll_uring::{IoBuf, IoBufMut}; -use crate::virtual_file::{IoBufferMut, PageWriteGuardBuf}; +use crate::virtual_file::{IoBuffer, IoBufferMut, PageWriteGuardBuf}; pub trait IoBufAlignedMut: IoBufMut {} +pub trait IoBufAligned: IoBuf {} + impl IoBufAlignedMut for IoBufferMut {} +impl IoBufAligned for IoBuffer {} + impl IoBufAlignedMut for PageWriteGuardBuf {} diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index 429138bc765d..d29c7af31627 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -10,12 +10,15 @@ use crate::{ virtual_file::{IoBuffer, IoBufferMut}, }; -use super::io_buf_ext::{FullSlice, IoBufExt}; +use super::{ + io_buf_aligned::IoBufAligned, + io_buf_ext::{FullSlice, IoBufExt}, +}; /// A trait for doing owned-buffer write IO. /// Think [`tokio::io::AsyncWrite`] but with owned buffers. pub trait OwnedAsyncWriter { - fn write_all_at( + fn write_all_at( &self, buf: FullSlice, offset: u64, @@ -53,7 +56,7 @@ pub struct BufferedWriter { impl BufferedWriter where B: Buffer + Send + 'static, - Buf: IoBuf + Send + Sync + Clone, + Buf: IoBufAligned + Send + Sync + Clone, W: OwnedAsyncWriter + Send + Sync + 'static + std::fmt::Debug, { pub fn new(writer: Arc, buf_new: impl Fn() -> B, ctx: &RequestContext) -> Self { @@ -110,8 +113,8 @@ where } /// Guarantees that if Ok() is returned, all bytes in `chunk` have been accepted. - #[cfg_attr(target_os = "macos", allow(dead_code))] - pub async fn write_buffered( + #[allow(dead_code)] + pub async fn write_buffered( &mut self, chunk: FullSlice, ctx: &RequestContext, @@ -259,7 +262,6 @@ impl Buffer for IoBufferMut { } fn extend_from_slice(&mut self, other: &[u8]) { - self.reserve(other.len()); IoBufferMut::extend_from_slice(self, other); } @@ -307,7 +309,7 @@ mod tests { } impl OwnedAsyncWriter for RecorderWriter { - async fn write_all_at( + async fn write_all_at( &self, buf: FullSlice, offset: u64, @@ -327,8 +329,10 @@ mod tests { macro_rules! write { ($writer:ident, $data:literal) => {{ + let mut buf = crate::virtual_file::IoBufferMut::with_capacity(2); + buf.extend_from_slice($data); $writer - .write_buffered(::bytes::Bytes::from_static($data).slice_len(), &test_ctx()) + .write_buffered(buf.freeze().slice_len(), &test_ctx()) .await?; }}; } diff --git a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs index a9c1404712fc..49ad954725c7 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs @@ -1,9 +1,11 @@ use std::sync::Arc; use tokio::sync::mpsc; -use tokio_epoll_uring::IoBuf; -use crate::{context::RequestContext, virtual_file::owned_buffers_io::io_buf_ext::FullSlice}; +use crate::{ + context::RequestContext, + virtual_file::owned_buffers_io::{io_buf_aligned::IoBufAligned, io_buf_ext::FullSlice}, +}; use super::{Buffer, OwnedAsyncWriter}; @@ -68,7 +70,7 @@ pub struct FlushBackgroundTask { impl FlushBackgroundTask where - Buf: IoBuf + Send + Sync, + Buf: IoBufAligned + Send + Sync, W: OwnedAsyncWriter + Sync + 'static, { fn new( @@ -105,7 +107,7 @@ where impl FlushHandle where - Buf: IoBuf + Send + Sync + Clone, + Buf: IoBufAligned + Send + Sync + Clone, W: OwnedAsyncWriter + Send + Sync + 'static + std::fmt::Debug, { /// Spawns a new background flush task and obtains a handle. From 20e6a0c8a2170cfe8ea51609e2949a727b9fc22d Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Tue, 12 Nov 2024 01:26:29 +0000 Subject: [PATCH 12/35] use open_with_options_v2 (O_DIRECT) for ephemeral file Signed-off-by: Yuchen Liang --- pageserver/src/tenant/ephemeral_file.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 9979d1980fce..ffa37a587d2d 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -52,7 +52,7 @@ impl EphemeralFile { ))); let file = Arc::new( - VirtualFile::open_with_options( + VirtualFile::open_with_options_v2( &filename, virtual_file::OpenOptions::new() .read(true) From ffd88ede3844aa66ca85ba3f2f69ee5d96e67e03 Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Tue, 12 Nov 2024 15:59:24 +0000 Subject: [PATCH 13/35] fix clippy Signed-off-by: Yuchen Liang --- pageserver/src/tenant/ephemeral_file.rs | 2 +- .../owned_buffers_io/aligned_buffer/buffer_mut.rs | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index ffa37a587d2d..ea2b5809c128 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -410,7 +410,7 @@ mod tests { } let file_contents = std::fs::read(file.buffered_writer.as_inner().path()).unwrap(); - assert!(file_contents == &content[0..cap] || file_contents == &content[0..cap * 2]); + assert!(file_contents == content[0..cap] || file_contents == content[0..cap * 2]); let maybe_flushed_buffer_contents = file.buffered_writer.inspect_maybe_flushed().unwrap(); assert_eq!(maybe_flushed_buffer_contents, &content[cap..cap * 2]); diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs index 08eafcb07726..af8f463ba722 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs @@ -141,6 +141,7 @@ impl AlignedBufferMut { let cnt = extend.len(); self.reserve(cnt); + // SAFETY: we already reserved additional `cnt` bytes, safe to perform memcpy. unsafe { let dst = self.spare_capacity_mut(); // Reserved above @@ -148,7 +149,7 @@ impl AlignedBufferMut { core::ptr::copy_nonoverlapping(extend.as_ptr(), dst.as_mut_ptr().cast(), cnt); } - + // SAFETY: We do have at least `cnt` bytes remaining before advance. unsafe { bytes::BufMut::advance_mut(self, cnt); } @@ -156,6 +157,8 @@ impl AlignedBufferMut { #[inline] fn spare_capacity_mut(&mut self) -> &mut [MaybeUninit] { + // SAFETY: we guarantees that the `Self::capacity()` bytes from + // `Self::as_mut_ptr()` are allocated. unsafe { let ptr = self.as_mut_ptr().add(self.len()); let len = self.capacity() - self.len(); From 6844b5f460b4058f6e92eaa758f2828f6bb0ba31 Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Tue, 12 Nov 2024 16:52:46 +0000 Subject: [PATCH 14/35] add comments; make read buffering works with write_buffered (owned version) Signed-off-by: Yuchen Liang --- .../owned_buffers_io/aligned_buffer/buffer.rs | 2 ++ .../aligned_buffer/buffer_mut.rs | 3 ++ .../owned_buffers_io/io_buf_aligned.rs | 2 ++ .../virtual_file/owned_buffers_io/write.rs | 28 +++++++++++++---- .../owned_buffers_io/write/flush.rs | 30 +++++++++++++++---- 5 files changed, 54 insertions(+), 11 deletions(-) diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs index ff5f2cd16336..a5c26cd7463a 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs @@ -88,6 +88,8 @@ impl AlignedBuffer { } } + /// Returns the mutable aligned buffer, if the immutable aligned buffer + /// has exactly one strong reference. Otherwise returns `None`. pub fn into_mut(self) -> Option> { let raw = Arc::into_inner(self.raw)?; Some(AlignedBufferMut::from_raw(raw)) diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs index af8f463ba722..d2f5e206bb09 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs @@ -49,6 +49,7 @@ impl AlignedBufferMut> { } impl AlignedBufferMut { + /// Constructs a mutable aligned buffer from raw. pub(super) fn from_raw(raw: RawAlignedBuffer) -> Self { AlignedBufferMut { raw } } @@ -136,6 +137,7 @@ impl AlignedBufferMut { AlignedBuffer::from_raw(self.raw, 0..len) } + /// Clones and appends all elements in a slice to the buffer. Reserves additional capacity as needed. #[inline] pub fn extend_from_slice(&mut self, extend: &[u8]) { let cnt = extend.len(); @@ -155,6 +157,7 @@ impl AlignedBufferMut { } } + /// Returns the remaining spare capacity of the vector as a slice of `MaybeUninit`. #[inline] fn spare_capacity_mut(&mut self) -> &mut [MaybeUninit] { // SAFETY: we guarantees that the `Self::capacity()` bytes from diff --git a/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs b/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs index 6ef016854852..4ea6b1774447 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs @@ -2,8 +2,10 @@ use tokio_epoll_uring::{IoBuf, IoBufMut}; use crate::virtual_file::{IoBuffer, IoBufferMut, PageWriteGuardBuf}; +/// A marker trait for a mutable aligned buffer type. pub trait IoBufAlignedMut: IoBufMut {} +/// A marker trait for an aligned buffer type. pub trait IoBufAligned: IoBuf {} impl IoBufAlignedMut for IoBufferMut {} diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index d29c7af31627..7566624275cf 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -17,6 +17,7 @@ use super::{ /// A trait for doing owned-buffer write IO. /// Think [`tokio::io::AsyncWrite`] but with owned buffers. +/// The owned buffers need to be aligned due to Direct IO requirements. pub trait OwnedAsyncWriter { fn write_all_at( &self, @@ -49,7 +50,9 @@ pub struct BufferedWriter { /// /// In these exceptional cases, it's `None`. mutable: Option, + /// A handle to the background flush task for writting data to disk. flush_handle: FlushHandle, + /// The number of bytes submitted to the background task. bytes_amount: u64, } @@ -59,6 +62,9 @@ where Buf: IoBufAligned + Send + Sync + Clone, W: OwnedAsyncWriter + Send + Sync + 'static + std::fmt::Debug, { + /// Creates a new buffered writer. + /// + /// The `buf_new` function provides a way to initialize the owned buffers used by this writer. pub fn new(writer: Arc, buf_new: impl Fn() -> B, ctx: &RequestContext) -> Self { Self { writer: writer.clone(), @@ -72,6 +78,7 @@ where &self.writer } + /// Returns the number of bytes submitted to the background flush task. pub fn bytes_written(&self) -> u64 { self.bytes_amount } @@ -82,6 +89,7 @@ where } /// Gets a reference to the maybe flushed read-only buffer. + /// Returns `None` if the writer has not submitted any flush request. pub fn inspect_maybe_flushed(&self) -> Option<&Buf> { self.flush_handle.maybe_flushed.as_ref() } @@ -91,7 +99,7 @@ where mut self, ctx: &RequestContext, ) -> std::io::Result<(u64, Arc)> { - self.flush(ctx).await?; + self.flush(true, ctx).await?; let Self { mutable: buf, @@ -125,7 +133,7 @@ where // avoid memcpy for the middle of the chunk if chunk.len() >= self.mutable().cap() { // TODO(yuchen): do we still want to keep the bypass path? - self.flush(ctx).await?; + self.flush(false, ctx).await?; // do a big write, bypassing `buf` assert_eq!( self.mutable @@ -156,7 +164,7 @@ where slice = &slice[n..]; if buf.pending() >= buf.cap() { assert_eq!(buf.pending(), buf.cap()); - self.flush(ctx).await?; + self.flush(true, ctx).await?; } } assert!(slice.is_empty(), "by now we should have drained the chunk"); @@ -183,20 +191,27 @@ where chunk = &chunk[n..]; if buf.pending() >= buf.cap() { assert_eq!(buf.pending(), buf.cap()); - self.flush(ctx).await?; + self.flush(true, ctx).await?; } } Ok(chunk_len) } - async fn flush(&mut self, _ctx: &RequestContext) -> std::io::Result<()> { + async fn flush( + &mut self, + save_buf_for_read: bool, + _ctx: &RequestContext, + ) -> std::io::Result<()> { let buf = self.mutable.take().expect("must not use after an error"); let buf_len = buf.pending(); if buf_len == 0 { self.mutable = Some(buf); return Ok(()); } - let recycled = self.flush_handle.flush(buf, self.bytes_amount).await?; + let recycled = self + .flush_handle + .flush(buf, self.bytes_amount, save_buf_for_read) + .await?; self.bytes_amount += u64::try_from(buf_len).unwrap(); self.mutable = Some(recycled); Ok(()) @@ -273,6 +288,7 @@ impl Buffer for IoBufferMut { self.freeze().slice_len() } + /// Caller should make sure that `iobuf` only have one strong reference before invoking this method. fn reuse_after_flush(iobuf: Self::IoBuf) -> Self { let mut recycled = iobuf .into_mut() diff --git a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs index 49ad954725c7..cb4d90bfce2f 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs @@ -56,14 +56,17 @@ pub struct FlushHandleInner { /// A handle to the flush task. pub struct FlushHandle { inner: Option>, - /// Buffer for serving tail reads. + /// Immutable buffer for serving tail reads. + /// `None` if no flush request has been submitted. pub(super) maybe_flushed: Option, } +/// A background task for flushing data to disk. pub struct FlushBackgroundTask { /// A bi-directional channel that receives (buffer, offset) for writes, /// and send back recycled buffer. channel: Duplex, (FullSlice, u64)>, + /// A writter for persisting data to disk. writer: Arc, ctx: RequestContext, } @@ -73,6 +76,7 @@ where Buf: IoBufAligned + Send + Sync, W: OwnedAsyncWriter + Sync + 'static, { + /// Creates a new background flush task. fn new( channel: Duplex, (FullSlice, u64)>, file: Arc, @@ -87,14 +91,17 @@ where /// Runs the background flush task. async fn run(mut self, slice: FullSlice) -> std::io::Result> { + // Sends the extra buffer back to the handle. self.channel.send(slice).await.map_err(|_| { std::io::Error::new(std::io::ErrorKind::BrokenPipe, "flush handle closed early") })?; // Exit condition: channel is closed and there is no remaining buffer to be flushed while let Some((slice, offset)) = self.channel.recv().await { + // Write slice to disk at `offset`. let slice = self.writer.write_all_at(slice, offset, &self.ctx).await?; + // Sends the buffer back to the handle for reuse. The handle is in charged of cleaning the buffer. if self.channel.send(slice).await.is_err() { // Although channel is closed. Still need to finish flushing the remaining buffers. continue; @@ -131,17 +138,28 @@ where /// Submits a buffer to be flushed in the background task. /// Returns a buffer that completed flushing for re-use, length reset to 0, capacity unchanged. - pub async fn flush(&mut self, buf: B, offset: u64) -> std::io::Result + /// If `save_buf_for_read` is true, then we save the buffer in `Self::maybe_flushed`, otherwise + /// clear `maybe_flushed`. + pub async fn flush( + &mut self, + buf: B, + offset: u64, + save_buf_for_read: bool, + ) -> std::io::Result where B: Buffer + Send + 'static, { let freezed = buf.flush(); - self.maybe_flushed - .replace(freezed.as_raw_slice().get_ref().clone()); + // Saves a buffer for read while flushing. This also removes reference to the old buffer. + self.maybe_flushed = if save_buf_for_read { + Some(freezed.as_raw_slice().get_ref().clone()) + } else { + None + }; + // Submits the buffer to the background task. let submit = self.inner_mut().channel.send((freezed, offset)).await; - if submit.is_err() { return self.handle_error().await; } @@ -168,6 +186,8 @@ where handle.join_handle.await.unwrap() } + /// Gets a mutable reference to the inner handle. Panics if [`Self::inner`] is `None`. + /// This only happens if the handle is used after an error. fn inner_mut(&mut self) -> &mut FlushHandleInner { self.inner .as_mut() From 990bc65a205d76dd2ec2b766e102800155d4c1e3 Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Fri, 15 Nov 2024 15:42:58 +0000 Subject: [PATCH 15/35] review: https://github.com/neondatabase/neon/pull/9693#discussion_r1840293759 Signed-off-by: Yuchen Liang --- pageserver/src/tenant/ephemeral_file.rs | 18 +++++++++--------- .../src/virtual_file/owned_buffers_io/write.rs | 18 +++++++++--------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index ea2b5809c128..1a7becd6f780 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -167,7 +167,7 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral dst: tokio_epoll_uring::Slice, ctx: &'a RequestContext, ) -> std::io::Result<(tokio_epoll_uring::Slice, usize)> { - let flushed_offset = self.buffered_writer.bytes_written(); + let submitted_offset = self.buffered_writer.bytes_submitted(); let mutable = self.buffered_writer.inspect_mutable(); let mutable = &mutable[0..mutable.pending()]; @@ -208,23 +208,23 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral ( Range( start, - std::cmp::min(end, flushed_offset.saturating_sub(TAIL_SZ as u64)), + std::cmp::min(end, submitted_offset.saturating_sub(TAIL_SZ as u64)), ), Range( - std::cmp::max(start, flushed_offset.saturating_sub(TAIL_SZ as u64)), - std::cmp::min(end, flushed_offset), + std::cmp::max(start, submitted_offset.saturating_sub(TAIL_SZ as u64)), + std::cmp::min(end, submitted_offset), ), ) } else { ( - Range(start, std::cmp::min(end, flushed_offset)), + Range(start, std::cmp::min(end, submitted_offset)), // zero len - Range(flushed_offset, u64::MIN), + Range(submitted_offset, u64::MIN), ) } }; - let mutable_range = Range(std::cmp::max(start, flushed_offset), end); + let mutable_range = Range(std::cmp::max(start, submitted_offset), end); let dst = if written_range.len() > 0 { let file: &VirtualFile = self.buffered_writer.as_inner(); @@ -240,7 +240,7 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral let dst = if maybe_flushed_range.len() > 0 { let offset_in_buffer = maybe_flushed_range .0 - .checked_sub(flushed_offset.saturating_sub(TAIL_SZ as u64)) + .checked_sub(submitted_offset.saturating_sub(TAIL_SZ as u64)) .unwrap() .into_usize(); // Checked previously the buffer is Some. @@ -265,7 +265,7 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral let dst = if mutable_range.len() > 0 { let offset_in_buffer = mutable_range .0 - .checked_sub(flushed_offset) + .checked_sub(submitted_offset) .unwrap() .into_usize(); let to_copy = diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index 7566624275cf..edfaca5a79db 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -53,7 +53,7 @@ pub struct BufferedWriter { /// A handle to the background flush task for writting data to disk. flush_handle: FlushHandle, /// The number of bytes submitted to the background task. - bytes_amount: u64, + bytes_submitted: u64, } impl BufferedWriter @@ -70,7 +70,7 @@ where writer: writer.clone(), mutable: Some(buf_new()), flush_handle: FlushHandle::spawn_new(writer, buf_new(), ctx.attached_child()), - bytes_amount: 0, + bytes_submitted: 0, } } @@ -79,8 +79,8 @@ where } /// Returns the number of bytes submitted to the background flush task. - pub fn bytes_written(&self) -> u64 { - self.bytes_amount + pub fn bytes_submitted(&self) -> u64 { + self.bytes_submitted } /// Panics if used after any of the write paths returned an error @@ -105,7 +105,7 @@ where mutable: buf, writer, mut flush_handle, - bytes_amount, + bytes_submitted: bytes_amount, } = self; flush_handle.shutdown().await?; assert!(buf.is_some()); @@ -145,11 +145,11 @@ where let chunk = OwnedAsyncWriter::write_all_at( self.writer.as_ref(), FullSlice::must_new(chunk), - self.bytes_amount, + self.bytes_submitted, ctx, ) .await?; - self.bytes_amount += u64::try_from(chunk_len).unwrap(); + self.bytes_submitted += u64::try_from(chunk_len).unwrap(); return Ok((chunk_len, chunk)); } // in-memory copy the < BUFFER_SIZED tail of the chunk @@ -210,9 +210,9 @@ where } let recycled = self .flush_handle - .flush(buf, self.bytes_amount, save_buf_for_read) + .flush(buf, self.bytes_submitted, save_buf_for_read) .await?; - self.bytes_amount += u64::try_from(buf_len).unwrap(); + self.bytes_submitted += u64::try_from(buf_len).unwrap(); self.mutable = Some(recycled); Ok(()) } From 5acc61bdbcda38c93e6825da91a18706d3f1a7f3 Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Mon, 18 Nov 2024 23:52:52 +0000 Subject: [PATCH 16/35] move duplex to utils; make flush behavior controllable in test Signed-off-by: Yuchen Liang --- libs/utils/src/sync.rs | 1 + libs/utils/src/sync/duplex.rs | 1 + libs/utils/src/sync/duplex/mpsc.rs | 36 +++ pageserver/src/tenant/ephemeral_file.rs | 112 ++++--- .../virtual_file/owned_buffers_io/write.rs | 48 ++- .../owned_buffers_io/write/flush.rs | 281 ++++++++++++------ 6 files changed, 344 insertions(+), 135 deletions(-) create mode 100644 libs/utils/src/sync/duplex.rs create mode 100644 libs/utils/src/sync/duplex/mpsc.rs diff --git a/libs/utils/src/sync.rs b/libs/utils/src/sync.rs index 2ee8f3544911..064c675a16ef 100644 --- a/libs/utils/src/sync.rs +++ b/libs/utils/src/sync.rs @@ -1,3 +1,4 @@ pub mod heavier_once_cell; +pub mod duplex; pub mod gate; diff --git a/libs/utils/src/sync/duplex.rs b/libs/utils/src/sync/duplex.rs new file mode 100644 index 000000000000..fac79297a086 --- /dev/null +++ b/libs/utils/src/sync/duplex.rs @@ -0,0 +1 @@ +pub mod mpsc; diff --git a/libs/utils/src/sync/duplex/mpsc.rs b/libs/utils/src/sync/duplex/mpsc.rs new file mode 100644 index 000000000000..56b4e6d2b331 --- /dev/null +++ b/libs/utils/src/sync/duplex/mpsc.rs @@ -0,0 +1,36 @@ +use tokio::sync::mpsc; + +/// A bi-directional channel. +pub struct Duplex { + pub tx: mpsc::Sender, + pub rx: mpsc::Receiver, +} + +/// Creates a bi-directional channel. +/// +/// The channel will buffer up to the provided number of messages. Once the buffer is full, +/// attempts to send new messages will wait until a message is received from the channel. +/// The provided buffer capacity must be at least 1. +pub fn channel(buffer: usize) -> (Duplex, Duplex) { + let (tx_a, rx_a) = mpsc::channel::(buffer); + let (tx_b, rx_b) = mpsc::channel::(buffer); + + (Duplex { tx: tx_a, rx: rx_b }, Duplex { tx: tx_b, rx: rx_a }) +} + +impl Duplex { + /// Sends a value, waiting until there is capacity. + /// + /// A successful send occurs when it is determined that the other end of the channel has not hung up already. + pub async fn send(&self, x: S) -> Result<(), mpsc::error::SendError> { + self.tx.send(x).await + } + + /// Receives the next value for this receiver. + /// + /// This method returns `None` if the channel has been closed and there are + /// no remaining messages in the channel's buffer. + pub async fn recv(&mut self) -> Option { + self.rx.recv().await + } +} diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 1a7becd6f780..53777fc352df 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -131,6 +131,18 @@ impl EphemeralFile { srcbuf: &[u8], ctx: &RequestContext, ) -> std::io::Result { + let (pos, control) = self.write_raw_controlled(srcbuf, ctx).await?; + if let Some(control) = control { + control.release().await; + } + Ok(pos) + } + + pub(crate) async fn write_raw_controlled( + &mut self, + srcbuf: &[u8], + ctx: &RequestContext, + ) -> std::io::Result<(u64, Option)> { let pos = self.bytes_written; let new_bytes_written = pos.checked_add(srcbuf.len().into_u64()).ok_or_else(|| { @@ -144,9 +156,9 @@ impl EphemeralFile { })?; // Write the payload - let nwritten = self + let (nwritten, control) = self .buffered_writer - .write_buffered_borrowed(srcbuf, ctx) + .write_buffered_borrowed_controlled(srcbuf, ctx) .await?; assert_eq!( nwritten, @@ -156,7 +168,7 @@ impl EphemeralFile { self.bytes_written = new_bytes_written; - Ok(pos) + Ok((pos, control)) } } @@ -381,7 +393,9 @@ mod tests { .await .unwrap(); - let cap = file.buffered_writer.inspect_mutable().capacity(); + let mutable = file.buffered_writer.inspect_mutable(); + let cap = mutable.capacity(); + let align = mutable.align(); let write_nbytes = cap * 2 + cap / 2; @@ -391,26 +405,33 @@ mod tests { .collect(); let mut value_offsets = Vec::new(); - for i in 0..write_nbytes { - let off = file.write_raw(&content[i..i + 1], &ctx).await.unwrap(); + for range in (0..write_nbytes) + .step_by(align) + .map(|start| start..(start + align).min(write_nbytes)) + { + let off = file.write_raw(&content[range], &ctx).await.unwrap(); value_offsets.push(off); } - assert!(file.len() as usize == write_nbytes); - for i in 0..write_nbytes { - assert_eq!(value_offsets[i], i.into_u64()); - let buf = IoBufferMut::with_capacity(1); + assert_eq!(file.len() as usize, write_nbytes); + for (i, range) in (0..write_nbytes) + .step_by(align) + .map(|start| start..(start + align).min(write_nbytes)) + .enumerate() + { + assert_eq!(value_offsets[i], range.start.into_u64()); + let buf = IoBufferMut::with_capacity(range.len()); let (buf_slice, nread) = file - .read_exact_at_eof_ok(i.into_u64(), buf.slice_full(), &ctx) + .read_exact_at_eof_ok(range.start.into_u64(), buf.slice_full(), &ctx) .await .unwrap(); let buf = buf_slice.into_inner(); - assert_eq!(nread, 1); - assert_eq!(&buf, &content[i..i + 1]); + assert_eq!(nread, range.len()); + assert_eq!(&buf, &content[range]); } let file_contents = std::fs::read(file.buffered_writer.as_inner().path()).unwrap(); - assert!(file_contents == content[0..cap] || file_contents == content[0..cap * 2]); + assert!(file_contents == content[0..cap * 2]); let maybe_flushed_buffer_contents = file.buffered_writer.inspect_maybe_flushed().unwrap(); assert_eq!(maybe_flushed_buffer_contents, &content[cap..cap * 2]); @@ -430,7 +451,7 @@ mod tests { .await .unwrap(); - // mutable buffer and maybe_flushed buffer each has cap. + // mutable buffer and maybe_flushed buffer each has `cap` bytes. let cap = file.buffered_writer.inspect_mutable().capacity(); let content: Vec = rand::thread_rng() @@ -446,8 +467,9 @@ mod tests { &content[0..cap * 2 + cap / 2] ); let md = file.buffered_writer.as_inner().path().metadata().unwrap(); - assert!( - md.len() == cap.into_u64() || md.len() == 2 * cap.into_u64(), + assert_eq!( + md.len(), + 2 * cap.into_u64(), "buffered writer requires one write to be flushed if we write 2.5x buffer capacity" ); assert_eq!( @@ -477,14 +499,15 @@ mod tests { .await .unwrap(); - let cap = file.buffered_writer.inspect_mutable().capacity(); - + let mutable = file.buffered_writer.inspect_mutable(); + let cap = mutable.capacity(); + let align = mutable.align(); let content: Vec = rand::thread_rng() .sample_iter(rand::distributions::Standard) .take(cap * 2 + cap / 2) .collect(); - file.write_raw(&content, &ctx).await.unwrap(); + let (_, control) = file.write_raw_controlled(&content, &ctx).await.unwrap(); let test_read = |start: usize, len: usize| { let file = &file; @@ -504,24 +527,37 @@ mod tests { } }; + let test_read_all_offset_combinations = || { + async move { + test_read(align, align).await; + // border onto edge of file + test_read(cap - align, align).await; + // read across file and buffer + test_read(cap - align, 2 * align).await; + // stay from start of maybe flushed buffer + test_read(cap, align).await; + // completely within maybe flushed buffer + test_read(cap + align, align).await; + // border onto edge of maybe flushed buffer. + test_read(cap * 2 - align, align).await; + // read across maybe flushed and mutable buffer + test_read(cap * 2 - align, 2 * align).await; + // read across three segments + test_read(cap - align, cap + 2 * align).await; + // completely within mutable buffer + test_read(cap * 2 + align, align).await; + } + }; + // completely within the file range - assert!(20 < cap, "test assumption"); - test_read(10, 10).await; - // border onto edge of file - test_read(cap - 10, 10).await; - // read across file and buffer - test_read(cap - 10, 20).await; - // stay from start of maybe flushed buffer - test_read(cap, 10).await; - // completely within maybe flushed buffer - test_read(cap + 10, 10).await; - // border onto edge of maybe flushed buffer. - test_read(cap * 2 - 10, 10).await; - // read across maybe flushed and mutable buffer - test_read(cap * 2 - 10, 20).await; - // read across three segments - test_read(cap - 10, cap + 20).await; - // completely within mutable buffer - test_read(cap * 2 + 10, 10).await; + assert!(align < cap, "test assumption"); + assert!(cap % align == 0); + let not_started = control.unwrap().as_not_started(); + + test_read_all_offset_combinations().await; + let in_progress = not_started.ready_to_flush(); + test_read_all_offset_combinations().await; + in_progress.wait_until_flush_is_done().await; + test_read_all_offset_combinations().await; } } diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index edfaca5a79db..3e716cb3e507 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -15,6 +15,8 @@ use super::{ io_buf_ext::{FullSlice, IoBufExt}, }; +pub(crate) use flush::FlushControl; + /// A trait for doing owned-buffer write IO. /// Think [`tokio::io::AsyncWrite`] but with owned buffers. /// The owned buffers need to be aligned due to Direct IO requirements. @@ -133,7 +135,10 @@ where // avoid memcpy for the middle of the chunk if chunk.len() >= self.mutable().cap() { // TODO(yuchen): do we still want to keep the bypass path? - self.flush(false, ctx).await?; + let control = self.flush(false, ctx).await?; + if let Some(control) = control { + control.release().await; + } // do a big write, bypassing `buf` assert_eq!( self.mutable @@ -155,7 +160,11 @@ where // in-memory copy the < BUFFER_SIZED tail of the chunk assert!(chunk.len() < self.mutable().cap()); let mut slice = &chunk[..]; + let mut control: Option = None; while !slice.is_empty() { + if let Some(control) = control.take() { + control.release().await; + } let buf = self.mutable.as_mut().expect("must not use after an error"); let need = buf.cap() - buf.pending(); let have = slice.len(); @@ -164,9 +173,12 @@ where slice = &slice[n..]; if buf.pending() >= buf.cap() { assert_eq!(buf.pending(), buf.cap()); - self.flush(true, ctx).await?; + control = self.flush(true, ctx).await?; } } + if let Some(control) = control.take() { + control.release().await; + } assert!(slice.is_empty(), "by now we should have drained the chunk"); Ok((chunk_len, FullSlice::must_new(chunk))) } @@ -178,10 +190,24 @@ where /// for large writes. pub async fn write_buffered_borrowed( &mut self, - mut chunk: &[u8], + chunk: &[u8], ctx: &RequestContext, ) -> std::io::Result { + let (len, control) = self.write_buffered_borrowed_controlled(chunk, ctx).await?; + if let Some(control) = control { + control.release().await; + } + Ok(len) + } + + /// In addition to bytes submitted in this write, also returns a handle that can control the flush behavior. + pub async fn write_buffered_borrowed_controlled( + &mut self, + mut chunk: &[u8], + ctx: &RequestContext, + ) -> std::io::Result<(usize, Option)> { let chunk_len = chunk.len(); + let mut control: Option = None; while !chunk.is_empty() { let buf = self.mutable.as_mut().expect("must not use after an error"); let need = buf.cap() - buf.pending(); @@ -191,30 +217,34 @@ where chunk = &chunk[n..]; if buf.pending() >= buf.cap() { assert_eq!(buf.pending(), buf.cap()); - self.flush(true, ctx).await?; + if let Some(control) = control.take() { + control.release().await; + } + control = self.flush(true, ctx).await?; } } - Ok(chunk_len) + Ok((chunk_len, control)) } + #[must_use] async fn flush( &mut self, save_buf_for_read: bool, _ctx: &RequestContext, - ) -> std::io::Result<()> { + ) -> std::io::Result> { let buf = self.mutable.take().expect("must not use after an error"); let buf_len = buf.pending(); if buf_len == 0 { self.mutable = Some(buf); - return Ok(()); + return Ok(None); } - let recycled = self + let (recycled, flush_control) = self .flush_handle .flush(buf, self.bytes_submitted, save_buf_for_read) .await?; self.bytes_submitted += u64::try_from(buf_len).unwrap(); self.mutable = Some(recycled); - Ok(()) + Ok(Some(flush_control)) } } diff --git a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs index cb4d90bfce2f..249c32e41084 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs @@ -1,6 +1,6 @@ use std::sync::Arc; -use tokio::sync::mpsc; +use utils::sync::duplex; use crate::{ context::RequestContext, @@ -9,106 +9,105 @@ use crate::{ use super::{Buffer, OwnedAsyncWriter}; -/// A bi-directional channel. -pub struct Duplex { - pub tx: mpsc::Sender, - pub rx: mpsc::Receiver, -} - -/// Creates a bi-directional channel. -/// -/// The channel will buffer up to the provided number of messages. Once the buffer is full, -/// attempts to send new messages will wait until a message is received from the channel. -/// The provided buffer capacity must be at least 1. -pub fn duplex_channel(buffer: usize) -> (Duplex, Duplex) { - let (tx_a, rx_a) = mpsc::channel::(buffer); - let (tx_b, rx_b) = mpsc::channel::(buffer); - - (Duplex { tx: tx_a, rx: rx_b }, Duplex { tx: tx_b, rx: rx_a }) -} - -impl Duplex { - /// Sends a value, waiting until there is capacity. - /// - /// A successful send occurs when it is determined that the other end of the channel has not hung up already. - pub async fn send(&self, x: S) -> Result<(), mpsc::error::SendError> { - self.tx.send(x).await - } - - /// Receives the next value for this receiver. - /// - /// This method returns `None` if the channel has been closed and there are - /// no remaining messages in the channel's buffer. - pub async fn recv(&mut self) -> Option { - self.rx.recv().await - } +/// A handle to the flush task. +pub struct FlushHandle { + inner: Option>, + /// Immutable buffer for serving tail reads. + /// `None` if no flush request has been submitted. + pub(super) maybe_flushed: Option, } // TODO(yuchen): special actions in drop to clean up the join handle? pub struct FlushHandleInner { /// A bi-directional channel that sends (buffer, offset) for writes, /// and receives recyled buffer. - channel: Duplex<(FullSlice, u64), FullSlice>, + channel: duplex::mpsc::Duplex, FullSlice>, /// Join handle for the background flush task. join_handle: tokio::task::JoinHandle>>, } -/// A handle to the flush task. -pub struct FlushHandle { - inner: Option>, - /// Immutable buffer for serving tail reads. - /// `None` if no flush request has been submitted. - pub(super) maybe_flushed: Option, +struct FlushRequest { + slice: FullSlice, + offset: u64, + #[cfg(test)] + ready_to_flush_rx: tokio::sync::oneshot::Receiver<()>, + #[cfg(test)] + done_flush_tx: tokio::sync::oneshot::Sender<()>, } -/// A background task for flushing data to disk. -pub struct FlushBackgroundTask { - /// A bi-directional channel that receives (buffer, offset) for writes, - /// and send back recycled buffer. - channel: Duplex, (FullSlice, u64)>, - /// A writter for persisting data to disk. - writer: Arc, - ctx: RequestContext, +#[cfg(not(test))] +fn new_flush_op(slice: FullSlice, offset: u64) -> (FlushRequest, FlushControl) { + let request = FlushRequest { slice, offset }; + let control = FlushControl::untracked(); + + (request, control) } -impl FlushBackgroundTask -where - Buf: IoBufAligned + Send + Sync, - W: OwnedAsyncWriter + Sync + 'static, -{ - /// Creates a new background flush task. - fn new( - channel: Duplex, (FullSlice, u64)>, - file: Arc, - ctx: RequestContext, +#[cfg(test)] +fn new_flush_op(slice: FullSlice, offset: u64) -> (FlushRequest, FlushControl) { + let (ready_to_flush_tx, ready_to_flush_rx) = tokio::sync::oneshot::channel(); + let (done_flush_tx, done_flush_rx) = tokio::sync::oneshot::channel(); + let control = FlushControl::not_started(ready_to_flush_tx, done_flush_rx); + + let request = FlushRequest { + slice, + offset, + ready_to_flush_rx, + done_flush_tx, + }; + (request, control) +} + +pub enum FlushStartState { + #[cfg(not(test))] + Untracked, + #[cfg(test)] + NotStarted(FlushNotStarted), +} + +pub(crate) struct FlushControl { + state: FlushStartState, +} + +impl FlushControl { + #[cfg(test)] + fn not_started( + ready_to_flush_tx: tokio::sync::oneshot::Sender<()>, + done_flush_rx: tokio::sync::oneshot::Receiver<()>, ) -> Self { - FlushBackgroundTask { - channel, - writer: file, - ctx, + FlushControl { + state: FlushStartState::NotStarted(FlushNotStarted { + ready_to_flush_tx, + done_flush_rx, + }), + } + } + #[cfg(not(test))] + fn untracked() -> Self { + FlushControl { + state: FlushStartState::Untracked, } } - /// Runs the background flush task. - async fn run(mut self, slice: FullSlice) -> std::io::Result> { - // Sends the extra buffer back to the handle. - self.channel.send(slice).await.map_err(|_| { - std::io::Error::new(std::io::ErrorKind::BrokenPipe, "flush handle closed early") - })?; - - // Exit condition: channel is closed and there is no remaining buffer to be flushed - while let Some((slice, offset)) = self.channel.recv().await { - // Write slice to disk at `offset`. - let slice = self.writer.write_all_at(slice, offset, &self.ctx).await?; + #[cfg(test)] + pub(crate) fn as_not_started(self) -> FlushNotStarted { + match self.state { + FlushStartState::NotStarted(not_started) => not_started, + } + } - // Sends the buffer back to the handle for reuse. The handle is in charged of cleaning the buffer. - if self.channel.send(slice).await.is_err() { - // Although channel is closed. Still need to finish flushing the remaining buffers. - continue; + pub async fn release(self) { + match self.state { + #[cfg(not(test))] + FlushStartState::Untracked => (), + #[cfg(test)] + FlushStartState::NotStarted(not_started) => { + not_started + .ready_to_flush() + .wait_until_flush_is_done() + .await; } } - - Ok(self.writer) } } @@ -122,7 +121,7 @@ where where B: Buffer + Send + 'static, { - let (front, back) = duplex_channel(2); + let (front, back) = duplex::mpsc::channel(1); let bg = FlushBackgroundTask::new(back, file, ctx); let join_handle = tokio::spawn(async move { bg.run(buf.flush()).await }); @@ -145,21 +144,23 @@ where buf: B, offset: u64, save_buf_for_read: bool, - ) -> std::io::Result + ) -> std::io::Result<(B, FlushControl)> where B: Buffer + Send + 'static, { - let freezed = buf.flush(); + let slice = buf.flush(); // Saves a buffer for read while flushing. This also removes reference to the old buffer. self.maybe_flushed = if save_buf_for_read { - Some(freezed.as_raw_slice().get_ref().clone()) + Some(slice.as_raw_slice().get_ref().clone()) } else { None }; + let (request, flush_control) = new_flush_op(slice, offset); + // Submits the buffer to the background task. - let submit = self.inner_mut().channel.send((freezed, offset)).await; + let submit = self.inner_mut().channel.send(request).await; if submit.is_err() { return self.handle_error().await; } @@ -171,9 +172,9 @@ where // The only other place that could hold a reference to the recycled buffer // is in `Self::maybe_flushed`, but we have already replace it with the new buffer. - Ok(Buffer::reuse_after_flush( - recycled.into_raw_slice().into_inner(), - )) + + let recycled = Buffer::reuse_after_flush(recycled.into_raw_slice().into_inner()); + Ok((recycled, flush_control)) } /// Cleans up the channel, join the flush task. @@ -198,3 +199,107 @@ where Err(self.shutdown().await.unwrap_err()) } } + +/// A background task for flushing data to disk. +pub struct FlushBackgroundTask { + /// A bi-directional channel that receives (buffer, offset) for writes, + /// and send back recycled buffer. + channel: duplex::mpsc::Duplex, FlushRequest>, + /// A writter for persisting data to disk. + writer: Arc, + ctx: RequestContext, +} + +impl FlushBackgroundTask +where + Buf: IoBufAligned + Send + Sync, + W: OwnedAsyncWriter + Sync + 'static, +{ + /// Creates a new background flush task. + fn new( + channel: duplex::mpsc::Duplex, FlushRequest>, + file: Arc, + ctx: RequestContext, + ) -> Self { + FlushBackgroundTask { + channel, + writer: file, + ctx, + } + } + + /// Runs the background flush task. + async fn run(mut self, slice: FullSlice) -> std::io::Result> { + // Sends the extra buffer back to the handle. + self.channel.send(slice).await.map_err(|_| { + std::io::Error::new(std::io::ErrorKind::BrokenPipe, "flush handle closed early") + })?; + + // Exit condition: channel is closed and there is no remaining buffer to be flushed + while let Some(request) = self.channel.recv().await { + #[cfg(test)] + { + // In test, wait for control to signal that we are ready to flush. + if request.ready_to_flush_rx.await.is_err() { + tracing::debug!("control dropped"); + } + } + + // Write slice to disk at `offset`. + let slice = self + .writer + .write_all_at(request.slice, request.offset, &self.ctx) + .await?; + + #[cfg(test)] + { + // In test, tell control we are done flushing buffer. + if request.done_flush_tx.send(()).is_err() { + tracing::debug!("control dropped"); + } + } + + // Sends the buffer back to the handle for reuse. The handle is in charged of cleaning the buffer. + if self.channel.send(slice).await.is_err() { + // Although channel is closed. Still need to finish flushing the remaining buffers. + continue; + } + } + + Ok(self.writer) + } +} + +#[cfg(test)] +pub(crate) struct FlushNotStarted { + ready_to_flush_tx: tokio::sync::oneshot::Sender<()>, + done_flush_rx: tokio::sync::oneshot::Receiver<()>, +} + +#[cfg(test)] +pub(crate) struct FlushInProgress { + done_flush_rx: tokio::sync::oneshot::Receiver<()>, +} + +#[cfg(test)] +pub(crate) struct FlushDone; + +#[cfg(test)] +impl FlushNotStarted { + pub fn ready_to_flush(self) -> FlushInProgress { + self.ready_to_flush_tx + .send(()) + .map(|_| FlushInProgress { + done_flush_rx: self.done_flush_rx, + }) + .unwrap() + } +} + +#[cfg(test)] +impl FlushInProgress { + pub async fn wait_until_flush_is_done(self) -> FlushDone { + self.done_flush_rx.await.unwrap(); + FlushDone + } +} From 9db6b1e3c8b173cc0715e9ad4706f76364d22652 Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Tue, 19 Nov 2024 14:22:28 +0000 Subject: [PATCH 17/35] fix clippy Signed-off-by: Yuchen Liang --- libs/pageserver_api/src/models.rs | 2 +- pageserver/src/tenant/ephemeral_file.rs | 2 +- pageserver/src/virtual_file/owned_buffers_io/write.rs | 2 +- pageserver/src/virtual_file/owned_buffers_io/write/flush.rs | 3 ++- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 0dfa1ba817ca..0487afeee2e5 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -987,7 +987,7 @@ pub mod virtual_file { impl IoMode { pub const fn preferred() -> Self { - Self::Buffered + Self::Direct } } diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 53777fc352df..961407509eaf 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -552,7 +552,7 @@ mod tests { // completely within the file range assert!(align < cap, "test assumption"); assert!(cap % align == 0); - let not_started = control.unwrap().as_not_started(); + let not_started = control.unwrap().into_not_started(); test_read_all_offset_combinations().await; let in_progress = not_started.ready_to_flush(); diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index 3e716cb3e507..99e4dc4c766c 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -226,7 +226,7 @@ where Ok((chunk_len, control)) } - #[must_use] + #[must_use = "caller must explcitly check the flush control"] async fn flush( &mut self, save_buf_for_read: bool, diff --git a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs index 249c32e41084..84e569baf70e 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs @@ -89,8 +89,9 @@ impl FlushControl { } } + /// In tests, turn flush control into a not started state. #[cfg(test)] - pub(crate) fn as_not_started(self) -> FlushNotStarted { + pub(crate) fn into_not_started(self) -> FlushNotStarted { match self.state { FlushStartState::NotStarted(not_started) => not_started, } From 826e2395a857732a321f3876fb201bd5bb21a343 Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Tue, 19 Nov 2024 16:48:29 +0000 Subject: [PATCH 18/35] add comments Signed-off-by: Yuchen Liang --- pageserver/src/tenant/ephemeral_file.rs | 5 +++-- pageserver/src/virtual_file/owned_buffers_io/write.rs | 2 +- .../src/virtual_file/owned_buffers_io/write/flush.rs | 11 +++++++++++ 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 961407509eaf..5f7e86f3640f 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -138,7 +138,7 @@ impl EphemeralFile { Ok(pos) } - pub(crate) async fn write_raw_controlled( + async fn write_raw_controlled( &mut self, srcbuf: &[u8], ctx: &RequestContext, @@ -552,8 +552,9 @@ mod tests { // completely within the file range assert!(align < cap, "test assumption"); assert!(cap % align == 0); - let not_started = control.unwrap().into_not_started(); + // test reads at different flush stages. + let not_started = control.unwrap().into_not_started(); test_read_all_offset_combinations().await; let in_progress = not_started.ready_to_flush(); test_read_all_offset_combinations().await; diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index 99e4dc4c766c..84bf32b68419 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -201,7 +201,7 @@ where } /// In addition to bytes submitted in this write, also returns a handle that can control the flush behavior. - pub async fn write_buffered_borrowed_controlled( + pub(crate) async fn write_buffered_borrowed_controlled( &mut self, mut chunk: &[u8], ctx: &RequestContext, diff --git a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs index 84e569baf70e..585c70218bdb 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs @@ -35,6 +35,7 @@ struct FlushRequest { done_flush_tx: tokio::sync::oneshot::Sender<()>, } +/// Constructs a request and a control object for a new flush operation. #[cfg(not(test))] fn new_flush_op(slice: FullSlice, offset: u64) -> (FlushRequest, FlushControl) { let request = FlushRequest { slice, offset }; @@ -43,6 +44,7 @@ fn new_flush_op(slice: FullSlice, offset: u64) -> (FlushRequest, (request, control) } +/// Constructs a request and a control object for a new flush operation. #[cfg(test)] fn new_flush_op(slice: FullSlice, offset: u64) -> (FlushRequest, FlushControl) { let (ready_to_flush_tx, ready_to_flush_rx) = tokio::sync::oneshot::channel(); @@ -59,12 +61,15 @@ fn new_flush_op(slice: FullSlice, offset: u64) -> (FlushRequest, } pub enum FlushStartState { + /// The submitted buffer's flush state is not tracked. #[cfg(not(test))] Untracked, + /// The submitted buffer has not been flushed to disk. #[cfg(test)] NotStarted(FlushNotStarted), } +/// A control object that manipulates buffer's flush behehavior. pub(crate) struct FlushControl { state: FlushStartState, } @@ -82,6 +87,7 @@ impl FlushControl { }), } } + #[cfg(not(test))] fn untracked() -> Self { FlushControl { @@ -97,6 +103,9 @@ impl FlushControl { } } + /// Release control to the submitted buffer. + /// + /// In `cfg(test)` environment, the buffer is guranteed to be flushed to disk after [`FlushControl::release`] is finishes execution. pub async fn release(self) { match self.state { #[cfg(not(test))] @@ -287,6 +296,7 @@ pub(crate) struct FlushDone; #[cfg(test)] impl FlushNotStarted { + /// Signals the background task the buffer is ready to flush to disk. pub fn ready_to_flush(self) -> FlushInProgress { self.ready_to_flush_tx .send(()) @@ -299,6 +309,7 @@ impl FlushNotStarted { #[cfg(test)] impl FlushInProgress { + /// Waits until background flush is done. pub async fn wait_until_flush_is_done(self) -> FlushDone { self.done_flush_rx.await.unwrap(); FlushDone From 78a17a70513a54f96504757929e99ccf2dba3579 Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Tue, 19 Nov 2024 18:38:59 +0000 Subject: [PATCH 19/35] improve FullSlice semantics Signed-off-by: Yuchen Liang --- pageserver/src/tenant/ephemeral_file.rs | 17 +++++++++++------ .../virtual_file/owned_buffers_io/io_buf_ext.rs | 12 +++++++++--- .../src/virtual_file/owned_buffers_io/write.rs | 2 +- .../owned_buffers_io/write/flush.rs | 4 ++-- 4 files changed, 23 insertions(+), 12 deletions(-) diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 5f7e86f3640f..9aecfff384eb 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -210,13 +210,13 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral } } - // [ written ][ maybe_flushed ][ mutable ] - // |- TAIL_SZ -||- TAIL_SZ -| - // ^ - // `flushed_offset` - // let (written_range, maybe_flushed_range) = { if maybe_flushed.is_some() { + // [ written ][ maybe_flushed ][ mutable ] + // <- TAIL_SZ -><- TAIL_SZ -> + // ^ + // `submitted_offset` + // <++++++ on disk +++++++????????????????> ( Range( start, @@ -228,6 +228,11 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral ), ) } else { + // [ written ][ mutable ] + // <- TAIL_SZ -> + // ^ + // `submitted_offset` + // <++++++ on disk +++++++++++++++++++++++> ( Range(start, std::cmp::min(end, submitted_offset)), // zero len @@ -434,7 +439,7 @@ mod tests { assert!(file_contents == content[0..cap * 2]); let maybe_flushed_buffer_contents = file.buffered_writer.inspect_maybe_flushed().unwrap(); - assert_eq!(maybe_flushed_buffer_contents, &content[cap..cap * 2]); + assert_eq!(&maybe_flushed_buffer_contents[..], &content[cap..cap * 2]); let mutable_buffer_contents = file.buffered_writer.inspect_mutable(); assert_eq!(mutable_buffer_contents, &content[cap * 2..write_nbytes]); diff --git a/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs b/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs index ede04d49570e..70d33d59bfe3 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs @@ -27,10 +27,16 @@ where let FullSlice { slice: s } = self; s } +} - pub(crate) fn as_raw_slice(&self) -> &Slice { - let FullSlice { slice: s } = &self; - s +impl Clone for FullSlice +where + B: IoBuf + Clone, +{ + fn clone(&self) -> Self { + Self { + slice: self.slice.get_ref().clone().slice_full(), + } } } diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index 84bf32b68419..0f2e8a635cee 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -92,7 +92,7 @@ where /// Gets a reference to the maybe flushed read-only buffer. /// Returns `None` if the writer has not submitted any flush request. - pub fn inspect_maybe_flushed(&self) -> Option<&Buf> { + pub fn inspect_maybe_flushed(&self) -> Option<&FullSlice> { self.flush_handle.maybe_flushed.as_ref() } diff --git a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs index 585c70218bdb..4c2019076ea1 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs @@ -14,7 +14,7 @@ pub struct FlushHandle { inner: Option>, /// Immutable buffer for serving tail reads. /// `None` if no flush request has been submitted. - pub(super) maybe_flushed: Option, + pub(super) maybe_flushed: Option>, } // TODO(yuchen): special actions in drop to clean up the join handle? @@ -162,7 +162,7 @@ where // Saves a buffer for read while flushing. This also removes reference to the old buffer. self.maybe_flushed = if save_buf_for_read { - Some(slice.as_raw_slice().get_ref().clone()) + Some(slice.clone()) } else { None }; From 0f63c957a683f8771c8fc5bc30d2db9b29e4a58e Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Wed, 20 Nov 2024 16:18:21 +0000 Subject: [PATCH 20/35] document and reorder flush background task invokation sequence Signed-off-by: Yuchen Liang --- .../src/virtual_file/owned_buffers_io/write/flush.rs | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs index 4c2019076ea1..bd45ed33a4ad 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs @@ -127,14 +127,19 @@ where W: OwnedAsyncWriter + Send + Sync + 'static + std::fmt::Debug, { /// Spawns a new background flush task and obtains a handle. + /// + /// Note: The background task so we do not need to explicitly maintain a queue of buffers. pub fn spawn_new(file: Arc, buf: B, ctx: RequestContext) -> Self where B: Buffer + Send + 'static, { - let (front, back) = duplex::mpsc::channel(1); + let (front, back) = duplex::mpsc::channel(2); - let bg = FlushBackgroundTask::new(back, file, ctx); - let join_handle = tokio::spawn(async move { bg.run(buf.flush()).await }); + let join_handle = tokio::spawn(async move { + FlushBackgroundTask::new(back, file, ctx) + .run(buf.flush()) + .await + }); FlushHandle { inner: Some(FlushHandleInner { @@ -239,6 +244,7 @@ where } /// Runs the background flush task. + /// The passed in slice is immediately sent back to the flush handle through the duplex channel. async fn run(mut self, slice: FullSlice) -> std::io::Result> { // Sends the extra buffer back to the handle. self.channel.send(slice).await.map_err(|_| { From 54d253d51ac82b4f008323b0602be0d3a488e064 Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Mon, 25 Nov 2024 04:21:05 +0000 Subject: [PATCH 21/35] review: change IoMode default back to Buffered Signed-off-by: Yuchen Liang --- libs/pageserver_api/src/models.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 0487afeee2e5..0dfa1ba817ca 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -987,7 +987,7 @@ pub mod virtual_file { impl IoMode { pub const fn preferred() -> Self { - Self::Direct + Self::Buffered } } From e5bf2bec49da673c47168fa500222de38dbae245 Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Mon, 25 Nov 2024 04:30:42 +0000 Subject: [PATCH 22/35] remove write_buffered; add notes for bypass-aligned-part-of-write Signed-off-by: Yuchen Liang --- .../virtual_file/owned_buffers_io/write.rs | 131 +----------------- 1 file changed, 3 insertions(+), 128 deletions(-) diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index 0f2e8a635cee..eca591494584 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -122,72 +122,8 @@ where .expect("must not use after we returned an error") } - /// Guarantees that if Ok() is returned, all bytes in `chunk` have been accepted. - #[allow(dead_code)] - pub async fn write_buffered( - &mut self, - chunk: FullSlice, - ctx: &RequestContext, - ) -> std::io::Result<(usize, FullSlice)> { - let chunk = chunk.into_raw_slice(); - - let chunk_len = chunk.len(); - // avoid memcpy for the middle of the chunk - if chunk.len() >= self.mutable().cap() { - // TODO(yuchen): do we still want to keep the bypass path? - let control = self.flush(false, ctx).await?; - if let Some(control) = control { - control.release().await; - } - // do a big write, bypassing `buf` - assert_eq!( - self.mutable - .as_ref() - .expect("must not use after an error") - .pending(), - 0 - ); - let chunk = OwnedAsyncWriter::write_all_at( - self.writer.as_ref(), - FullSlice::must_new(chunk), - self.bytes_submitted, - ctx, - ) - .await?; - self.bytes_submitted += u64::try_from(chunk_len).unwrap(); - return Ok((chunk_len, chunk)); - } - // in-memory copy the < BUFFER_SIZED tail of the chunk - assert!(chunk.len() < self.mutable().cap()); - let mut slice = &chunk[..]; - let mut control: Option = None; - while !slice.is_empty() { - if let Some(control) = control.take() { - control.release().await; - } - let buf = self.mutable.as_mut().expect("must not use after an error"); - let need = buf.cap() - buf.pending(); - let have = slice.len(); - let n = std::cmp::min(need, have); - buf.extend_from_slice(&slice[..n]); - slice = &slice[n..]; - if buf.pending() >= buf.cap() { - assert_eq!(buf.pending(), buf.cap()); - control = self.flush(true, ctx).await?; - } - } - if let Some(control) = control.take() { - control.release().await; - } - assert!(slice.is_empty(), "by now we should have drained the chunk"); - Ok((chunk_len, FullSlice::must_new(chunk))) - } - - /// Strictly less performant variant of [`Self::write_buffered`] that allows writing borrowed data. - /// - /// It is less performant because we always have to copy the borrowed data into the internal buffer - /// before we can do the IO. The [`Self::write_buffered`] can avoid this, which is more performant - /// for large writes. + /// TODO(yuchen): For large write, it is possible to implement buffer bypass for aligned parts of the write so that + /// we could avoid copying majority of the data into the internal buffer. pub async fn write_buffered_borrowed( &mut self, chunk: &[u8], @@ -373,68 +309,6 @@ mod tests { RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error) } - macro_rules! write { - ($writer:ident, $data:literal) => {{ - let mut buf = crate::virtual_file::IoBufferMut::with_capacity(2); - buf.extend_from_slice($data); - $writer - .write_buffered(buf.freeze().slice_len(), &test_ctx()) - .await?; - }}; - } - - #[tokio::test] - async fn test_buffered_writes_only() -> std::io::Result<()> { - let recorder = Arc::new(RecorderWriter::default()); - let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); - let mut writer = BufferedWriter::new(recorder, || IoBufferMut::with_capacity(2), &ctx); - write!(writer, b"a"); - write!(writer, b"b"); - write!(writer, b"c"); - write!(writer, b"d"); - write!(writer, b"e"); - let (_, recorder) = writer.flush_and_into_inner(&test_ctx()).await?; - assert_eq!( - recorder.get_writes(), - vec![Vec::from(b"ab"), Vec::from(b"cd"), Vec::from(b"e")] - ); - Ok(()) - } - - #[tokio::test] - async fn test_passthrough_writes_only() -> std::io::Result<()> { - let recorder = Arc::new(RecorderWriter::default()); - let ctx = test_ctx(); - let mut writer = BufferedWriter::new(recorder, || IoBufferMut::with_capacity(2), &ctx); - write!(writer, b"abc"); - write!(writer, b"de"); - write!(writer, b""); - write!(writer, b"fghijk"); - let (_, recorder) = writer.flush_and_into_inner(&test_ctx()).await?; - assert_eq!( - recorder.get_writes(), - vec![Vec::from(b"abc"), Vec::from(b"de"), Vec::from(b"fghijk")] - ); - Ok(()) - } - - #[tokio::test] - async fn test_passthrough_write_with_nonempty_buffer() -> std::io::Result<()> { - let recorder = Arc::new(RecorderWriter::default()); - let ctx = test_ctx(); - let mut writer = BufferedWriter::new(recorder, || IoBufferMut::with_capacity(2), &ctx); - write!(writer, b"a"); - write!(writer, b"bc"); - write!(writer, b"d"); - write!(writer, b"e"); - let (_, recorder) = writer.flush_and_into_inner(&test_ctx()).await?; - assert_eq!( - recorder.get_writes(), - vec![Vec::from(b"a"), Vec::from(b"bc"), Vec::from(b"de")] - ); - Ok(()) - } - #[tokio::test] async fn test_write_all_borrowed_always_goes_through_buffer() -> std::io::Result<()> { let ctx = test_ctx(); @@ -447,6 +321,7 @@ mod tests { ); writer.write_buffered_borrowed(b"abc", ctx).await?; + writer.write_buffered_borrowed(b"", ctx).await?; writer.write_buffered_borrowed(b"d", ctx).await?; writer.write_buffered_borrowed(b"e", ctx).await?; writer.write_buffered_borrowed(b"fg", ctx).await?; From 28718bfadcf0e60d89aac3edcf484b0646518423 Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Mon, 25 Nov 2024 04:39:13 +0000 Subject: [PATCH 23/35] review: simplify FlushControl by using ZST for not(test) Signed-off-by: Yuchen Liang --- .../owned_buffers_io/write/flush.rs | 45 +++++++------------ 1 file changed, 16 insertions(+), 29 deletions(-) diff --git a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs index bd45ed33a4ad..e8ebb464386c 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs @@ -60,20 +60,15 @@ fn new_flush_op(slice: FullSlice, offset: u64) -> (FlushRequest, (request, control) } -pub enum FlushStartState { - /// The submitted buffer's flush state is not tracked. - #[cfg(not(test))] - Untracked, - /// The submitted buffer has not been flushed to disk. - #[cfg(test)] - NotStarted(FlushNotStarted), -} - -/// A control object that manipulates buffer's flush behehavior. +/// A handle to a `FlushRequest` that allows unit tests precise control over flush behavior. +#[cfg(test)] pub(crate) struct FlushControl { - state: FlushStartState, + not_started: FlushNotStarted, } +#[cfg(not(test))] +pub(crate) struct FlushControl; + impl FlushControl { #[cfg(test)] fn not_started( @@ -81,42 +76,34 @@ impl FlushControl { done_flush_rx: tokio::sync::oneshot::Receiver<()>, ) -> Self { FlushControl { - state: FlushStartState::NotStarted(FlushNotStarted { + not_started: FlushNotStarted { ready_to_flush_tx, done_flush_rx, - }), + }, } } #[cfg(not(test))] fn untracked() -> Self { - FlushControl { - state: FlushStartState::Untracked, - } + FlushControl } /// In tests, turn flush control into a not started state. #[cfg(test)] pub(crate) fn into_not_started(self) -> FlushNotStarted { - match self.state { - FlushStartState::NotStarted(not_started) => not_started, - } + self.not_started } /// Release control to the submitted buffer. /// /// In `cfg(test)` environment, the buffer is guranteed to be flushed to disk after [`FlushControl::release`] is finishes execution. pub async fn release(self) { - match self.state { - #[cfg(not(test))] - FlushStartState::Untracked => (), - #[cfg(test)] - FlushStartState::NotStarted(not_started) => { - not_started - .ready_to_flush() - .wait_until_flush_is_done() - .await; - } + #[cfg(test)] + { + self.not_started + .ready_to_flush() + .wait_until_flush_is_done() + .await; } } } From 76f0e4fd1de0c540baab1c98efa341e899d69e71 Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Mon, 25 Nov 2024 04:42:02 +0000 Subject: [PATCH 24/35] review: remove save_buf_for_read Signed-off-by: Yuchen Liang --- .../src/virtual_file/owned_buffers_io/write.rs | 15 ++++----------- .../virtual_file/owned_buffers_io/write/flush.rs | 13 ++----------- 2 files changed, 6 insertions(+), 22 deletions(-) diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index eca591494584..9914e196857b 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -101,7 +101,7 @@ where mut self, ctx: &RequestContext, ) -> std::io::Result<(u64, Arc)> { - self.flush(true, ctx).await?; + self.flush(ctx).await?; let Self { mutable: buf, @@ -156,28 +156,21 @@ where if let Some(control) = control.take() { control.release().await; } - control = self.flush(true, ctx).await?; + control = self.flush(ctx).await?; } } Ok((chunk_len, control)) } #[must_use = "caller must explcitly check the flush control"] - async fn flush( - &mut self, - save_buf_for_read: bool, - _ctx: &RequestContext, - ) -> std::io::Result> { + async fn flush(&mut self, _ctx: &RequestContext) -> std::io::Result> { let buf = self.mutable.take().expect("must not use after an error"); let buf_len = buf.pending(); if buf_len == 0 { self.mutable = Some(buf); return Ok(None); } - let (recycled, flush_control) = self - .flush_handle - .flush(buf, self.bytes_submitted, save_buf_for_read) - .await?; + let (recycled, flush_control) = self.flush_handle.flush(buf, self.bytes_submitted).await?; self.bytes_submitted += u64::try_from(buf_len).unwrap(); self.mutable = Some(recycled); Ok(Some(flush_control)) diff --git a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs index e8ebb464386c..687a4fd17a0b 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs @@ -141,23 +141,14 @@ where /// Returns a buffer that completed flushing for re-use, length reset to 0, capacity unchanged. /// If `save_buf_for_read` is true, then we save the buffer in `Self::maybe_flushed`, otherwise /// clear `maybe_flushed`. - pub async fn flush( - &mut self, - buf: B, - offset: u64, - save_buf_for_read: bool, - ) -> std::io::Result<(B, FlushControl)> + pub async fn flush(&mut self, buf: B, offset: u64) -> std::io::Result<(B, FlushControl)> where B: Buffer + Send + 'static, { let slice = buf.flush(); // Saves a buffer for read while flushing. This also removes reference to the old buffer. - self.maybe_flushed = if save_buf_for_read { - Some(slice.clone()) - } else { - None - }; + self.maybe_flushed = Some(slice.clone()); let (request, flush_control) = new_flush_op(slice, offset); From d4ebd5ccd31e61a1c2aca637f058fac74cf5cb40 Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Mon, 25 Nov 2024 05:01:23 +0000 Subject: [PATCH 25/35] use CheapCloneForRead trait to prevent efficiency bugs Signed-off-by: Yuchen Liang --- .../owned_buffers_io/io_buf_ext.rs | 24 ++++++++++--------- .../virtual_file/owned_buffers_io/write.rs | 14 ++++++++++- .../owned_buffers_io/write/flush.rs | 6 ++--- 3 files changed, 29 insertions(+), 15 deletions(-) diff --git a/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs b/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs index 70d33d59bfe3..2cd5a5af35e2 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs @@ -5,6 +5,8 @@ use bytes::{Bytes, BytesMut}; use std::ops::{Deref, Range}; use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice}; +use super::write::CheapCloneForRead; + /// The true owned equivalent for Rust [`slice`]. Use this for the write path. /// /// Unlike [`tokio_epoll_uring::Slice`], which we unfortunately inherited from `tokio-uring`, @@ -29,17 +31,6 @@ where } } -impl Clone for FullSlice -where - B: IoBuf + Clone, -{ - fn clone(&self) -> Self { - Self { - slice: self.slice.get_ref().clone().slice_full(), - } - } -} - impl Deref for FullSlice where B: IoBuf, @@ -54,6 +45,17 @@ where } } +impl CheapCloneForRead for FullSlice +where + B: IoBuf + CheapCloneForRead, +{ + fn cheap_clone(&self) -> Self { + Self { + slice: self.slice.get_ref().cheap_clone().slice_full(), + } + } +} + pub(crate) trait IoBufExt { /// Get a [`FullSlice`] for the entire buffer, i.e., `self[..]` or `self[0..self.len()]`. fn slice_len(self) -> FullSlice diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index 9914e196857b..88db3b959c59 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -17,6 +17,18 @@ use super::{ pub(crate) use flush::FlushControl; +pub(crate) trait CheapCloneForRead { + /// Returns a cheap clone of the buffer. + fn cheap_clone(&self) -> Self; +} + +impl CheapCloneForRead for IoBuffer { + fn cheap_clone(&self) -> Self { + // Cheap clone over an `Arc`. + self.clone() + } +} + /// A trait for doing owned-buffer write IO. /// Think [`tokio::io::AsyncWrite`] but with owned buffers. /// The owned buffers need to be aligned due to Direct IO requirements. @@ -61,7 +73,7 @@ pub struct BufferedWriter { impl BufferedWriter where B: Buffer + Send + 'static, - Buf: IoBufAligned + Send + Sync + Clone, + Buf: IoBufAligned + Send + Sync + CheapCloneForRead, W: OwnedAsyncWriter + Send + Sync + 'static + std::fmt::Debug, { /// Creates a new buffered writer. diff --git a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs index 687a4fd17a0b..ba7ba13e93da 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs @@ -7,7 +7,7 @@ use crate::{ virtual_file::owned_buffers_io::{io_buf_aligned::IoBufAligned, io_buf_ext::FullSlice}, }; -use super::{Buffer, OwnedAsyncWriter}; +use super::{Buffer, CheapCloneForRead, OwnedAsyncWriter}; /// A handle to the flush task. pub struct FlushHandle { @@ -110,7 +110,7 @@ impl FlushControl { impl FlushHandle where - Buf: IoBufAligned + Send + Sync + Clone, + Buf: IoBufAligned + Send + Sync + CheapCloneForRead, W: OwnedAsyncWriter + Send + Sync + 'static + std::fmt::Debug, { /// Spawns a new background flush task and obtains a handle. @@ -148,7 +148,7 @@ where let slice = buf.flush(); // Saves a buffer for read while flushing. This also removes reference to the old buffer. - self.maybe_flushed = Some(slice.clone()); + self.maybe_flushed = Some(slice.cheap_clone()); let (request, flush_control) = new_flush_op(slice, offset); From 8a37f412c20d9e8a60f65cf0c3ea0c9cda527b3c Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Mon, 25 Nov 2024 05:42:16 +0000 Subject: [PATCH 26/35] remove resolved todos Signed-off-by: Yuchen Liang --- pageserver/src/tenant/remote_timeline_client/download.rs | 2 -- pageserver/src/virtual_file/owned_buffers_io/write/flush.rs | 1 - 2 files changed, 3 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index e9230c58b627..b5c3a52583fd 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -235,8 +235,6 @@ async fn download_object<'a>( Ok(chunk) => chunk, Err(e) => return Err(e), }; - // TODO(yuchen): might have performance issue when using borrowed version? - // Problem: input is Bytes, does not satisify IO alignment requirement. buffered.write_buffered_borrowed(&chunk, ctx).await?; } let inner = buffered.flush_and_into_inner(ctx).await?; diff --git a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs index ba7ba13e93da..9ec49c33a890 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs @@ -17,7 +17,6 @@ pub struct FlushHandle { pub(super) maybe_flushed: Option>, } -// TODO(yuchen): special actions in drop to clean up the join handle? pub struct FlushHandleInner { /// A bi-directional channel that sends (buffer, offset) for writes, /// and receives recyled buffer. From 4284fcd38c3341deaccd281b2b625ba016727b1b Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Mon, 25 Nov 2024 15:25:29 +0000 Subject: [PATCH 27/35] fix docs clippy Signed-off-by: Yuchen Liang --- .../src/virtual_file/owned_buffers_io/write.rs | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index 88db3b959c59..d2e85bda2bf3 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -43,19 +43,8 @@ pub trait OwnedAsyncWriter { /// A wrapper aorund an [`OwnedAsyncWriter`] that uses a [`Buffer`] to batch /// small writes into larger writes of size [`Buffer::cap`]. -/// -/// # Passthrough Of Large Writers -/// -/// Calls to [`BufferedWriter::write_buffered`] that are larger than [`Buffer::cap`] -/// cause the internal buffer to be flushed prematurely so that the large -/// buffered write is passed through to the underlying [`OwnedAsyncWriter`]. -/// -/// This pass-through is generally beneficial for throughput, but if -/// the storage backend of the [`OwnedAsyncWriter`] is a shared resource, -/// unlimited large writes may cause latency or fairness issues. -/// -/// In such cases, a different implementation that always buffers in memory -/// may be preferable. +// TODO(yuchen): For large write, implementing buffer bypass for aligned parts of the write could be beneficial to throughput, +// since we would avoid copying majority of the data into the internal buffer. pub struct BufferedWriter { writer: Arc, /// invariant: always remains Some(buf) except @@ -134,8 +123,6 @@ where .expect("must not use after we returned an error") } - /// TODO(yuchen): For large write, it is possible to implement buffer bypass for aligned parts of the write so that - /// we could avoid copying majority of the data into the internal buffer. pub async fn write_buffered_borrowed( &mut self, chunk: &[u8], From b54764bccb1300382552a7fa04be68383951b071 Mon Sep 17 00:00:00 2001 From: Yuchen Liang <70461588+yliang412@users.noreply.github.com> Date: Wed, 27 Nov 2024 10:10:38 -0500 Subject: [PATCH 28/35] hold timeline open in background task using gate guard (#9825) ## Problem The newly added flush task in https://github.com/neondatabase/neon/pull/9693 should hold timeline gate open, to avoid doing local IO after timeline shutdown completes. ## Solution Pass timeline gate guard to flush background task. The flush task do not need cancellation token b/c it will automatically shutdown when the front writer task drop the channel. - Refactor relevant paths to pass down `&Gate` instead of `GateGuard`. Signed-off-by: Yuchen Liang --- pageserver/benches/bench_ingest.rs | 4 +-- pageserver/src/tenant/ephemeral_file.rs | 30 +++++++++---------- .../src/tenant/remote_timeline_client.rs | 2 ++ .../tenant/remote_timeline_client/download.rs | 19 +++++++----- pageserver/src/tenant/secondary/downloader.rs | 1 + .../tenant/storage_layer/inmemory_layer.rs | 5 ++-- pageserver/src/tenant/storage_layer/layer.rs | 1 + pageserver/src/tenant/timeline.rs | 3 +- .../src/tenant/timeline/layer_manager.rs | 14 +++------ .../virtual_file/owned_buffers_io/write.rs | 18 +++++++++-- .../owned_buffers_io/write/flush.rs | 13 ++++++-- 11 files changed, 64 insertions(+), 46 deletions(-) diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs index caacd365b306..b67a9cc47951 100644 --- a/pageserver/benches/bench_ingest.rs +++ b/pageserver/benches/bench_ingest.rs @@ -62,10 +62,8 @@ async fn ingest( let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); let gate = utils::sync::gate::Gate::default(); - let entered = gate.enter().unwrap(); - let layer = - InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, entered, &ctx).await?; + let layer = InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, &gate, &ctx).await?; let data = Value::Image(Bytes::from(vec![0u8; put_size])); let data_ser_size = data.serialized_size().unwrap() as usize; diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 9aecfff384eb..aaec8a4c313a 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -38,9 +38,9 @@ impl EphemeralFile { conf: &PageServerConf, tenant_shard_id: TenantShardId, timeline_id: TimelineId, - gate_guard: utils::sync::gate::GateGuard, + gate: &utils::sync::gate::Gate, ctx: &RequestContext, - ) -> Result { + ) -> anyhow::Result { static NEXT_FILENAME: AtomicU64 = AtomicU64::new(1); let filename_disambiguator = NEXT_FILENAME.fetch_add(1, std::sync::atomic::Ordering::Relaxed); @@ -73,9 +73,10 @@ impl EphemeralFile { buffered_writer: owned_buffers_io::write::BufferedWriter::new( file, || IoBufferMut::with_capacity(TAIL_SZ), + gate.enter()?, ctx, ), - _gate_guard: gate_guard, + _gate_guard: gate.enter()?, }) } } @@ -362,7 +363,7 @@ mod tests { let gate = utils::sync::gate::Gate::default(); - let file = EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx) + let file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &ctx) .await .unwrap(); @@ -393,10 +394,9 @@ mod tests { let gate = utils::sync::gate::Gate::default(); - let mut file = - EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx) - .await - .unwrap(); + let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &ctx) + .await + .unwrap(); let mutable = file.buffered_writer.inspect_mutable(); let cap = mutable.capacity(); @@ -451,10 +451,9 @@ mod tests { let gate = utils::sync::gate::Gate::default(); - let mut file = - EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx) - .await - .unwrap(); + let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &ctx) + .await + .unwrap(); // mutable buffer and maybe_flushed buffer each has `cap` bytes. let cap = file.buffered_writer.inspect_mutable().capacity(); @@ -499,10 +498,9 @@ mod tests { let gate = utils::sync::gate::Gate::default(); - let mut file = - EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx) - .await - .unwrap(); + let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &ctx) + .await + .unwrap(); let mutable = file.buffered_writer.inspect_mutable(); let cap = mutable.capacity(); diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 007bd3eef083..2982cfb16c3e 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -681,6 +681,7 @@ impl RemoteTimelineClient { layer_file_name: &LayerName, layer_metadata: &LayerFileMetadata, local_path: &Utf8Path, + gate: &utils::sync::gate::Gate, cancel: &CancellationToken, ctx: &RequestContext, ) -> Result { @@ -700,6 +701,7 @@ impl RemoteTimelineClient { layer_file_name, layer_metadata, local_path, + gate, cancel, ctx, ) diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 41e32a1aa3ba..4f7a51ee68f1 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -57,6 +57,7 @@ pub async fn download_layer_file<'a>( layer_file_name: &'a LayerName, layer_metadata: &'a LayerFileMetadata, local_path: &Utf8Path, + gate: &utils::sync::gate::Gate, cancel: &CancellationToken, ctx: &RequestContext, ) -> Result { @@ -85,7 +86,9 @@ pub async fn download_layer_file<'a>( let temp_file_path = path_with_suffix_extension(local_path, TEMP_DOWNLOAD_EXTENSION); let bytes_amount = download_retry( - || async { download_object(storage, &remote_path, &temp_file_path, cancel, ctx).await }, + || async { + download_object(storage, &remote_path, &temp_file_path, gate, cancel, ctx).await + }, &format!("download {remote_path:?}"), cancel, ) @@ -145,6 +148,7 @@ async fn download_object<'a>( storage: &'a GenericRemoteStorage, src_path: &RemotePath, dst_path: &Utf8PathBuf, + gate: &utils::sync::gate::Gate, cancel: &CancellationToken, #[cfg_attr(target_os = "macos", allow(unused_variables))] ctx: &RequestContext, ) -> Result { @@ -219,15 +223,16 @@ async fn download_object<'a>( pausable_failpoint!("before-downloading-layer-stream-pausable"); + let mut buffered = owned_buffers_io::write::BufferedWriter::::new( + destination_file, + || IoBufferMut::with_capacity(super::BUFFER_SIZE), + gate.enter().map_err(|_| DownloadError::Cancelled)?, + ctx, + ); + // TODO: use vectored write (writev) once supported by tokio-epoll-uring. // There's chunks_vectored() on the stream. let (bytes_amount, destination_file) = async { - let mut buffered = - owned_buffers_io::write::BufferedWriter::::new( - destination_file, - || IoBufferMut::with_capacity(super::BUFFER_SIZE), - ctx, - ); while let Some(res) = futures::StreamExt::next(&mut download.download_stream).await { diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 7443261a9c00..03d40cffe3c3 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -1181,6 +1181,7 @@ impl<'a> TenantDownloader<'a> { &layer.name, &layer.metadata, &local_path, + &self.secondary_state.gate, &self.secondary_state.cancel, ctx, ) diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index af6112d53550..71e53da20f7f 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -555,13 +555,12 @@ impl InMemoryLayer { timeline_id: TimelineId, tenant_shard_id: TenantShardId, start_lsn: Lsn, - gate_guard: utils::sync::gate::GateGuard, + gate: &utils::sync::gate::Gate, ctx: &RequestContext, ) -> Result { trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}"); - let file = - EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate_guard, ctx).await?; + let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate, ctx).await?; let key = InMemoryLayerFileId(file.page_cache_file_id()); Ok(InMemoryLayer { diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index a9f1189b4112..8933e8ceb13e 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1149,6 +1149,7 @@ impl LayerInner { &self.desc.layer_name(), &self.metadata(), &self.path, + &timeline.gate, &timeline.cancel, ctx, ) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index f6a06e73a790..347393ba5654 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3471,7 +3471,6 @@ impl Timeline { ctx: &RequestContext, ) -> anyhow::Result> { let mut guard = self.layers.write().await; - let gate_guard = self.gate.enter().context("enter gate for inmem layer")?; let last_record_lsn = self.get_last_record_lsn(); ensure!( @@ -3488,7 +3487,7 @@ impl Timeline { self.conf, self.timeline_id, self.tenant_shard_id, - gate_guard, + &self.gate, ctx, ) .await?; diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index 4293a44dca25..88baa88f24b8 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -182,7 +182,7 @@ impl OpenLayerManager { conf: &'static PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, - gate_guard: utils::sync::gate::GateGuard, + gate: &utils::sync::gate::Gate, ctx: &RequestContext, ) -> anyhow::Result> { ensure!(lsn.is_aligned()); @@ -212,15 +212,9 @@ impl OpenLayerManager { lsn ); - let new_layer = InMemoryLayer::create( - conf, - timeline_id, - tenant_shard_id, - start_lsn, - gate_guard, - ctx, - ) - .await?; + let new_layer = + InMemoryLayer::create(conf, timeline_id, tenant_shard_id, start_lsn, &gate, ctx) + .await?; let layer = Arc::new(new_layer); self.layer_map.open_layer = Some(layer.clone()); diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index d2e85bda2bf3..9edb2141e9f4 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -68,11 +68,21 @@ where /// Creates a new buffered writer. /// /// The `buf_new` function provides a way to initialize the owned buffers used by this writer. - pub fn new(writer: Arc, buf_new: impl Fn() -> B, ctx: &RequestContext) -> Self { + pub fn new( + writer: Arc, + buf_new: impl Fn() -> B, + gate_guard: utils::sync::gate::GateGuard, + ctx: &RequestContext, + ) -> Self { Self { writer: writer.clone(), mutable: Some(buf_new()), - flush_handle: FlushHandle::spawn_new(writer, buf_new(), ctx.attached_child()), + flush_handle: FlushHandle::spawn_new( + writer, + buf_new(), + gate_guard, + ctx.attached_child(), + ), bytes_submitted: 0, } } @@ -302,13 +312,15 @@ mod tests { } #[tokio::test] - async fn test_write_all_borrowed_always_goes_through_buffer() -> std::io::Result<()> { + async fn test_write_all_borrowed_always_goes_through_buffer() -> anyhow::Result<()> { let ctx = test_ctx(); let ctx = &ctx; let recorder = Arc::new(RecorderWriter::default()); + let gate = utils::sync::gate::Gate::default(); let mut writer = BufferedWriter::<_, RecorderWriter>::new( recorder, || IoBufferMut::with_capacity(2), + gate.enter()?, ctx, ); diff --git a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs index 9ec49c33a890..5b3997963ead 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs @@ -115,14 +115,19 @@ where /// Spawns a new background flush task and obtains a handle. /// /// Note: The background task so we do not need to explicitly maintain a queue of buffers. - pub fn spawn_new(file: Arc, buf: B, ctx: RequestContext) -> Self + pub fn spawn_new( + file: Arc, + buf: B, + gate_guard: utils::sync::gate::GateGuard, + ctx: RequestContext, + ) -> Self where B: Buffer + Send + 'static, { let (front, back) = duplex::mpsc::channel(2); let join_handle = tokio::spawn(async move { - FlushBackgroundTask::new(back, file, ctx) + FlushBackgroundTask::new(back, file, gate_guard, ctx) .run(buf.flush()) .await }); @@ -200,6 +205,8 @@ pub struct FlushBackgroundTask { /// A writter for persisting data to disk. writer: Arc, ctx: RequestContext, + /// Prevent timeline from shuting down until the flush background task finishes flushing all remaining buffers to disk. + _gate_guard: utils::sync::gate::GateGuard, } impl FlushBackgroundTask @@ -211,11 +218,13 @@ where fn new( channel: duplex::mpsc::Duplex, FlushRequest>, file: Arc, + gate_guard: utils::sync::gate::GateGuard, ctx: RequestContext, ) -> Self { FlushBackgroundTask { channel, writer: file, + _gate_guard: gate_guard, ctx, } } From 9f384a8426c3faf6b6f6bfa71d59343a0935b3ad Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Mon, 2 Dec 2024 15:58:08 +0000 Subject: [PATCH 29/35] review: remove unused impl Buffer for BytesMut Signed-off-by: Yuchen Liang --- .../virtual_file/owned_buffers_io/write.rs | 27 ------------------- 1 file changed, 27 deletions(-) diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index 9edb2141e9f4..51c711ab5a28 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -210,33 +210,6 @@ pub trait Buffer { fn reuse_after_flush(iobuf: Self::IoBuf) -> Self; } -impl Buffer for BytesMut { - type IoBuf = BytesMut; - - #[inline(always)] - fn cap(&self) -> usize { - self.capacity() - } - - fn extend_from_slice(&mut self, other: &[u8]) { - BytesMut::extend_from_slice(self, other) - } - - #[inline(always)] - fn pending(&self) -> usize { - self.len() - } - - fn flush(self) -> FullSlice { - self.slice_len() - } - - fn reuse_after_flush(mut iobuf: BytesMut) -> Self { - iobuf.clear(); - iobuf - } -} - impl Buffer for IoBufferMut { type IoBuf = IoBuffer; From bf9a6d0f4c2ae95610322056724a8a181da3d521 Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Mon, 2 Dec 2024 16:07:57 +0000 Subject: [PATCH 30/35] review: follow Buffer::extend_from_slice trait definition panics if IoBufferMut does not enough capacity left to accomodate the source buffer. Signed-off-by: Yuchen Liang --- pageserver/src/virtual_file/owned_buffers_io/write.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index 51c711ab5a28..20bf87812312 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -1,7 +1,6 @@ mod flush; use std::sync::Arc; -use bytes::BytesMut; use flush::FlushHandle; use tokio_epoll_uring::IoBuf; @@ -218,6 +217,10 @@ impl Buffer for IoBufferMut { } fn extend_from_slice(&mut self, other: &[u8]) { + if self.len() + other.len() > self.cap() { + panic!("Buffer capacity exceeded"); + } + IoBufferMut::extend_from_slice(self, other); } From fac42692d407e5e4d0e04006d2d775cf90f0120f Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Mon, 2 Dec 2024 16:17:04 +0000 Subject: [PATCH 31/35] review: fix CheapCloneForRead for FullSlice consider cases where offset != 0 Signed-off-by: Yuchen Liang --- pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs b/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs index 2cd5a5af35e2..525f447b6dac 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs @@ -50,9 +50,10 @@ where B: IoBuf + CheapCloneForRead, { fn cheap_clone(&self) -> Self { - Self { - slice: self.slice.get_ref().cheap_clone().slice_full(), - } + let bounds = self.slice.bounds(); + let clone = self.slice.get_ref().cheap_clone(); + let slice = clone.slice(bounds); + Self { slice } } } From a439d57050dafd603d24e001215213eb5246a029 Mon Sep 17 00:00:00 2001 From: Yuchen Liang <70461588+yliang412@users.noreply.github.com> Date: Mon, 2 Dec 2024 11:28:31 -0500 Subject: [PATCH 32/35] review: cleanup comments + expect_err Co-authored-by: Christian Schwarz --- pageserver/src/virtual_file/owned_buffers_io/write/flush.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs index 5b3997963ead..7c5c10b91806 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs @@ -163,13 +163,14 @@ where } // Wait for an available buffer from the background flush task. + // This is the BACKPRESSURE mechanism: if the flush task can't keep up, + // then the write path will eventually wait for it here. let Some(recycled) = self.inner_mut().channel.recv().await else { return self.handle_error().await; }; // The only other place that could hold a reference to the recycled buffer // is in `Self::maybe_flushed`, but we have already replace it with the new buffer. - let recycled = Buffer::reuse_after_flush(recycled.into_raw_slice().into_inner()); Ok((recycled, flush_control)) } @@ -193,7 +194,7 @@ where } async fn handle_error(&mut self) -> std::io::Result { - Err(self.shutdown().await.unwrap_err()) + Err(self.shutdown().await.expect_err("flush task only disconnects duplex if it exits with an error")) } } From 6a1aa526198e8049020ba8e6493d3391cdf75a90 Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Mon, 2 Dec 2024 16:31:44 +0000 Subject: [PATCH 33/35] review: move FlushHandle::handle_error right after ::flush Signed-off-by: Yuchen Liang --- .../src/virtual_file/owned_buffers_io/write/flush.rs | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs index 7c5c10b91806..e77603a7baa5 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs @@ -175,6 +175,13 @@ where Ok((recycled, flush_control)) } + async fn handle_error(&mut self) -> std::io::Result { + Err(self + .shutdown() + .await + .expect_err("flush task only disconnects duplex if it exits with an error")) + } + /// Cleans up the channel, join the flush task. pub async fn shutdown(&mut self) -> std::io::Result> { let handle = self @@ -192,10 +199,6 @@ where .as_mut() .expect("must not use after we returned an error") } - - async fn handle_error(&mut self) -> std::io::Result { - Err(self.shutdown().await.expect_err("flush task only disconnects duplex if it exits with an error")) - } } /// A background task for flushing data to disk. From 21ca0c499950f1493f48e3119674cd2ec52822dc Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Mon, 2 Dec 2024 16:37:16 +0000 Subject: [PATCH 34/35] review: set channel buffer size to 1 Signed-off-by: Yuchen Liang --- pageserver/src/virtual_file/owned_buffers_io/write/flush.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs index e77603a7baa5..9ce8b311bb5c 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs @@ -124,7 +124,8 @@ where where B: Buffer + Send + 'static, { - let (front, back) = duplex::mpsc::channel(2); + // It is fine to buffer up to only 1 message. We only 1 message in-flight at a time. + let (front, back) = duplex::mpsc::channel(1); let join_handle = tokio::spawn(async move { FlushBackgroundTask::new(back, file, gate_guard, ctx) From 1da40284c9284b1137363ea2c8490a72e2064579 Mon Sep 17 00:00:00 2001 From: Yuchen Liang Date: Tue, 3 Dec 2024 14:17:26 +0000 Subject: [PATCH 35/35] fix clippy Signed-off-by: Yuchen Liang --- pageserver/src/tenant/timeline/layer_manager.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index 88baa88f24b8..3888e7f86a9b 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -213,7 +213,7 @@ impl OpenLayerManager { ); let new_layer = - InMemoryLayer::create(conf, timeline_id, tenant_shard_id, start_lsn, &gate, ctx) + InMemoryLayer::create(conf, timeline_id, tenant_shard_id, start_lsn, gate, ctx) .await?; let layer = Arc::new(new_layer);