diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index b76105399ba5..0c245580eeda 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -363,6 +363,8 @@ pub enum TaskKind { EphemeralFilePreWarmPageCache, + LayerDownload, + #[cfg(test)] UnitTest, } diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index 6e90b3e8ffcf..1dc451f5c9f4 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -130,8 +130,9 @@ impl BlobWriter { async fn write_all_unbuffered, Buf: IoBuf + Send>( &mut self, src_buf: B, + ctx: &RequestContext, ) -> (B::Buf, Result<(), Error>) { - let (src_buf, res) = self.inner.write_all(src_buf).await; + let (src_buf, res) = self.inner.write_all(src_buf, ctx).await; let nbytes = match res { Ok(nbytes) => nbytes, Err(e) => return (src_buf, Err(e)), @@ -142,9 +143,9 @@ impl BlobWriter { #[inline(always)] /// Flushes the internal buffer to the underlying `VirtualFile`. - pub async fn flush_buffer(&mut self) -> Result<(), Error> { + pub async fn flush_buffer(&mut self, ctx: &RequestContext) -> Result<(), Error> { let buf = std::mem::take(&mut self.buf); - let (mut buf, res) = self.inner.write_all(buf).await; + let (mut buf, res) = self.inner.write_all(buf, ctx).await; res?; buf.clear(); self.buf = buf; @@ -165,10 +166,11 @@ impl BlobWriter { async fn write_all, Buf: IoBuf + Send>( &mut self, src_buf: B, + ctx: &RequestContext, ) -> (B::Buf, Result<(), Error>) { if !BUFFERED { assert!(self.buf.is_empty()); - return self.write_all_unbuffered(src_buf).await; + return self.write_all_unbuffered(src_buf, ctx).await; } let remaining = Self::CAPACITY - self.buf.len(); let src_buf_len = src_buf.bytes_init(); @@ -183,7 +185,7 @@ impl BlobWriter { } // Then, if the buffer is full, flush it out if self.buf.len() == Self::CAPACITY { - if let Err(e) = self.flush_buffer().await { + if let Err(e) = self.flush_buffer(ctx).await { return (Slice::into_inner(src_buf), Err(e)); } } @@ -199,7 +201,7 @@ impl BlobWriter { assert_eq!(copied, src_buf.len()); Slice::into_inner(src_buf) } else { - let (src_buf, res) = self.write_all_unbuffered(src_buf).await; + let (src_buf, res) = self.write_all_unbuffered(src_buf, ctx).await; if let Err(e) = res { return (src_buf, Err(e)); } @@ -216,6 +218,7 @@ impl BlobWriter { pub async fn write_blob, Buf: IoBuf + Send>( &mut self, srcbuf: B, + ctx: &RequestContext, ) -> (B::Buf, Result) { let offset = self.offset; @@ -227,7 +230,7 @@ impl BlobWriter { if len < 128 { // Short blob. Write a 1-byte length header io_buf.put_u8(len as u8); - self.write_all(io_buf).await + self.write_all(io_buf, ctx).await } else { // Write a 4-byte length header if len > 0x7fff_ffff { @@ -242,7 +245,7 @@ impl BlobWriter { let mut len_buf = (len as u32).to_be_bytes(); len_buf[0] |= 0x80; io_buf.extend_from_slice(&len_buf[..]); - self.write_all(io_buf).await + self.write_all(io_buf, ctx).await } } .await; @@ -251,7 +254,7 @@ impl BlobWriter { Ok(_) => (), Err(e) => return (Slice::into_inner(srcbuf.slice(..)), Err(e)), } - let (srcbuf, res) = self.write_all(srcbuf).await; + let (srcbuf, res) = self.write_all(srcbuf, ctx).await; (srcbuf, res.map(|_| offset)) } } @@ -261,8 +264,8 @@ impl BlobWriter { /// /// This function flushes the internal buffer before giving access /// to the underlying `VirtualFile`. - pub async fn into_inner(mut self) -> Result { - self.flush_buffer().await?; + pub async fn into_inner(mut self, ctx: &RequestContext) -> Result { + self.flush_buffer(ctx).await?; Ok(self.inner) } @@ -299,16 +302,16 @@ mod tests { let file = VirtualFile::create(pathbuf.as_path()).await?; let mut wtr = BlobWriter::::new(file, 0); for blob in blobs.iter() { - let (_, res) = wtr.write_blob(blob.clone()).await; + let (_, res) = wtr.write_blob(blob.clone(), &ctx).await; let offs = res?; offsets.push(offs); } // Write out one page worth of zeros so that we can // read again with read_blk - let (_, res) = wtr.write_blob(vec![0; PAGE_SZ]).await; + let (_, res) = wtr.write_blob(vec![0; PAGE_SZ], &ctx).await; let offs = res?; println!("Writing final blob at offs={offs}"); - wtr.flush_buffer().await?; + wtr.flush_buffer(&ctx).await?; } let file = VirtualFile::open(pathbuf.as_path()).await?; diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 96efd13c1b21..8b815a1885d3 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -74,7 +74,7 @@ impl EphemeralFile { pub(crate) async fn write_blob( &mut self, srcbuf: &[u8], - _ctx: &RequestContext, + ctx: &RequestContext, ) -> Result { let pos = self.rw.bytes_written(); @@ -83,15 +83,15 @@ impl EphemeralFile { // short one-byte length header let len_buf = [srcbuf.len() as u8]; - self.rw.write_all_borrowed(&len_buf).await?; + self.rw.write_all_borrowed(&len_buf, ctx).await?; } else { let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32); len_buf[0] |= 0x80; - self.rw.write_all_borrowed(&len_buf).await?; + self.rw.write_all_borrowed(&len_buf, ctx).await?; } // Write the payload - self.rw.write_all_borrowed(srcbuf).await?; + self.rw.write_all_borrowed(srcbuf, ctx).await?; Ok(pos) } diff --git a/pageserver/src/tenant/ephemeral_file/page_caching.rs b/pageserver/src/tenant/ephemeral_file/page_caching.rs index 934400e5be28..42def8858e93 100644 --- a/pageserver/src/tenant/ephemeral_file/page_caching.rs +++ b/pageserver/src/tenant/ephemeral_file/page_caching.rs @@ -35,10 +35,14 @@ impl RW { self.page_cache_file_id } - pub(crate) async fn write_all_borrowed(&mut self, srcbuf: &[u8]) -> Result { + pub(crate) async fn write_all_borrowed( + &mut self, + srcbuf: &[u8], + ctx: &RequestContext, + ) -> Result { // It doesn't make sense to proactively fill the page cache on the Pageserver write path // because Compute is unlikely to access recently written data. - self.rw.write_all_borrowed(srcbuf).await + self.rw.write_all_borrowed(srcbuf, ctx).await } pub(crate) fn bytes_written(&self) -> u64 { @@ -134,6 +138,7 @@ impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmi >( &mut self, buf: B, + ctx: &RequestContext, ) -> std::io::Result<(usize, B::Buf)> { let buf = buf.slice(..); let saved_bounds = buf.bounds(); // save for reconstructing the Slice from iobuf after the IO is done @@ -150,7 +155,7 @@ impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmi ); // Do the IO. - let iobuf = match self.file.write_all(buf).await { + let iobuf = match self.file.write_all(buf, ctx).await { (iobuf, Ok(nwritten)) => { assert_eq!(nwritten, buflen); iobuf diff --git a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs index 4159b5820aef..b37eafb52c5b 100644 --- a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs +++ b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs @@ -20,6 +20,7 @@ mod zero_padded; use crate::{ + context::RequestContext, page_cache::PAGE_SZ, virtual_file::owned_buffers_io::{ self, @@ -60,8 +61,12 @@ where self.buffered_writer.as_inner().as_inner() } - pub async fn write_all_borrowed(&mut self, buf: &[u8]) -> std::io::Result { - self.buffered_writer.write_buffered_borrowed(buf).await + pub async fn write_all_borrowed( + &mut self, + buf: &[u8], + ctx: &RequestContext, + ) -> std::io::Result { + self.buffered_writer.write_buffered_borrowed(buf, ctx).await } pub fn bytes_written(&self) -> u64 { diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index c0767345caa8..a54e93c96b96 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -210,6 +210,7 @@ use tracing::{debug, error, info, instrument, warn}; use tracing::{info_span, Instrument}; use utils::lsn::Lsn; +use crate::context::RequestContext; use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError}; use crate::metrics::{ MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics, @@ -505,6 +506,7 @@ impl RemoteTimelineClient { layer_file_name: &LayerFileName, layer_metadata: &LayerFileMetadata, cancel: &CancellationToken, + ctx: &RequestContext, ) -> anyhow::Result { let downloaded_size = { let _unfinished_gauge_guard = self.metrics.call_begin( @@ -522,6 +524,7 @@ impl RemoteTimelineClient { layer_file_name, layer_metadata, cancel, + ctx, ) .measure_remote_op( RemoteOpFileKind::Layer, diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 250354ac205d..345a12aa866a 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -18,6 +18,7 @@ use tracing::warn; use utils::backoff; use crate::config::PageServerConf; +use crate::context::RequestContext; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path}; use crate::tenant::storage_layer::LayerFileName; @@ -40,6 +41,7 @@ use super::{ /// in the metadata. (In the future, we might do more cross-checks, like CRC validation) /// /// Returns the size of the downloaded file. +#[allow(clippy::too_many_arguments)] pub async fn download_layer_file<'a>( conf: &'static PageServerConf, storage: &'a GenericRemoteStorage, @@ -48,6 +50,7 @@ pub async fn download_layer_file<'a>( layer_file_name: &'a LayerFileName, layer_metadata: &'a LayerFileMetadata, cancel: &CancellationToken, + ctx: &RequestContext, ) -> Result { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -75,7 +78,7 @@ pub async fn download_layer_file<'a>( let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION); let bytes_amount = download_retry( - || async { download_object(storage, &remote_path, &temp_file_path, cancel).await }, + || async { download_object(storage, &remote_path, &temp_file_path, cancel, ctx).await }, &format!("download {remote_path:?}"), cancel, ) @@ -133,6 +136,7 @@ async fn download_object<'a>( src_path: &RemotePath, dst_path: &Utf8PathBuf, cancel: &CancellationToken, + ctx: &RequestContext, ) -> Result { let res = match crate::virtual_file::io_engine::get() { crate::virtual_file::io_engine::IoEngine::NotSet => panic!("unset"), @@ -208,10 +212,10 @@ async fn download_object<'a>( Err(e) => return Err(e), }; buffered - .write_buffered(tokio_epoll_uring::BoundedBuf::slice_full(chunk)) + .write_buffered(tokio_epoll_uring::BoundedBuf::slice_full(chunk), ctx) .await?; } - let size_tracking = buffered.flush_and_into_inner().await?; + let size_tracking = buffered.flush_and_into_inner(ctx).await?; Ok(size_tracking.into_inner()) } .await?; diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index 19f36c722e7b..5c46df268a9c 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -7,6 +7,7 @@ use std::{sync::Arc, time::SystemTime}; use crate::{ config::PageServerConf, + context::RequestContext, disk_usage_eviction_task::DiskUsageEvictionInfo, task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, virtual_file::MaybeFatalIo, @@ -316,9 +317,13 @@ pub fn spawn_tasks( let (upload_req_tx, upload_req_rx) = tokio::sync::mpsc::channel::>(16); + let downloader_task_ctx = RequestContext::new( + TaskKind::SecondaryDownloads, + crate::context::DownloadBehavior::Download, + ); task_mgr::spawn( BACKGROUND_RUNTIME.handle(), - TaskKind::SecondaryDownloads, + downloader_task_ctx.task_kind(), None, None, "secondary tenant downloads", @@ -330,6 +335,7 @@ pub fn spawn_tasks( download_req_rx, bg_jobs_clone, cancel_clone, + downloader_task_ctx, ) .await; diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 67f866cb7b53..8a987b5ade45 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -8,6 +8,7 @@ use std::{ use crate::{ config::PageServerConf, + context::RequestContext, disk_usage_eviction_task::{ finite_f32, DiskUsageEvictionInfo, EvictionCandidate, EvictionLayer, EvictionSecondaryLayer, }, @@ -74,12 +75,14 @@ pub(super) async fn downloader_task( command_queue: tokio::sync::mpsc::Receiver>, background_jobs_can_start: Barrier, cancel: CancellationToken, + root_ctx: RequestContext, ) { let concurrency = tenant_manager.get_conf().secondary_download_concurrency; let generator = SecondaryDownloader { tenant_manager, remote_storage, + root_ctx, }; let mut scheduler = Scheduler::new(generator, concurrency); @@ -92,6 +95,7 @@ pub(super) async fn downloader_task( struct SecondaryDownloader { tenant_manager: Arc, remote_storage: GenericRemoteStorage, + root_ctx: RequestContext, } #[derive(Debug, Clone)] @@ -367,11 +371,12 @@ impl JobGenerator { @@ -485,7 +490,7 @@ impl<'a> TenantDownloader<'a> { } } - async fn download(&self) -> Result<(), UpdateError> { + async fn download(&self, ctx: &RequestContext) -> Result<(), UpdateError> { debug_assert_current_span_has_tenant_id(); // For the duration of a download, we must hold the SecondaryTenant::gate, to ensure @@ -560,7 +565,7 @@ impl<'a> TenantDownloader<'a> { } let timeline_id = timeline.timeline_id; - self.download_timeline(timeline) + self.download_timeline(timeline, ctx) .instrument(tracing::info_span!( "secondary_download_timeline", tenant_id=%tenant_shard_id.tenant_id, @@ -742,7 +747,11 @@ impl<'a> TenantDownloader<'a> { .and_then(|x| x) } - async fn download_timeline(&self, timeline: HeatMapTimeline) -> Result<(), UpdateError> { + async fn download_timeline( + &self, + timeline: HeatMapTimeline, + ctx: &RequestContext, + ) -> Result<(), UpdateError> { debug_assert_current_span_has_tenant_and_timeline_id(); let tenant_shard_id = self.secondary_state.get_tenant_shard_id(); let timeline_path = self @@ -875,6 +884,7 @@ impl<'a> TenantDownloader<'a> { &layer.name, &LayerFileMetadata::from(&layer.metadata), &self.secondary_state.cancel, + ctx, ) .await { diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index a9f840415885..b5538dff3af5 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -428,9 +428,15 @@ impl DeltaLayerWriterInner { /// /// The values must be appended in key, lsn order. /// - async fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> { + async fn put_value( + &mut self, + key: Key, + lsn: Lsn, + val: Value, + ctx: &RequestContext, + ) -> anyhow::Result<()> { let (_, res) = self - .put_value_bytes(key, lsn, Value::ser(&val)?, val.will_init()) + .put_value_bytes(key, lsn, Value::ser(&val)?, val.will_init(), ctx) .await; res } @@ -441,9 +447,10 @@ impl DeltaLayerWriterInner { lsn: Lsn, val: Vec, will_init: bool, + ctx: &RequestContext, ) -> (Vec, anyhow::Result<()>) { assert!(self.lsn_range.start <= lsn); - let (val, res) = self.blob_writer.write_blob(val).await; + let (val, res) = self.blob_writer.write_blob(val, ctx).await; let off = match res { Ok(off) => off, Err(e) => return (val, Err(anyhow::anyhow!(e))), @@ -463,18 +470,23 @@ impl DeltaLayerWriterInner { /// /// Finish writing the delta layer. /// - async fn finish(self, key_end: Key, timeline: &Arc) -> anyhow::Result { + async fn finish( + self, + key_end: Key, + timeline: &Arc, + ctx: &RequestContext, + ) -> anyhow::Result { let index_start_blk = ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32; - let mut file = self.blob_writer.into_inner().await?; + let mut file = self.blob_writer.into_inner(ctx).await?; // Write out the index let (index_root_blk, block_buf) = self.tree.finish()?; file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64)) .await?; for buf in block_buf.blocks { - let (_buf, res) = file.write_all(buf).await; + let (_buf, res) = file.write_all(buf, ctx).await; res?; } assert!(self.lsn_range.start < self.lsn_range.end); @@ -494,7 +506,7 @@ impl DeltaLayerWriterInner { // TODO: could use smallvec here but it's a pain with Slice Summary::ser_into(&summary, &mut buf)?; file.seek(SeekFrom::Start(0)).await?; - let (_buf, res) = file.write_all(buf).await; + let (_buf, res) = file.write_all(buf, ctx).await; res?; let metadata = file @@ -592,8 +604,18 @@ impl DeltaLayerWriter { /// /// The values must be appended in key, lsn order. /// - pub async fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> { - self.inner.as_mut().unwrap().put_value(key, lsn, val).await + pub async fn put_value( + &mut self, + key: Key, + lsn: Lsn, + val: Value, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + self.inner + .as_mut() + .unwrap() + .put_value(key, lsn, val, ctx) + .await } pub async fn put_value_bytes( @@ -602,11 +624,12 @@ impl DeltaLayerWriter { lsn: Lsn, val: Vec, will_init: bool, + ctx: &RequestContext, ) -> (Vec, anyhow::Result<()>) { self.inner .as_mut() .unwrap() - .put_value_bytes(key, lsn, val, will_init) + .put_value_bytes(key, lsn, val, will_init, ctx) .await } @@ -621,10 +644,11 @@ impl DeltaLayerWriter { mut self, key_end: Key, timeline: &Arc, + ctx: &RequestContext, ) -> anyhow::Result { let inner = self.inner.take().unwrap(); let temp_path = inner.path.clone(); - let result = inner.finish(key_end, timeline).await; + let result = inner.finish(key_end, timeline, ctx).await; // The delta layer files can sometimes be really large. Clean them up. if result.is_err() { tracing::warn!( @@ -692,7 +716,7 @@ impl DeltaLayer { // TODO: could use smallvec here, but it's a pain with Slice Summary::ser_into(&new_summary, &mut buf).context("serialize")?; file.seek(SeekFrom::Start(0)).await?; - let (_buf, res) = file.write_all(buf).await; + let (_buf, res) = file.write_all(buf, ctx).await; res?; Ok(()) } @@ -1281,7 +1305,13 @@ impl DeltaLayerInner { per_blob_copy.extend_from_slice(data); let (tmp, res) = writer - .put_value_bytes(key, lsn, std::mem::take(&mut per_blob_copy), will_init) + .put_value_bytes( + key, + lsn, + std::mem::take(&mut per_blob_copy), + will_init, + ctx, + ) .await; per_blob_copy = tmp; res?; @@ -1760,12 +1790,14 @@ mod test { for entry in entries { let (_, res) = writer - .put_value_bytes(entry.key, entry.lsn, entry.value, false) + .put_value_bytes(entry.key, entry.lsn, entry.value, false, &ctx) .await; res?; } - let resident = writer.finish(entries_meta.key_range.end, &timeline).await?; + let resident = writer + .finish(entries_meta.key_range.end, &timeline, &ctx) + .await?; let inner = resident.as_delta(&ctx).await?; @@ -1951,7 +1983,7 @@ mod test { .await .unwrap(); - let copied_layer = writer.finish(Key::MAX, &branch).await.unwrap(); + let copied_layer = writer.finish(Key::MAX, &branch, ctx).await.unwrap(); copied_layer.as_delta(ctx).await.unwrap(); diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 6f46a0203b08..1477a1fc33cf 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -357,7 +357,7 @@ impl ImageLayer { // TODO: could use smallvec here but it's a pain with Slice Summary::ser_into(&new_summary, &mut buf).context("serialize")?; file.seek(SeekFrom::Start(0)).await?; - let (_buf, res) = file.write_all(buf).await; + let (_buf, res) = file.write_all(buf, ctx).await; res?; Ok(()) } @@ -677,9 +677,14 @@ impl ImageLayerWriterInner { /// /// The page versions must be appended in blknum order. /// - async fn put_image(&mut self, key: Key, img: Bytes) -> anyhow::Result<()> { + async fn put_image( + &mut self, + key: Key, + img: Bytes, + ctx: &RequestContext, + ) -> anyhow::Result<()> { ensure!(self.key_range.contains(&key)); - let (_img, res) = self.blob_writer.write_blob(img).await; + let (_img, res) = self.blob_writer.write_blob(img, ctx).await; // TODO: re-use the buffer for `img` further upstack let off = res?; @@ -693,7 +698,11 @@ impl ImageLayerWriterInner { /// /// Finish writing the image layer. /// - async fn finish(self, timeline: &Arc) -> anyhow::Result { + async fn finish( + self, + timeline: &Arc, + ctx: &RequestContext, + ) -> anyhow::Result { let index_start_blk = ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32; @@ -704,7 +713,7 @@ impl ImageLayerWriterInner { .await?; let (index_root_blk, block_buf) = self.tree.finish()?; for buf in block_buf.blocks { - let (_buf, res) = file.write_all(buf).await; + let (_buf, res) = file.write_all(buf, ctx).await; res?; } @@ -724,7 +733,7 @@ impl ImageLayerWriterInner { // TODO: could use smallvec here but it's a pain with Slice Summary::ser_into(&summary, &mut buf)?; file.seek(SeekFrom::Start(0)).await?; - let (_buf, res) = file.write_all(buf).await; + let (_buf, res) = file.write_all(buf, ctx).await; res?; let metadata = file @@ -806,8 +815,13 @@ impl ImageLayerWriter { /// /// The page versions must be appended in blknum order. /// - pub async fn put_image(&mut self, key: Key, img: Bytes) -> anyhow::Result<()> { - self.inner.as_mut().unwrap().put_image(key, img).await + pub async fn put_image( + &mut self, + key: Key, + img: Bytes, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + self.inner.as_mut().unwrap().put_image(key, img, ctx).await } /// @@ -816,8 +830,9 @@ impl ImageLayerWriter { pub(crate) async fn finish( mut self, timeline: &Arc, + ctx: &RequestContext, ) -> anyhow::Result { - self.inner.take().unwrap().finish(timeline).await + self.inner.take().unwrap().finish(timeline, ctx).await } } diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index a2ae8ec29d5c..4dacbec2f34e 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -659,14 +659,14 @@ impl InMemoryLayer { let will_init = Value::des(&buf)?.will_init(); let res; (buf, res) = delta_layer_writer - .put_value_bytes(*key, *lsn, buf, will_init) + .put_value_bytes(*key, *lsn, buf, will_init, &ctx) .await; res?; } } // MAX is used here because we identify L0 layers by full key range - let delta_layer = delta_layer_writer.finish(Key::MAX, timeline).await?; + let delta_layer = delta_layer_writer.finish(Key::MAX, timeline, &ctx).await?; Ok(Some(delta_layer)) } } diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 2b6934fcee04..ebc0cbf9a4bb 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -14,9 +14,10 @@ use utils::lsn::Lsn; use utils::sync::heavier_once_cell; use crate::config::PageServerConf; -use crate::context::RequestContext; +use crate::context::{DownloadBehavior, RequestContext}; use crate::repository::Key; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::task_mgr::TaskKind; use crate::tenant::timeline::GetVectoredError; use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline}; @@ -939,11 +940,20 @@ impl LayerInner { return Err(DownloadError::DownloadRequired); } + let download_ctx = ctx + .map(|ctx| ctx.detached_child(TaskKind::LayerDownload, DownloadBehavior::Download)) + .unwrap_or(RequestContext::new( + TaskKind::LayerDownload, + DownloadBehavior::Download, + )); + async move { tracing::info!(%reason, "downloading on-demand"); let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); - let res = self.download_init_and_wait(timeline, permit).await?; + let res = self + .download_init_and_wait(timeline, permit, download_ctx) + .await?; scopeguard::ScopeGuard::into_inner(init_cancelled); Ok(res) } @@ -982,6 +992,7 @@ impl LayerInner { self: &Arc, timeline: Arc, permit: heavier_once_cell::InitPermit, + ctx: RequestContext, ) -> Result, DownloadError> { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -1011,7 +1022,7 @@ impl LayerInner { .await .unwrap(); - let res = this.download_and_init(timeline, permit).await; + let res = this.download_and_init(timeline, permit, &ctx).await; if let Err(res) = tx.send(res) { match res { @@ -1054,6 +1065,7 @@ impl LayerInner { self: &Arc, timeline: Arc, permit: heavier_once_cell::InitPermit, + ctx: &RequestContext, ) -> anyhow::Result> { let client = timeline .remote_client @@ -1061,7 +1073,12 @@ impl LayerInner { .expect("checked before download_init_and_wait"); let result = client - .download_layer_file(&self.desc.filename(), &self.metadata(), &timeline.cancel) + .download_layer_file( + &self.desc.filename(), + &self.metadata(), + &timeline.cancel, + ctx, + ) .await; match result { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 3c0a300a9ac1..22bfa5344533 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4179,7 +4179,7 @@ impl Timeline { }; // Write all the keys we just read into our new image layer. - image_layer_writer.put_image(img_key, img).await?; + image_layer_writer.put_image(img_key, img, ctx).await?; wrote_keys = true; } } @@ -4190,7 +4190,7 @@ impl Timeline { // Normal path: we have written some data into the new image layer for this // partition, so flush it to disk. start = img_range.end; - let image_layer = image_layer_writer.finish(self).await?; + let image_layer = image_layer_writer.finish(self, ctx).await?; image_layers.push(image_layer); } else { // Special case: the image layer may be empty if this is a sharded tenant and the diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 6ea37bf79346..1088101a1316 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -520,7 +520,7 @@ impl Timeline { writer .take() .unwrap() - .finish(prev_key.unwrap().next(), self) + .finish(prev_key.unwrap().next(), self, ctx) .await?, ); writer = None; @@ -562,7 +562,11 @@ impl Timeline { ); } - writer.as_mut().unwrap().put_value(key, lsn, value).await?; + writer + .as_mut() + .unwrap() + .put_value(key, lsn, value, ctx) + .await?; } else { debug!( "Dropping key {} during compaction (it belongs on shard {:?})", @@ -578,7 +582,7 @@ impl Timeline { prev_key = Some(key); } if let Some(writer) = writer { - new_layers.push(writer.finish(prev_key.unwrap().next(), self).await?); + new_layers.push(writer.finish(prev_key.unwrap().next(), self, ctx).await?); } // Sync layers @@ -972,7 +976,7 @@ impl CompactionJobExecutor for TimelineAdaptor { let value = val.load(ctx).await?; - writer.put_value(key, lsn, value).await?; + writer.put_value(key, lsn, value, ctx).await?; prev = Some((key, lsn)); } @@ -988,7 +992,7 @@ impl CompactionJobExecutor for TimelineAdaptor { }); let new_delta_layer = writer - .finish(prev.unwrap().0.next(), &self.timeline) + .finish(prev.unwrap().0.next(), &self.timeline, ctx) .await?; self.new_deltas.push(new_delta_layer); @@ -1058,11 +1062,11 @@ impl TimelineAdaptor { } } }; - image_layer_writer.put_image(key, img).await?; + image_layer_writer.put_image(key, img, ctx).await?; key = key.next(); } } - let image_layer = image_layer_writer.finish(&self.timeline).await?; + let image_layer = image_layer_writer.finish(&self.timeline, ctx).await?; self.new_images.push(image_layer); diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 6127b350790a..a17488a28636 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -10,6 +10,7 @@ //! This is similar to PostgreSQL's virtual file descriptor facility in //! src/backend/storage/file/fd.c //! +use crate::context::RequestContext; use crate::metrics::{StorageIoOperation, STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC}; use crate::page_cache::PageWriteGuard; @@ -615,6 +616,7 @@ impl VirtualFile { &self, buf: B, mut offset: u64, + ctx: &RequestContext, ) -> (B::Buf, Result<(), Error>) { let buf_len = buf.bytes_init(); if buf_len == 0 { @@ -623,7 +625,7 @@ impl VirtualFile { let mut buf = buf.slice(0..buf_len); while !buf.is_empty() { let res; - (buf, res) = self.write_at(buf, offset).await; + (buf, res) = self.write_at(buf, offset, ctx).await; match res { Ok(0) => { return ( @@ -652,6 +654,7 @@ impl VirtualFile { pub async fn write_all, Buf: IoBuf + Send>( &mut self, buf: B, + ctx: &RequestContext, ) -> (B::Buf, Result) { let nbytes = buf.bytes_init(); if nbytes == 0 { @@ -660,7 +663,7 @@ impl VirtualFile { let mut buf = buf.slice(0..nbytes); while !buf.is_empty() { let res; - (buf, res) = self.write(buf).await; + (buf, res) = self.write(buf, ctx).await; match res { Ok(0) => { return ( @@ -684,9 +687,10 @@ impl VirtualFile { async fn write( &mut self, buf: Slice, + ctx: &RequestContext, ) -> (Slice, Result) { let pos = self.pos; - let (buf, res) = self.write_at(buf, pos).await; + let (buf, res) = self.write_at(buf, pos, ctx).await; let n = match res { Ok(n) => n, Err(e) => return (buf, Err(e)), @@ -724,6 +728,7 @@ impl VirtualFile { &self, buf: Slice, offset: u64, + _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */ ) -> (Slice, Result) { let file_guard = match self.lock_file().await { Ok(file_guard) => file_guard, @@ -1088,8 +1093,9 @@ impl OwnedAsyncWriter for VirtualFile { async fn write_all, Buf: IoBuf + Send>( &mut self, buf: B, + ctx: &RequestContext, ) -> std::io::Result<(usize, B::Buf)> { - let (buf, res) = VirtualFile::write_all(self, buf).await; + let (buf, res) = VirtualFile::write_all(self, buf, ctx).await; res.map(move |v| (v, buf)) } } @@ -1146,6 +1152,9 @@ fn get_open_files() -> &'static OpenFiles { #[cfg(test)] mod tests { + use crate::context::DownloadBehavior; + use crate::task_mgr::TaskKind; + use super::*; use rand::seq::SliceRandom; use rand::thread_rng; @@ -1177,10 +1186,11 @@ mod tests { &self, buf: B, offset: u64, + ctx: &RequestContext, ) -> Result<(), Error> { match self { MaybeVirtualFile::VirtualFile(file) => { - let (_buf, res) = file.write_all_at(buf, offset).await; + let (_buf, res) = file.write_all_at(buf, offset, ctx).await; res } MaybeVirtualFile::File(file) => { @@ -1201,10 +1211,11 @@ mod tests { async fn write_all, Buf: IoBuf + Send>( &mut self, buf: B, + ctx: &RequestContext, ) -> Result<(), Error> { match self { MaybeVirtualFile::VirtualFile(file) => { - let (_buf, res) = file.write_all(buf).await; + let (_buf, res) = file.write_all(buf, ctx).await; res.map(|_| ()) } MaybeVirtualFile::File(file) => { @@ -1275,6 +1286,7 @@ mod tests { OF: Fn(Utf8PathBuf, OpenOptions) -> FT, FT: Future>, { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); let testdir = crate::config::PageServerConf::test_repo_dir(testname); std::fs::create_dir_all(&testdir)?; @@ -1288,7 +1300,7 @@ mod tests { .to_owned(), ) .await?; - file_a.write_all(b"foobar".to_vec()).await?; + file_a.write_all(b"foobar".to_vec(), &ctx).await?; // cannot read from a file opened in write-only mode let _ = file_a.read_string().await.unwrap_err(); @@ -1297,7 +1309,7 @@ mod tests { let mut file_a = openfunc(path_a, OpenOptions::new().read(true).to_owned()).await?; // cannot write to a file opened in read-only mode - let _ = file_a.write_all(b"bar".to_vec()).await.unwrap_err(); + let _ = file_a.write_all(b"bar".to_vec(), &ctx).await.unwrap_err(); // Try simple read assert_eq!("foobar", file_a.read_string().await?); @@ -1339,8 +1351,8 @@ mod tests { .to_owned(), ) .await?; - file_b.write_all_at(b"BAR".to_vec(), 3).await?; - file_b.write_all_at(b"FOO".to_vec(), 0).await?; + file_b.write_all_at(b"BAR".to_vec(), 3, &ctx).await?; + file_b.write_all_at(b"FOO".to_vec(), 0, &ctx).await?; assert_eq!(file_b.read_string_at(2, 3).await?, "OBA"); diff --git a/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs b/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs index c2817699c375..55b1d0b46bfe 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs @@ -1,4 +1,4 @@ -use crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter; +use crate::{context::RequestContext, virtual_file::owned_buffers_io::write::OwnedAsyncWriter}; use tokio_epoll_uring::{BoundedBuf, IoBuf}; pub struct Writer { @@ -38,8 +38,9 @@ where async fn write_all, Buf: IoBuf + Send>( &mut self, buf: B, + ctx: &RequestContext, ) -> std::io::Result<(usize, B::Buf)> { - let (nwritten, buf) = self.dst.write_all(buf).await?; + let (nwritten, buf) = self.dst.write_all(buf, ctx).await?; self.bytes_amount += u64::try_from(nwritten).unwrap(); Ok((nwritten, buf)) } diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index 738a64233215..ac5169508f82 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -1,12 +1,15 @@ use bytes::BytesMut; use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice}; +use crate::context::RequestContext; + /// A trait for doing owned-buffer write IO. /// Think [`tokio::io::AsyncWrite`] but with owned buffers. pub trait OwnedAsyncWriter { async fn write_all, Buf: IoBuf + Send>( &mut self, buf: B, + ctx: &RequestContext, ) -> std::io::Result<(usize, B::Buf)>; } @@ -57,8 +60,9 @@ where } #[cfg_attr(target_os = "macos", allow(dead_code))] - pub async fn flush_and_into_inner(mut self) -> std::io::Result { - self.flush().await?; + pub async fn flush_and_into_inner(mut self, ctx: &RequestContext) -> std::io::Result { + self.flush(ctx).await?; + let Self { buf, writer } = self; assert!(buf.is_some()); Ok(writer) @@ -72,14 +76,18 @@ where } #[cfg_attr(target_os = "macos", allow(dead_code))] - pub async fn write_buffered(&mut self, chunk: Slice) -> std::io::Result<(usize, S)> + pub async fn write_buffered( + &mut self, + chunk: Slice, + ctx: &RequestContext, + ) -> std::io::Result<(usize, S)> where S: IoBuf + Send, { let chunk_len = chunk.len(); // avoid memcpy for the middle of the chunk if chunk.len() >= self.buf().cap() { - self.flush().await?; + self.flush(ctx).await?; // do a big write, bypassing `buf` assert_eq!( self.buf @@ -88,7 +96,7 @@ where .pending(), 0 ); - let (nwritten, chunk) = self.writer.write_all(chunk).await?; + let (nwritten, chunk) = self.writer.write_all(chunk, ctx).await?; assert_eq!(nwritten, chunk_len); return Ok((nwritten, chunk)); } @@ -104,7 +112,7 @@ where slice = &slice[n..]; if buf.pending() >= buf.cap() { assert_eq!(buf.pending(), buf.cap()); - self.flush().await?; + self.flush(ctx).await?; } } assert!(slice.is_empty(), "by now we should have drained the chunk"); @@ -116,7 +124,11 @@ where /// It is less performant because we always have to copy the borrowed data into the internal buffer /// before we can do the IO. The [`Self::write_buffered`] can avoid this, which is more performant /// for large writes. - pub async fn write_buffered_borrowed(&mut self, mut chunk: &[u8]) -> std::io::Result { + pub async fn write_buffered_borrowed( + &mut self, + mut chunk: &[u8], + ctx: &RequestContext, + ) -> std::io::Result { let chunk_len = chunk.len(); while !chunk.is_empty() { let buf = self.buf.as_mut().expect("must not use after an error"); @@ -127,20 +139,20 @@ where chunk = &chunk[n..]; if buf.pending() >= buf.cap() { assert_eq!(buf.pending(), buf.cap()); - self.flush().await?; + self.flush(ctx).await?; } } Ok(chunk_len) } - async fn flush(&mut self) -> std::io::Result<()> { + async fn flush(&mut self, ctx: &RequestContext) -> std::io::Result<()> { let buf = self.buf.take().expect("must not use after an error"); let buf_len = buf.pending(); if buf_len == 0 { self.buf = Some(buf); return Ok(()); } - let (nwritten, io_buf) = self.writer.write_all(buf.flush()).await?; + let (nwritten, io_buf) = self.writer.write_all(buf.flush(), ctx).await?; assert_eq!(nwritten, buf_len); self.buf = Some(Buffer::reuse_after_flush(io_buf)); Ok(()) @@ -206,6 +218,7 @@ impl OwnedAsyncWriter for Vec { async fn write_all, Buf: IoBuf + Send>( &mut self, buf: B, + _: &RequestContext, ) -> std::io::Result<(usize, B::Buf)> { let nbytes = buf.bytes_init(); if nbytes == 0 { @@ -222,6 +235,8 @@ mod tests { use bytes::BytesMut; use super::*; + use crate::context::{DownloadBehavior, RequestContext}; + use crate::task_mgr::TaskKind; #[derive(Default)] struct RecorderWriter { @@ -231,6 +246,7 @@ mod tests { async fn write_all, Buf: IoBuf + Send>( &mut self, buf: B, + _: &RequestContext, ) -> std::io::Result<(usize, B::Buf)> { let nbytes = buf.bytes_init(); if nbytes == 0 { @@ -243,10 +259,14 @@ mod tests { } } + fn test_ctx() -> RequestContext { + RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error) + } + macro_rules! write { ($writer:ident, $data:literal) => {{ $writer - .write_buffered(::bytes::Bytes::from_static($data).slice_full()) + .write_buffered(::bytes::Bytes::from_static($data).slice_full(), &test_ctx()) .await?; }}; } @@ -260,7 +280,7 @@ mod tests { write!(writer, b"c"); write!(writer, b"d"); write!(writer, b"e"); - let recorder = writer.flush_and_into_inner().await?; + let recorder = writer.flush_and_into_inner(&test_ctx()).await?; assert_eq!( recorder.writes, vec![Vec::from(b"ab"), Vec::from(b"cd"), Vec::from(b"e")] @@ -276,7 +296,7 @@ mod tests { write!(writer, b"de"); write!(writer, b""); write!(writer, b"fghijk"); - let recorder = writer.flush_and_into_inner().await?; + let recorder = writer.flush_and_into_inner(&test_ctx()).await?; assert_eq!( recorder.writes, vec![Vec::from(b"abc"), Vec::from(b"de"), Vec::from(b"fghijk")] @@ -292,7 +312,7 @@ mod tests { write!(writer, b"bc"); write!(writer, b"d"); write!(writer, b"e"); - let recorder = writer.flush_and_into_inner().await?; + let recorder = writer.flush_and_into_inner(&test_ctx()).await?; assert_eq!( recorder.writes, vec![Vec::from(b"a"), Vec::from(b"bc"), Vec::from(b"de")] @@ -302,18 +322,20 @@ mod tests { #[tokio::test] async fn test_write_all_borrowed_always_goes_through_buffer() -> std::io::Result<()> { + let ctx = test_ctx(); + let ctx = &ctx; let recorder = RecorderWriter::default(); let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2)); - writer.write_buffered_borrowed(b"abc").await?; - writer.write_buffered_borrowed(b"d").await?; - writer.write_buffered_borrowed(b"e").await?; - writer.write_buffered_borrowed(b"fg").await?; - writer.write_buffered_borrowed(b"hi").await?; - writer.write_buffered_borrowed(b"j").await?; - writer.write_buffered_borrowed(b"klmno").await?; + writer.write_buffered_borrowed(b"abc", ctx).await?; + writer.write_buffered_borrowed(b"d", ctx).await?; + writer.write_buffered_borrowed(b"e", ctx).await?; + writer.write_buffered_borrowed(b"fg", ctx).await?; + writer.write_buffered_borrowed(b"hi", ctx).await?; + writer.write_buffered_borrowed(b"j", ctx).await?; + writer.write_buffered_borrowed(b"klmno", ctx).await?; - let recorder = writer.flush_and_into_inner().await?; + let recorder = writer.flush_and_into_inner(ctx).await?; assert_eq!( recorder.writes, {