From d99b33e6039f73ac070fe608483db334e0412206 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 4 Sep 2024 01:08:56 +0200 Subject: [PATCH 01/17] Add timeline offload mechanism --- pageserver/src/tenant.rs | 62 +++++++++++++++-------- pageserver/src/tenant/timeline.rs | 13 ++++- pageserver/src/tenant/timeline/delete.rs | 6 +-- pageserver/src/tenant/timeline/offload.rs | 26 ++++++++++ 4 files changed, 81 insertions(+), 26 deletions(-) create mode 100644 pageserver/src/tenant/timeline/offload.rs diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index db88303f7bdb..66bfc4cbf1b6 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -38,6 +38,7 @@ use std::future::Future; use std::sync::Weak; use std::time::SystemTime; use storage_broker::BrokerClientChannel; +use timeline::offload::offload_timeline; use tokio::io::BufReader; use tokio::sync::watch; use tokio::task::JoinSet; @@ -1876,7 +1877,7 @@ impl Tenant { /// /// Returns whether we have pending compaction task. async fn compaction_iteration( - &self, + self: &Arc, cancel: &CancellationToken, ctx: &RequestContext, ) -> Result { @@ -1897,21 +1898,27 @@ impl Tenant { // while holding the lock. Then drop the lock and actually perform the // compactions. We don't want to block everything else while the // compaction runs. - let timelines_to_compact = { + let timelines_to_compact_or_offload; + { let timelines = self.timelines.lock().unwrap(); - let timelines_to_compact = timelines + timelines_to_compact_or_offload = timelines .iter() .filter_map(|(timeline_id, timeline)| { - if timeline.is_active() { - Some((*timeline_id, timeline.clone())) - } else { + let (is_active, can_offload) = (timeline.is_active(), timeline.can_offload()); + let can_offload = can_offload && { + !timelines + .iter() + .any(|(_id, tl)| tl.get_ancestor_timeline_id() == Some(*timeline_id)) + }; + if (is_active, can_offload) == (false, false) { None + } else { + Some((*timeline_id, timeline.clone(), (is_active, can_offload))) } }) .collect::>(); drop(timelines); - timelines_to_compact - }; + } // Before doing any I/O work, check our circuit breaker if self.compaction_circuit_breaker.lock().unwrap().is_broken() { @@ -1921,20 +1928,31 @@ impl Tenant { let mut has_pending_task = false; - for (timeline_id, timeline) in &timelines_to_compact { - has_pending_task |= timeline - .compact(cancel, EnumSet::empty(), ctx) - .instrument(info_span!("compact_timeline", %timeline_id)) - .await - .inspect_err(|e| match e { - timeline::CompactionError::ShuttingDown => (), - timeline::CompactionError::Other(e) => { - self.compaction_circuit_breaker - .lock() - .unwrap() - .fail(&CIRCUIT_BREAKERS_BROKEN, e); - } - })?; + for (timeline_id, timeline, (can_compact, can_offload)) in &timelines_to_compact_or_offload + { + let pending_task_left = if *can_compact { + timeline + .compact(cancel, EnumSet::empty(), ctx) + .instrument(info_span!("compact_timeline", %timeline_id)) + .await + .inspect_err(|e| match e { + timeline::CompactionError::ShuttingDown => (), + timeline::CompactionError::Other(e) => { + self.compaction_circuit_breaker + .lock() + .unwrap() + .fail(&CIRCUIT_BREAKERS_BROKEN, e); + } + })? + } else { + false + }; + has_pending_task |= pending_task_left; + if !pending_task_left && *can_offload { + offload_timeline(self, timeline) + .await + .map_err(|e| timeline::CompactionError::Other(e))?; + } } self.compaction_circuit_breaker diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 2113a1d72600..f142ae40e7d3 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -7,6 +7,7 @@ pub(crate) mod handle; mod init; pub mod layer_manager; pub(crate) mod logical_size; +pub mod offload; pub mod span; pub mod uninit; mod walreceiver; @@ -1552,6 +1553,17 @@ impl Timeline { } } + /// Checks if the internal state of the timeline is consistent with it being able to be offloaded. + /// This is neccessary but not sufficient for offloading of the timeline as it might have + /// child timelines that are not offloaded yet. + pub(crate) fn can_offload(&self) -> bool { + if self.remote_client.is_archived() != Some(true) { + return false; + } + + true + } + /// Outermost timeline compaction operation; downloads needed layers. Returns whether we have pending /// compaction tasks. pub(crate) async fn compact( @@ -1814,7 +1826,6 @@ impl Timeline { self.current_state() == TimelineState::Active } - #[allow(unused)] pub(crate) fn is_archived(&self) -> Option { self.remote_client.is_archived() } diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 90db08ea819c..ca897ac0c01b 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -137,7 +137,7 @@ async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<( /// It is important that this gets called when DeletionGuard is being held. /// For more context see comments in [`DeleteTimelineFlow::prepare`] -async fn remove_timeline_from_tenant( +pub(super) async fn remove_timeline_from_tenant( tenant: &Tenant, timeline: &Timeline, _: &DeletionGuard, // using it as a witness @@ -290,7 +290,7 @@ impl DeleteTimelineFlow { Ok(()) } - fn prepare( + pub(super) fn prepare( tenant: &Tenant, timeline_id: TimelineId, ) -> Result<(Arc, DeletionGuard), DeleteTimelineError> { @@ -400,7 +400,7 @@ impl DeleteTimelineFlow { } } -struct DeletionGuard(OwnedMutexGuard); +pub(super) struct DeletionGuard(OwnedMutexGuard); impl Deref for DeletionGuard { type Target = DeleteTimelineFlow; diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs new file mode 100644 index 000000000000..2d7f13f4c31b --- /dev/null +++ b/pageserver/src/tenant/timeline/offload.rs @@ -0,0 +1,26 @@ +use std::sync::Arc; + +use crate::tenant::Tenant; + +use super::{ + delete::{delete_local_timeline_directory, remove_timeline_from_tenant, DeleteTimelineFlow}, + Timeline, +}; + +pub(crate) async fn offload_timeline( + tenant: &Tenant, + timeline: &Arc, +) -> anyhow::Result<()> { + let (timeline, guard) = DeleteTimelineFlow::prepare(tenant, timeline.timeline_id)?; + + // TODO extend guard mechanism above with method + // to make deletions possible while offloading is in progress + + // TODO mark timeline as offloaded in S3 + + let conf = &tenant.conf; + delete_local_timeline_directory(conf, tenant.tenant_shard_id, &timeline).await?; + + remove_timeline_from_tenant(tenant, &timeline, &guard).await?; + Ok(()) +} From fc331f0155fa2808e77f5c6300029e8d073cb712 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 20 Sep 2024 16:02:56 +0200 Subject: [PATCH 02/17] Add persistence for offloading in memory --- pageserver/src/tenant.rs | 115 +++++++++++++++------- pageserver/src/tenant/timeline/delete.rs | 2 +- pageserver/src/tenant/timeline/offload.rs | 42 +++++++- 3 files changed, 123 insertions(+), 36 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 66bfc4cbf1b6..861a27c760eb 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -290,6 +290,9 @@ pub struct Tenant { /// **Lock order**: if acquring both, acquire`timelines` before `timelines_creating` timelines_creating: std::sync::Mutex>, + /// Possibly offloaded and archived timelines + timelines_offloaded: Mutex>, + // This mutex prevents creation of new timelines during GC. // Adding yet another mutex (in addition to `timelines`) is needed because holding // `timelines` mutex during all GC iteration @@ -481,6 +484,20 @@ impl WalRedoManager { } } +pub struct OffloadedTimeline { + pub timeline_id: TimelineId, + pub ancestor_timeline_id: Option, +} + +impl OffloadedTimeline { + fn from_timeline(timeline: &Timeline) -> Self { + Self { + timeline_id: timeline.timeline_id, + ancestor_timeline_id: timeline.get_ancestor_timeline_id(), + } + } +} + #[derive(Debug, thiserror::Error, PartialEq, Eq)] pub enum GetTimelineError { #[error("Timeline is shutting down")] @@ -1405,11 +1422,35 @@ impl Tenant { state: TimelineArchivalState, ) -> Result<(), TimelineArchivalError> { info!("setting timeline archival config"); - let timeline = { + let timeline_or_unarchive_offloaded = 'outer: { let timelines = self.timelines.lock().unwrap(); let Some(timeline) = timelines.get(&timeline_id) else { - return Err(TimelineArchivalError::NotFound); + let offloaded_timelines = self.timelines_offloaded.lock().unwrap(); + let Some(offloaded) = offloaded_timelines.get(&timeline_id) else { + return Err(TimelineArchivalError::NotFound); + }; + if state == TimelineArchivalState::Archived { + // It's offloaded, so nothing to do for archival + return Ok(()); + } + if let Some(ancestor_timeline_id) = offloaded.ancestor_timeline_id { + let has_archived_parent = + if let Some(ancestor_timeline) = timelines.get(&timeline_id) { + ancestor_timeline.is_archived() == Some(true) + } else if offloaded_timelines.contains_key(&ancestor_timeline_id) { + true + } else { + error!("ancestor timeline {ancestor_timeline_id} not found"); + return Err(TimelineArchivalError::NotFound); + }; + if has_archived_parent { + return Err(TimelineArchivalError::HasArchivedParent( + ancestor_timeline_id, + )); + } + } + break 'outer None; }; if state == TimelineArchivalState::Unarchived { @@ -1422,42 +1463,48 @@ impl Tenant { } } - // Ensure that there are no non-archived child timelines - let children: Vec = timelines - .iter() - .filter_map(|(id, entry)| { - if entry.get_ancestor_timeline_id() != Some(timeline_id) { - return None; - } - if entry.is_archived() == Some(true) { - return None; - } - Some(*id) - }) - .collect(); + if state == TimelineArchivalState::Archived { + // Ensure that there are no non-archived child timelines + let children: Vec = timelines + .iter() + .filter_map(|(id, entry)| { + if entry.get_ancestor_timeline_id() != Some(timeline_id) { + return None; + } + if entry.is_archived() == Some(true) { + return None; + } + Some(*id) + }) + .collect(); - if !children.is_empty() && state == TimelineArchivalState::Archived { - return Err(TimelineArchivalError::HasUnarchivedChildren(children)); + if !children.is_empty() { + return Err(TimelineArchivalError::HasUnarchivedChildren(children)); + } } - Arc::clone(timeline) + Some(Arc::clone(timeline)) }; - let upload_needed = timeline - .remote_client - .schedule_index_upload_for_timeline_archival_state(state)?; - - if upload_needed { - info!("Uploading new state"); - const MAX_WAIT: Duration = Duration::from_secs(10); - let Ok(v) = - tokio::time::timeout(MAX_WAIT, timeline.remote_client.wait_completion()).await - else { - tracing::warn!("reached timeout for waiting on upload queue"); - return Err(TimelineArchivalError::Timeout); - }; - v.map_err(|e| TimelineArchivalError::Other(anyhow::anyhow!(e)))?; + if let Some(timeline) = timeline_or_unarchive_offloaded { + let upload_needed = timeline + .remote_client + .schedule_index_upload_for_timeline_archival_state(state)?; + + if upload_needed { + info!("Uploading new state"); + const MAX_WAIT: Duration = Duration::from_secs(10); + let Ok(v) = + tokio::time::timeout(MAX_WAIT, timeline.remote_client.wait_completion()).await + else { + tracing::warn!("reached timeout for waiting on upload queue"); + return Err(TimelineArchivalError::Timeout); + }; + v.map_err(|e| TimelineArchivalError::Other(anyhow::anyhow!(e)))?; + } + Ok(()) + } else { + Ok(()) } - Ok(()) } pub(crate) fn tenant_shard_id(&self) -> TenantShardId { @@ -1950,6 +1997,7 @@ impl Tenant { has_pending_task |= pending_task_left; if !pending_task_left && *can_offload { offload_timeline(self, timeline) + .instrument(info_span!("offload_timeline", %timeline_id)) .await .map_err(|e| timeline::CompactionError::Other(e))?; } @@ -2861,6 +2909,7 @@ impl Tenant { constructed_at: Instant::now(), timelines: Mutex::new(HashMap::new()), timelines_creating: Mutex::new(HashSet::new()), + timelines_offloaded: Mutex::new(HashMap::new()), gc_cs: tokio::sync::Mutex::new(()), walredo_mgr, remote_storage, diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index ca897ac0c01b..be7546204284 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -137,7 +137,7 @@ async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<( /// It is important that this gets called when DeletionGuard is being held. /// For more context see comments in [`DeleteTimelineFlow::prepare`] -pub(super) async fn remove_timeline_from_tenant( +async fn remove_timeline_from_tenant( tenant: &Tenant, timeline: &Timeline, _: &DeletionGuard, // using it as a witness diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs index 2d7f13f4c31b..201866affe3d 100644 --- a/pageserver/src/tenant/timeline/offload.rs +++ b/pageserver/src/tenant/timeline/offload.rs @@ -1,9 +1,9 @@ use std::sync::Arc; -use crate::tenant::Tenant; +use crate::tenant::{OffloadedTimeline, Tenant}; use super::{ - delete::{delete_local_timeline_directory, remove_timeline_from_tenant, DeleteTimelineFlow}, + delete::{delete_local_timeline_directory, DeleteTimelineFlow, DeletionGuard}, Timeline, }; @@ -11,6 +11,7 @@ pub(crate) async fn offload_timeline( tenant: &Tenant, timeline: &Arc, ) -> anyhow::Result<()> { + tracing::info!("offloading archived timeline"); let (timeline, guard) = DeleteTimelineFlow::prepare(tenant, timeline.timeline_id)?; // TODO extend guard mechanism above with method @@ -22,5 +23,42 @@ pub(crate) async fn offload_timeline( delete_local_timeline_directory(conf, tenant.tenant_shard_id, &timeline).await?; remove_timeline_from_tenant(tenant, &timeline, &guard).await?; + + { + let mut offloaded_timelines = tenant.timelines_offloaded.lock().unwrap(); + offloaded_timelines.insert( + timeline.timeline_id, + OffloadedTimeline::from_timeline(&timeline), + ); + } + + Ok(()) +} + +/// It is important that this gets called when DeletionGuard is being held. +/// For more context see comments in [`DeleteTimelineFlow::prepare`] +async fn remove_timeline_from_tenant( + tenant: &Tenant, + timeline: &Timeline, + _: &DeletionGuard, // using it as a witness +) -> anyhow::Result<()> { + // Remove the timeline from the map. + let mut timelines = tenant.timelines.lock().unwrap(); + let children_exist = timelines + .iter() + .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline.timeline_id)); + // XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`. + // We already deleted the layer files, so it's probably best to panic. + // (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart) + if children_exist { + panic!("Timeline grew children while we removed layer files"); + } + + timelines + .remove(&timeline.timeline_id) + .expect("timeline that we were deleting was concurrently removed from 'timelines' map"); + + drop(timelines); + Ok(()) } From ae2abaee089862ce10040809a41e400c7f76b575 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 13 Sep 2024 12:45:28 +0200 Subject: [PATCH 03/17] Implement unoffloading --- pageserver/src/http/routes.rs | 4 +- pageserver/src/tenant.rs | 97 ++++++++++++++++++++++++++++------- 2 files changed, 81 insertions(+), 20 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 1cc5502bd602..8f62e54f5a1c 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -703,6 +703,8 @@ async fn timeline_archival_config_handler( let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); + let request_data: TimelineArchivalConfigRequest = json_request(&mut request).await?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let state = get_state(&request); @@ -713,7 +715,7 @@ async fn timeline_archival_config_handler( .get_attached_tenant_shard(tenant_shard_id)?; tenant - .apply_timeline_archival_config(timeline_id, request_data.state) + .apply_timeline_archival_config(timeline_id, request_data.state, ctx) .await?; Ok::<_, ApiError>(()) } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 861a27c760eb..ea6b0d6248c1 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1417,9 +1417,10 @@ impl Tenant { } pub(crate) async fn apply_timeline_archival_config( - &self, + self: &Arc, timeline_id: TimelineId, state: TimelineArchivalState, + ctx: RequestContext, ) -> Result<(), TimelineArchivalError> { info!("setting timeline archival config"); let timeline_or_unarchive_offloaded = 'outer: { @@ -1485,26 +1486,84 @@ impl Tenant { Some(Arc::clone(timeline)) }; - if let Some(timeline) = timeline_or_unarchive_offloaded { - let upload_needed = timeline - .remote_client - .schedule_index_upload_for_timeline_archival_state(state)?; - - if upload_needed { - info!("Uploading new state"); - const MAX_WAIT: Duration = Duration::from_secs(10); - let Ok(v) = - tokio::time::timeout(MAX_WAIT, timeline.remote_client.wait_completion()).await - else { - tracing::warn!("reached timeout for waiting on upload queue"); - return Err(TimelineArchivalError::Timeout); - }; - v.map_err(|e| TimelineArchivalError::Other(anyhow::anyhow!(e)))?; - } - Ok(()) + let timeline = if let Some(timeline) = timeline_or_unarchive_offloaded { + timeline } else { - Ok(()) + let cancel = self.cancel.clone(); + let timeline_preload = self.load_timeline_metadata( + timeline_id, + self.remote_storage.clone(), + cancel, + ) + .await; + + let index_part = match timeline_preload.index_part { + Ok(index_part) => { + debug!("remote index part exists for timeline {timeline_id}"); + index_part + } + Err(DownloadError::NotFound) => { + info!(%timeline_id, "index_part not found on remote"); + return Err(TimelineArchivalError::NotFound); + } + Err(e) => { + // Some (possibly ephemeral) error happened during index_part download. + warn!(%timeline_id, "Failed to load index_part from remote storage, failed creation? ({e})"); + return Err(TimelineArchivalError::Other( + anyhow::Error::new(e).context("downloading index_part from remote storage"), + )); + } + }; + let index_part = match index_part { + MaybeDeletedIndexPart::IndexPart(index_part) => index_part, + MaybeDeletedIndexPart::Deleted(_index_part) => { + info!("timeline is deleted according to index_part.json"); + return Err(TimelineArchivalError::NotFound); + } + }; + let remote_metadata = index_part.metadata.clone(); + let timeline_resources = self.build_timeline_resources(timeline_id); + self.load_remote_timeline( + timeline_id, + index_part, + remote_metadata, + timeline_resources, + &ctx, + ) + .await + .with_context(|| { + format!( + "failed to load remote timeline {} for tenant {}", + timeline_id, self.tenant_shard_id + ) + })?; + let timelines = self.timelines.lock().unwrap(); + if let Some(timeline) = timelines.get(&timeline_id) { + Arc::clone(timeline) + } else { + warn!("timeline not available directly after attach"); + return Err(TimelineArchivalError::Other(anyhow::anyhow!( + "timeline not available directly after attach" + ))); + } + }; + + let upload_needed = timeline + .remote_client + .schedule_index_upload_for_timeline_archival_state(state)?; + + if upload_needed { + info!("Uploading new state"); + const MAX_WAIT: Duration = Duration::from_secs(10); + let Ok(v) = + tokio::time::timeout(MAX_WAIT, timeline.remote_client.wait_completion()).await + else { + tracing::warn!("reached timeout for waiting on upload queue"); + return Err(TimelineArchivalError::Timeout); + }; + v.map_err(|e| TimelineArchivalError::Other(anyhow::anyhow!(e)))?; } + Ok(()) } pub(crate) fn tenant_shard_id(&self) -> TenantShardId { From 386ab883a88d0dd9a17edf3652c5e7e11299ce06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 24 Sep 2024 16:32:44 +0200 Subject: [PATCH 04/17] fmt --- pageserver/src/tenant/timeline/delete.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index be7546204284..13b01e910769 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -308,7 +308,10 @@ impl DeleteTimelineFlow { let timeline = match timelines.get(&timeline_id) { Some(t) => t, - None => return Err(DeleteTimelineError::NotFound), + None => { + let offloaded_timelines = tenant.timelines_offloaded.lock().unwrap(); + return Err(DeleteTimelineError::NotFound); + } }; // Ensure that there are no child timelines **attached to that pageserver**, From 1b4212d4e7023692c36fc29781674a32a88311ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 24 Sep 2024 18:05:00 +0200 Subject: [PATCH 05/17] Put OffloadedTimeline into Arc --- pageserver/src/tenant.rs | 2 +- pageserver/src/tenant/timeline/offload.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index ea6b0d6248c1..8c5f67ede088 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -291,7 +291,7 @@ pub struct Tenant { timelines_creating: std::sync::Mutex>, /// Possibly offloaded and archived timelines - timelines_offloaded: Mutex>, + timelines_offloaded: Mutex>>, // This mutex prevents creation of new timelines during GC. // Adding yet another mutex (in addition to `timelines`) is needed because holding diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs index 201866affe3d..8aed7971bb17 100644 --- a/pageserver/src/tenant/timeline/offload.rs +++ b/pageserver/src/tenant/timeline/offload.rs @@ -28,7 +28,7 @@ pub(crate) async fn offload_timeline( let mut offloaded_timelines = tenant.timelines_offloaded.lock().unwrap(); offloaded_timelines.insert( timeline.timeline_id, - OffloadedTimeline::from_timeline(&timeline), + Arc::new(OffloadedTimeline::from_timeline(&timeline)), ); } From 21139c8c57989eee81ad58559fb8127a3568cff8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 30 Sep 2024 22:50:15 +0200 Subject: [PATCH 06/17] Implement offloaded timeline deletion --- pageserver/src/tenant.rs | 54 +++++++++++-- pageserver/src/tenant/gc_block.rs | 4 +- pageserver/src/tenant/timeline/delete.rs | 98 +++++++++++++++-------- pageserver/src/tenant/timeline/offload.rs | 7 +- 4 files changed, 120 insertions(+), 43 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 8c5f67ede088..8c97fc882f7e 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -485,15 +485,60 @@ impl WalRedoManager { } pub struct OffloadedTimeline { + pub tenant_shard_id: TenantShardId, pub timeline_id: TimelineId, pub ancestor_timeline_id: Option, + + // TODO: once we persist offloaded state, make this lazily constructed + pub remote_client: Arc, + + /// Prevent two tasks from deleting the timeline at the same time. If held, the + /// timeline is being deleted. If 'true', the timeline has already been deleted. + pub delete_progress: Arc>, } impl OffloadedTimeline { fn from_timeline(timeline: &Timeline) -> Self { Self { + tenant_shard_id: timeline.tenant_shard_id, timeline_id: timeline.timeline_id, ancestor_timeline_id: timeline.get_ancestor_timeline_id(), + + remote_client: timeline.remote_client.clone(), + delete_progress: timeline.delete_progress.clone(), + } + } +} + +#[derive(Clone)] +pub enum TimelineOrOffloaded { + Timeline(Arc), + Offloaded(Arc), +} + +impl TimelineOrOffloaded { + pub fn tenant_shard_id(&self) -> TenantShardId { + match self { + TimelineOrOffloaded::Timeline(timeline) => timeline.tenant_shard_id, + TimelineOrOffloaded::Offloaded(offloaded) => offloaded.tenant_shard_id, + } + } + pub fn timeline_id(&self) -> TimelineId { + match self { + TimelineOrOffloaded::Timeline(timeline) => timeline.timeline_id, + TimelineOrOffloaded::Offloaded(offloaded) => offloaded.timeline_id, + } + } + pub fn delete_progress(&self) -> &Arc> { + match self { + TimelineOrOffloaded::Timeline(timeline) => &timeline.delete_progress, + TimelineOrOffloaded::Offloaded(offloaded) => &offloaded.delete_progress, + } + } + pub fn remote_client(&self) -> &Arc { + match self { + TimelineOrOffloaded::Timeline(timeline) => &timeline.remote_client, + TimelineOrOffloaded::Offloaded(offloaded) => &offloaded.remote_client, } } } @@ -1490,12 +1535,9 @@ impl Tenant { timeline } else { let cancel = self.cancel.clone(); - let timeline_preload = self.load_timeline_metadata( - timeline_id, - self.remote_storage.clone(), - cancel, - ) - .await; + let timeline_preload = self + .load_timeline_metadata(timeline_id, self.remote_storage.clone(), cancel) + .await; let index_part = match timeline_preload.index_part { Ok(index_part) => { diff --git a/pageserver/src/tenant/gc_block.rs b/pageserver/src/tenant/gc_block.rs index f7a7836a129c..373779ddb882 100644 --- a/pageserver/src/tenant/gc_block.rs +++ b/pageserver/src/tenant/gc_block.rs @@ -141,14 +141,14 @@ impl GcBlock { Ok(()) } - pub(crate) fn before_delete(&self, timeline: &super::Timeline) { + pub(crate) fn before_delete(&self, timeline_id: &super::TimelineId) { let unblocked = { let mut g = self.reasons.lock().unwrap(); if g.is_empty() { return; } - g.remove(&timeline.timeline_id); + g.remove(timeline_id); BlockingReasons::clean_and_summarize(g).is_none() }; diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 13b01e910769..6865196df745 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -15,7 +15,7 @@ use crate::{ tenant::{ metadata::TimelineMetadata, remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient}, - CreateTimelineCause, DeleteTimelineError, Tenant, + CreateTimelineCause, DeleteTimelineError, Tenant, TimelineOrOffloaded, }, }; @@ -24,12 +24,14 @@ use super::{Timeline, TimelineResources}; /// Mark timeline as deleted in S3 so we won't pick it up next time /// during attach or pageserver restart. /// See comment in persist_index_part_with_deleted_flag. -async fn set_deleted_in_remote_index(timeline: &Timeline) -> Result<(), DeleteTimelineError> { - match timeline - .remote_client +async fn set_deleted_in_remote_index( + timeline: &TimelineOrOffloaded, +) -> Result<(), DeleteTimelineError> { + let res = timeline + .remote_client() .persist_index_part_with_deleted_flag() - .await - { + .await; + match res { // If we (now, or already) marked it successfully as deleted, we can proceed Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (), // Bail out otherwise @@ -127,9 +129,9 @@ pub(super) async fn delete_local_timeline_directory( } /// Removes remote layers and an index file after them. -async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<()> { +async fn delete_remote_layers_and_index(timeline: &TimelineOrOffloaded) -> anyhow::Result<()> { timeline - .remote_client + .remote_client() .delete_all() .await .context("delete_all") @@ -137,27 +139,41 @@ async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<( /// It is important that this gets called when DeletionGuard is being held. /// For more context see comments in [`DeleteTimelineFlow::prepare`] -async fn remove_timeline_from_tenant( +async fn remove_maybe_offloaded_timeline_from_tenant( tenant: &Tenant, - timeline: &Timeline, + timeline: &TimelineOrOffloaded, _: &DeletionGuard, // using it as a witness ) -> anyhow::Result<()> { // Remove the timeline from the map. + // This observes the locking order between timelines and timelines_offloaded let mut timelines = tenant.timelines.lock().unwrap(); + let mut timelines_offloaded = tenant.timelines_offloaded.lock().unwrap(); + let offloaded_children_exist = timelines_offloaded + .iter() + .any(|(_, entry)| &entry.ancestor_timeline_id == &Some(timeline.timeline_id())); let children_exist = timelines .iter() - .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline.timeline_id)); - // XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`. - // We already deleted the layer files, so it's probably best to panic. - // (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart) - if children_exist { + .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline.timeline_id())); + // XXX this can happen because of race conditions with branch creation. + // We already deleted the remote layer files, so it's probably best to panic. + if children_exist || offloaded_children_exist { panic!("Timeline grew children while we removed layer files"); } - timelines - .remove(&timeline.timeline_id) - .expect("timeline that we were deleting was concurrently removed from 'timelines' map"); + match timeline { + TimelineOrOffloaded::Timeline(timeline) => { + timelines.remove(&timeline.timeline_id).expect( + "timeline that we were deleting was concurrently removed from 'timelines' map", + ); + } + TimelineOrOffloaded::Offloaded(timeline) => { + timelines_offloaded + .remove(&timeline.timeline_id) + .expect("timeline that we were deleting was concurrently removed from 'timelines_offloaded' map"); + } + } + drop(timelines_offloaded); drop(timelines); Ok(()) @@ -207,9 +223,11 @@ impl DeleteTimelineFlow { guard.mark_in_progress()?; // Now that the Timeline is in Stopping state, request all the related tasks to shut down. - timeline.shutdown(super::ShutdownMode::Hard).await; + if let TimelineOrOffloaded::Timeline(timeline) = &timeline { + timeline.shutdown(super::ShutdownMode::Hard).await; + } - tenant.gc_block.before_delete(&timeline); + tenant.gc_block.before_delete(&timeline.timeline_id()); fail::fail_point!("timeline-delete-before-index-deleted-at", |_| { Err(anyhow::anyhow!( @@ -285,6 +303,7 @@ impl DeleteTimelineFlow { guard.mark_in_progress()?; + let timeline = TimelineOrOffloaded::Timeline(timeline); Self::schedule_background(guard, tenant.conf, tenant, timeline); Ok(()) @@ -293,7 +312,7 @@ impl DeleteTimelineFlow { pub(super) fn prepare( tenant: &Tenant, timeline_id: TimelineId, - ) -> Result<(Arc, DeletionGuard), DeleteTimelineError> { + ) -> Result<(TimelineOrOffloaded, DeletionGuard), DeleteTimelineError> { // Note the interaction between this guard and deletion guard. // Here we attempt to lock deletion guard when we're holding a lock on timelines. // This is important because when you take into account `remove_timeline_from_tenant` @@ -307,10 +326,13 @@ impl DeleteTimelineFlow { let timelines = tenant.timelines.lock().unwrap(); let timeline = match timelines.get(&timeline_id) { - Some(t) => t, + Some(t) => TimelineOrOffloaded::Timeline(Arc::clone(t)), None => { let offloaded_timelines = tenant.timelines_offloaded.lock().unwrap(); - return Err(DeleteTimelineError::NotFound); + match offloaded_timelines.get(&timeline_id) { + Some(t) => TimelineOrOffloaded::Offloaded(Arc::clone(t)), + None => return Err(DeleteTimelineError::NotFound), + } } }; @@ -337,30 +359,32 @@ impl DeleteTimelineFlow { // to remove the timeline from it. // Always if you have two locks that are taken in different order this can result in a deadlock. - let delete_progress = Arc::clone(&timeline.delete_progress); + let delete_progress = Arc::clone(timeline.delete_progress()); let delete_lock_guard = match delete_progress.try_lock_owned() { Ok(guard) => DeletionGuard(guard), Err(_) => { // Unfortunately if lock fails arc is consumed. return Err(DeleteTimelineError::AlreadyInProgress(Arc::clone( - &timeline.delete_progress, + timeline.delete_progress(), ))); } }; - timeline.set_state(TimelineState::Stopping); + if let TimelineOrOffloaded::Timeline(timeline) = &timeline { + timeline.set_state(TimelineState::Stopping); + } - Ok((Arc::clone(timeline), delete_lock_guard)) + Ok((timeline, delete_lock_guard)) } fn schedule_background( guard: DeletionGuard, conf: &'static PageServerConf, tenant: Arc, - timeline: Arc, + timeline: TimelineOrOffloaded, ) { - let tenant_shard_id = timeline.tenant_shard_id; - let timeline_id = timeline.timeline_id; + let tenant_shard_id = timeline.tenant_shard_id(); + let timeline_id = timeline.timeline_id(); task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), @@ -371,7 +395,9 @@ impl DeleteTimelineFlow { async move { if let Err(err) = Self::background(guard, conf, &tenant, &timeline).await { error!("Error: {err:#}"); - timeline.set_broken(format!("{err:#}")) + if let TimelineOrOffloaded::Timeline(timeline) = timeline { + timeline.set_broken(format!("{err:#}")) + } }; Ok(()) } @@ -383,15 +409,19 @@ impl DeleteTimelineFlow { mut guard: DeletionGuard, conf: &PageServerConf, tenant: &Tenant, - timeline: &Timeline, + timeline: &TimelineOrOffloaded, ) -> Result<(), DeleteTimelineError> { - delete_local_timeline_directory(conf, tenant.tenant_shard_id, timeline).await?; + // Offloaded timelines have no local state + // TODO: once we persist offloaded information, delete the timeline from there, too + if let TimelineOrOffloaded::Timeline(timeline) = timeline { + delete_local_timeline_directory(conf, tenant.tenant_shard_id, timeline).await?; + } delete_remote_layers_and_index(timeline).await?; pausable_failpoint!("in_progress_delete"); - remove_timeline_from_tenant(tenant, timeline, &guard).await?; + remove_maybe_offloaded_timeline_from_tenant(tenant, timeline, &guard).await?; *guard = Self::Finished; diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs index 8aed7971bb17..fb906d906b41 100644 --- a/pageserver/src/tenant/timeline/offload.rs +++ b/pageserver/src/tenant/timeline/offload.rs @@ -1,6 +1,6 @@ use std::sync::Arc; -use crate::tenant::{OffloadedTimeline, Tenant}; +use crate::tenant::{OffloadedTimeline, Tenant, TimelineOrOffloaded}; use super::{ delete::{delete_local_timeline_directory, DeleteTimelineFlow, DeletionGuard}, @@ -14,6 +14,11 @@ pub(crate) async fn offload_timeline( tracing::info!("offloading archived timeline"); let (timeline, guard) = DeleteTimelineFlow::prepare(tenant, timeline.timeline_id)?; + let TimelineOrOffloaded::Timeline(timeline) = timeline else { + tracing::error!("timeline already offloaded, but given timeline object"); + return Ok(()); + }; + // TODO extend guard mechanism above with method // to make deletions possible while offloading is in progress From 8505c8223f880364ff9da5b37f9902b9d7135b0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 1 Oct 2024 15:06:54 +0200 Subject: [PATCH 07/17] clippy --- pageserver/src/tenant.rs | 2 +- pageserver/src/tenant/timeline/delete.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 8c97fc882f7e..6f983ad60eae 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2100,7 +2100,7 @@ impl Tenant { offload_timeline(self, timeline) .instrument(info_span!("offload_timeline", %timeline_id)) .await - .map_err(|e| timeline::CompactionError::Other(e))?; + .map_err(timeline::CompactionError::Other)?; } } diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 6865196df745..305c5758ccf6 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -150,7 +150,7 @@ async fn remove_maybe_offloaded_timeline_from_tenant( let mut timelines_offloaded = tenant.timelines_offloaded.lock().unwrap(); let offloaded_children_exist = timelines_offloaded .iter() - .any(|(_, entry)| &entry.ancestor_timeline_id == &Some(timeline.timeline_id())); + .any(|(_, entry)| entry.ancestor_timeline_id == Some(timeline.timeline_id())); let children_exist = timelines .iter() .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline.timeline_id())); From 83840afcd04e72d98c7304b613e4e9afa1c9ed3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 3 Oct 2024 02:44:22 +0200 Subject: [PATCH 08/17] Review comments --- pageserver/src/tenant.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 6f983ad60eae..7f6c061f62b0 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -291,6 +291,7 @@ pub struct Tenant { timelines_creating: std::sync::Mutex>, /// Possibly offloaded and archived timelines + /// **Lock order**: if acquring both, acquire`timelines` before `timelines_offloaded` timelines_offloaded: Mutex>>, // This mutex prevents creation of new timelines during GC. @@ -1488,6 +1489,7 @@ impl Tenant { true } else { error!("ancestor timeline {ancestor_timeline_id} not found"); + debug_assert!(false); return Err(TimelineArchivalError::NotFound); }; if has_archived_parent { @@ -1545,7 +1547,7 @@ impl Tenant { index_part } Err(DownloadError::NotFound) => { - info!(%timeline_id, "index_part not found on remote"); + error!(%timeline_id, "index_part not found on remote"); return Err(TimelineArchivalError::NotFound); } Err(e) => { From 199033539b3b8047121bb191c9cbc65355abf6c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 3 Oct 2024 02:50:05 +0200 Subject: [PATCH 09/17] new_state --- pageserver/src/tenant.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 7f6c061f62b0..6de803a1ca36 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1465,7 +1465,7 @@ impl Tenant { pub(crate) async fn apply_timeline_archival_config( self: &Arc, timeline_id: TimelineId, - state: TimelineArchivalState, + new_state: TimelineArchivalState, ctx: RequestContext, ) -> Result<(), TimelineArchivalError> { info!("setting timeline archival config"); @@ -1477,7 +1477,7 @@ impl Tenant { let Some(offloaded) = offloaded_timelines.get(&timeline_id) else { return Err(TimelineArchivalError::NotFound); }; - if state == TimelineArchivalState::Archived { + if new_state == TimelineArchivalState::Archived { // It's offloaded, so nothing to do for archival return Ok(()); } @@ -1501,7 +1501,7 @@ impl Tenant { break 'outer None; }; - if state == TimelineArchivalState::Unarchived { + if new_state == TimelineArchivalState::Unarchived { if let Some(ancestor_timeline) = timeline.ancestor_timeline() { if ancestor_timeline.is_archived() == Some(true) { return Err(TimelineArchivalError::HasArchivedParent( @@ -1511,7 +1511,7 @@ impl Tenant { } } - if state == TimelineArchivalState::Archived { + if new_state == TimelineArchivalState::Archived { // Ensure that there are no non-archived child timelines let children: Vec = timelines .iter() @@ -1594,7 +1594,7 @@ impl Tenant { let upload_needed = timeline .remote_client - .schedule_index_upload_for_timeline_archival_state(state)?; + .schedule_index_upload_for_timeline_archival_state(new_state)?; if upload_needed { info!("Uploading new state"); From 6063debebc0aaedc960152aab9f91121d00dc018 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 4 Oct 2024 16:23:02 +0200 Subject: [PATCH 10/17] Remove timeline from offloaded ones if unarchiving --- pageserver/src/tenant.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 6de803a1ca36..a2667fd6b2f1 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1583,6 +1583,10 @@ impl Tenant { })?; let timelines = self.timelines.lock().unwrap(); if let Some(timeline) = timelines.get(&timeline_id) { + let mut offloaded_timelines = self.timelines_offloaded.lock().unwrap(); + if offloaded_timelines.remove(&timeline_id).is_none() { + warn!("timeline already removed from offloaded timelines"); + } Arc::clone(timeline) } else { warn!("timeline not available directly after attach"); From 54801366c06922091120d4dd4ac20cd81c24fc9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 4 Oct 2024 16:30:26 +0200 Subject: [PATCH 11/17] Review comments --- pageserver/src/tenant.rs | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index a2667fd6b2f1..d9bf332d7111 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1536,6 +1536,7 @@ impl Tenant { let timeline = if let Some(timeline) = timeline_or_unarchive_offloaded { timeline } else { + // Unarchive an offloaded timeline let cancel = self.cancel.clone(); let timeline_preload = self .load_timeline_metadata(timeline_id, self.remote_storage.clone(), cancel) @@ -2059,11 +2060,12 @@ impl Tenant { .iter() .filter_map(|(timeline_id, timeline)| { let (is_active, can_offload) = (timeline.is_active(), timeline.can_offload()); - let can_offload = can_offload && { + let has_no_unarchived_children = { !timelines .iter() .any(|(_id, tl)| tl.get_ancestor_timeline_id() == Some(*timeline_id)) }; + let can_offload = can_offload && has_no_unarchived_children; if (is_active, can_offload) == (false, false) { None } else { @@ -2085,24 +2087,26 @@ impl Tenant { for (timeline_id, timeline, (can_compact, can_offload)) in &timelines_to_compact_or_offload { let pending_task_left = if *can_compact { - timeline - .compact(cancel, EnumSet::empty(), ctx) - .instrument(info_span!("compact_timeline", %timeline_id)) - .await - .inspect_err(|e| match e { - timeline::CompactionError::ShuttingDown => (), - timeline::CompactionError::Other(e) => { - self.compaction_circuit_breaker - .lock() - .unwrap() - .fail(&CIRCUIT_BREAKERS_BROKEN, e); - } - })? + Some( + timeline + .compact(cancel, EnumSet::empty(), ctx) + .instrument(info_span!("compact_timeline", %timeline_id)) + .await + .inspect_err(|e| match e { + timeline::CompactionError::ShuttingDown => (), + timeline::CompactionError::Other(e) => { + self.compaction_circuit_breaker + .lock() + .unwrap() + .fail(&CIRCUIT_BREAKERS_BROKEN, e); + } + })?, + ) } else { - false + None }; - has_pending_task |= pending_task_left; - if !pending_task_left && *can_offload { + has_pending_task |= pending_task_left.unwrap_or(false); + if pending_task_left == Some(false) && *can_offload { offload_timeline(self, timeline) .instrument(info_span!("offload_timeline", %timeline_id)) .await From 38f33a6c6029975f92f3c1b18b943f755c46b2d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 4 Oct 2024 16:52:17 +0200 Subject: [PATCH 12/17] Move out some components of apply_timeline_archival_config --- pageserver/src/tenant.rs | 114 +++++++++++++++++++++++++-------------- 1 file changed, 74 insertions(+), 40 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index d9bf332d7111..0785defe60de 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1462,6 +1462,68 @@ impl Tenant { } } + fn check_to_be_archived_has_no_unarchived_children( + timeline_id: TimelineId, + timelines: &std::sync::MutexGuard<'_, HashMap>>, + ) -> Result<(), TimelineArchivalError> { + let children: Vec = timelines + .iter() + .filter_map(|(id, entry)| { + if entry.get_ancestor_timeline_id() != Some(timeline_id) { + return None; + } + if entry.is_archived() == Some(true) { + return None; + } + Some(*id) + }) + .collect(); + + if !children.is_empty() { + return Err(TimelineArchivalError::HasUnarchivedChildren(children)); + } + Ok(()) + } + + fn check_ancestor_of_to_be_unarchived_is_not_archived( + timeline_id: TimelineId, + ancestor_timeline_id: TimelineId, + timelines: &std::sync::MutexGuard<'_, HashMap>>, + offloaded_timelines: &std::sync::MutexGuard< + '_, + HashMap>, + >, + ) -> Result<(), TimelineArchivalError> { + let has_archived_parent = if let Some(ancestor_timeline) = timelines.get(&timeline_id) { + ancestor_timeline.is_archived() == Some(true) + } else if offloaded_timelines.contains_key(&ancestor_timeline_id) { + true + } else { + error!("ancestor timeline {ancestor_timeline_id} not found"); + debug_assert!(false); + return Err(TimelineArchivalError::NotFound); + }; + if has_archived_parent { + return Err(TimelineArchivalError::HasArchivedParent( + ancestor_timeline_id, + )); + } + Ok(()) + } + + fn check_to_be_unarchived_timeline_has_no_archived_parent( + timeline: &Arc, + ) -> Result<(), TimelineArchivalError> { + if let Some(ancestor_timeline) = timeline.ancestor_timeline() { + if ancestor_timeline.is_archived() == Some(true) { + return Err(TimelineArchivalError::HasArchivedParent( + ancestor_timeline.timeline_id, + )); + } + } + Ok(()) + } + pub(crate) async fn apply_timeline_archival_config( self: &Arc, timeline_id: TimelineId, @@ -1469,6 +1531,7 @@ impl Tenant { ctx: RequestContext, ) -> Result<(), TimelineArchivalError> { info!("setting timeline archival config"); + // First part: figure out what is needed to do, and do validation let timeline_or_unarchive_offloaded = 'outer: { let timelines = self.timelines.lock().unwrap(); @@ -1478,61 +1541,31 @@ impl Tenant { return Err(TimelineArchivalError::NotFound); }; if new_state == TimelineArchivalState::Archived { - // It's offloaded, so nothing to do for archival + // It's offloaded already, so nothing to do return Ok(()); } if let Some(ancestor_timeline_id) = offloaded.ancestor_timeline_id { - let has_archived_parent = - if let Some(ancestor_timeline) = timelines.get(&timeline_id) { - ancestor_timeline.is_archived() == Some(true) - } else if offloaded_timelines.contains_key(&ancestor_timeline_id) { - true - } else { - error!("ancestor timeline {ancestor_timeline_id} not found"); - debug_assert!(false); - return Err(TimelineArchivalError::NotFound); - }; - if has_archived_parent { - return Err(TimelineArchivalError::HasArchivedParent( - ancestor_timeline_id, - )); - } + Self::check_ancestor_of_to_be_unarchived_is_not_archived( + timeline_id, + ancestor_timeline_id, + &timelines, + &offloaded_timelines, + )?; } break 'outer None; }; if new_state == TimelineArchivalState::Unarchived { - if let Some(ancestor_timeline) = timeline.ancestor_timeline() { - if ancestor_timeline.is_archived() == Some(true) { - return Err(TimelineArchivalError::HasArchivedParent( - ancestor_timeline.timeline_id, - )); - } - } + Self::check_to_be_unarchived_timeline_has_no_archived_parent(&timeline)?; } if new_state == TimelineArchivalState::Archived { - // Ensure that there are no non-archived child timelines - let children: Vec = timelines - .iter() - .filter_map(|(id, entry)| { - if entry.get_ancestor_timeline_id() != Some(timeline_id) { - return None; - } - if entry.is_archived() == Some(true) { - return None; - } - Some(*id) - }) - .collect(); - - if !children.is_empty() { - return Err(TimelineArchivalError::HasUnarchivedChildren(children)); - } + Self::check_to_be_archived_has_no_unarchived_children(timeline_id, &timelines)?; } Some(Arc::clone(timeline)) }; + // Second part: unarchive timeline (if needed) let timeline = if let Some(timeline) = timeline_or_unarchive_offloaded { timeline } else { @@ -1597,6 +1630,7 @@ impl Tenant { } }; + // Third part: upload new timeline archival state and block until it is present in S3 let upload_needed = timeline .remote_client .schedule_index_upload_for_timeline_archival_state(new_state)?; From 48ea9210ab746bc02fc76af5b0815957c17040b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 4 Oct 2024 17:01:11 +0200 Subject: [PATCH 13/17] Move unoffloading into separate function as well --- pageserver/src/tenant.rs | 143 +++++++++++++++++++++------------------ 1 file changed, 78 insertions(+), 65 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 0785defe60de..94892bb3bf21 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1524,6 +1524,72 @@ impl Tenant { Ok(()) } + /// Loads the specified (offloaded) timeline from S3 and attaches it as a loaded timeline + async fn unoffload_timeline( + self: &Arc, + timeline_id: TimelineId, + ctx: RequestContext, + ) -> Result, TimelineArchivalError> { + let cancel = self.cancel.clone(); + let timeline_preload = self + .load_timeline_metadata(timeline_id, self.remote_storage.clone(), cancel) + .await; + + let index_part = match timeline_preload.index_part { + Ok(index_part) => { + debug!("remote index part exists for timeline {timeline_id}"); + index_part + } + Err(DownloadError::NotFound) => { + error!(%timeline_id, "index_part not found on remote"); + return Err(TimelineArchivalError::NotFound); + } + Err(e) => { + // Some (possibly ephemeral) error happened during index_part download. + warn!(%timeline_id, "Failed to load index_part from remote storage, failed creation? ({e})"); + return Err(TimelineArchivalError::Other( + anyhow::Error::new(e).context("downloading index_part from remote storage"), + )); + } + }; + let index_part = match index_part { + MaybeDeletedIndexPart::IndexPart(index_part) => index_part, + MaybeDeletedIndexPart::Deleted(_index_part) => { + info!("timeline is deleted according to index_part.json"); + return Err(TimelineArchivalError::NotFound); + } + }; + let remote_metadata = index_part.metadata.clone(); + let timeline_resources = self.build_timeline_resources(timeline_id); + self.load_remote_timeline( + timeline_id, + index_part, + remote_metadata, + timeline_resources, + &ctx, + ) + .await + .with_context(|| { + format!( + "failed to load remote timeline {} for tenant {}", + timeline_id, self.tenant_shard_id + ) + })?; + let timelines = self.timelines.lock().unwrap(); + if let Some(timeline) = timelines.get(&timeline_id) { + let mut offloaded_timelines = self.timelines_offloaded.lock().unwrap(); + if offloaded_timelines.remove(&timeline_id).is_none() { + warn!("timeline already removed from offloaded timelines"); + } + Ok(Arc::clone(timeline)) + } else { + warn!("timeline not available directly after attach"); + Err(TimelineArchivalError::Other(anyhow::anyhow!( + "timeline not available directly after attach" + ))) + } + } + pub(crate) async fn apply_timeline_archival_config( self: &Arc, timeline_id: TimelineId, @@ -1555,12 +1621,16 @@ impl Tenant { break 'outer None; }; - if new_state == TimelineArchivalState::Unarchived { - Self::check_to_be_unarchived_timeline_has_no_archived_parent(&timeline)?; - } - - if new_state == TimelineArchivalState::Archived { - Self::check_to_be_archived_has_no_unarchived_children(timeline_id, &timelines)?; + // Do some validation. We release the timelines lock below, so there is potential + // for race conditions: these checks are more present to prevent misunderstandings of + // the API's capabilities, instead of serving as the sole way to defend their invariants. + match new_state { + TimelineArchivalState::Unarchived => { + Self::check_to_be_unarchived_timeline_has_no_archived_parent(&timeline)? + } + TimelineArchivalState::Archived => { + Self::check_to_be_archived_has_no_unarchived_children(timeline_id, &timelines)? + } } Some(Arc::clone(timeline)) }; @@ -1569,65 +1639,8 @@ impl Tenant { let timeline = if let Some(timeline) = timeline_or_unarchive_offloaded { timeline } else { - // Unarchive an offloaded timeline - let cancel = self.cancel.clone(); - let timeline_preload = self - .load_timeline_metadata(timeline_id, self.remote_storage.clone(), cancel) - .await; - - let index_part = match timeline_preload.index_part { - Ok(index_part) => { - debug!("remote index part exists for timeline {timeline_id}"); - index_part - } - Err(DownloadError::NotFound) => { - error!(%timeline_id, "index_part not found on remote"); - return Err(TimelineArchivalError::NotFound); - } - Err(e) => { - // Some (possibly ephemeral) error happened during index_part download. - warn!(%timeline_id, "Failed to load index_part from remote storage, failed creation? ({e})"); - return Err(TimelineArchivalError::Other( - anyhow::Error::new(e).context("downloading index_part from remote storage"), - )); - } - }; - let index_part = match index_part { - MaybeDeletedIndexPart::IndexPart(index_part) => index_part, - MaybeDeletedIndexPart::Deleted(_index_part) => { - info!("timeline is deleted according to index_part.json"); - return Err(TimelineArchivalError::NotFound); - } - }; - let remote_metadata = index_part.metadata.clone(); - let timeline_resources = self.build_timeline_resources(timeline_id); - self.load_remote_timeline( - timeline_id, - index_part, - remote_metadata, - timeline_resources, - &ctx, - ) - .await - .with_context(|| { - format!( - "failed to load remote timeline {} for tenant {}", - timeline_id, self.tenant_shard_id - ) - })?; - let timelines = self.timelines.lock().unwrap(); - if let Some(timeline) = timelines.get(&timeline_id) { - let mut offloaded_timelines = self.timelines_offloaded.lock().unwrap(); - if offloaded_timelines.remove(&timeline_id).is_none() { - warn!("timeline already removed from offloaded timelines"); - } - Arc::clone(timeline) - } else { - warn!("timeline not available directly after attach"); - return Err(TimelineArchivalError::Other(anyhow::anyhow!( - "timeline not available directly after attach" - ))); - } + // Turn offloaded timeline into a non-offloaded one + self.unoffload_timeline(timeline_id, ctx).await? }; // Third part: upload new timeline archival state and block until it is present in S3 From dd2300b99db5059304c2aa815ea04d8f68e4251e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 7 Oct 2024 17:44:23 +0200 Subject: [PATCH 14/17] typo --- pageserver/src/tenant.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 94892bb3bf21..241dc2e6b600 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -287,11 +287,11 @@ pub struct Tenant { /// During timeline creation, we first insert the TimelineId to the /// creating map, then `timelines`, then remove it from the creating map. - /// **Lock order**: if acquring both, acquire`timelines` before `timelines_creating` + /// **Lock order**: if acquiring both, acquire`timelines` before `timelines_creating` timelines_creating: std::sync::Mutex>, /// Possibly offloaded and archived timelines - /// **Lock order**: if acquring both, acquire`timelines` before `timelines_offloaded` + /// **Lock order**: if acquiring both, acquire`timelines` before `timelines_offloaded` timelines_offloaded: Mutex>>, // This mutex prevents creation of new timelines during GC. From a6aa996e17e8249719041a90972022b7420a0d3e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 7 Oct 2024 17:56:45 +0200 Subject: [PATCH 15/17] fixes --- pageserver/src/tenant.rs | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 241dc2e6b600..930691b4bb6b 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1486,7 +1486,6 @@ impl Tenant { } fn check_ancestor_of_to_be_unarchived_is_not_archived( - timeline_id: TimelineId, ancestor_timeline_id: TimelineId, timelines: &std::sync::MutexGuard<'_, HashMap>>, offloaded_timelines: &std::sync::MutexGuard< @@ -1494,15 +1493,18 @@ impl Tenant { HashMap>, >, ) -> Result<(), TimelineArchivalError> { - let has_archived_parent = if let Some(ancestor_timeline) = timelines.get(&timeline_id) { - ancestor_timeline.is_archived() == Some(true) - } else if offloaded_timelines.contains_key(&ancestor_timeline_id) { - true - } else { - error!("ancestor timeline {ancestor_timeline_id} not found"); - debug_assert!(false); - return Err(TimelineArchivalError::NotFound); - }; + let has_archived_parent = + if let Some(ancestor_timeline) = timelines.get(&ancestor_timeline_id) { + ancestor_timeline.is_archived() == Some(true) + } else if offloaded_timelines.contains_key(&ancestor_timeline_id) { + true + } else { + error!("ancestor timeline {ancestor_timeline_id} not found"); + if cfg!(debug_assertions) { + panic!("ancestor timeline {ancestor_timeline_id} not found"); + } + return Err(TimelineArchivalError::NotFound); + }; if has_archived_parent { return Err(TimelineArchivalError::HasArchivedParent( ancestor_timeline_id, @@ -1612,7 +1614,6 @@ impl Tenant { } if let Some(ancestor_timeline_id) = offloaded.ancestor_timeline_id { Self::check_ancestor_of_to_be_unarchived_is_not_archived( - timeline_id, ancestor_timeline_id, &timelines, &offloaded_timelines, From 31f8bc6fa22a04af9deb7ab474cf11ceed15f4fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 7 Oct 2024 18:47:39 +0200 Subject: [PATCH 16/17] rename --- pageserver/src/tenant.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 930691b4bb6b..c8162690943e 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2108,12 +2108,12 @@ impl Tenant { .iter() .filter_map(|(timeline_id, timeline)| { let (is_active, can_offload) = (timeline.is_active(), timeline.can_offload()); - let has_no_unarchived_children = { + let has_no_unoffloaded_children = { !timelines .iter() .any(|(_id, tl)| tl.get_ancestor_timeline_id() == Some(*timeline_id)) }; - let can_offload = can_offload && has_no_unarchived_children; + let can_offload = can_offload && has_no_unoffloaded_children; if (is_active, can_offload) == (false, false) { None } else { From 1665c67046e2746e738e59f8501763496a3873a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 8 Oct 2024 01:00:45 +0200 Subject: [PATCH 17/17] clippy --- pageserver/src/tenant.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index c8162690943e..082791a9b281 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1627,7 +1627,7 @@ impl Tenant { // the API's capabilities, instead of serving as the sole way to defend their invariants. match new_state { TimelineArchivalState::Unarchived => { - Self::check_to_be_unarchived_timeline_has_no_archived_parent(&timeline)? + Self::check_to_be_unarchived_timeline_has_no_archived_parent(timeline)? } TimelineArchivalState::Archived => { Self::check_to_be_archived_has_no_unarchived_children(timeline_id, &timelines)?