From 7b18e33997b861ce92bce9192007f87ab45708a9 Mon Sep 17 00:00:00 2001
From: Erik Grinaker
Date: Wed, 4 Dec 2024 13:53:52 +0100
Subject: [PATCH] pageserver: return proper status code for heatmap_upload
 errors (#9991)

## Problem

During deploys, we see a lot of 500 errors due to heatmap uploads for
inactive tenants. These should be 503s instead.

Resolves #9574.

## Summary of changes

Make the secondary tenant scheduler use `ApiError` rather than
`anyhow::Error`, to propagate the tenant error and convert it to an
appropriate status code. (A standalone sketch of the resulting
error-to-status mapping follows the diff.)
---
 pageserver/src/http/routes.rs                 | 28 ++++++++++++----
 pageserver/src/tenant.rs                      |  2 +-
 pageserver/src/tenant/mgr.rs                  |  5 ++-
 pageserver/src/tenant/secondary.rs            | 33 +++++++++++++++----
 pageserver/src/tenant/secondary/downloader.rs | 13 ++++----
 .../src/tenant/secondary/heatmap_uploader.rs  | 10 +++---
 pageserver/src/tenant/secondary/scheduler.rs  |  4 +--
 7 files changed, 68 insertions(+), 27 deletions(-)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index e127871549ea..e04f1460a8f2 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -279,7 +279,10 @@ impl From for ApiError {
 impl From<GetTenantError> for ApiError {
     fn from(tse: GetTenantError) -> ApiError {
         match tse {
-            GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
+            GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {tid}").into()),
+            GetTenantError::ShardNotFound(tid) => {
+                ApiError::NotFound(anyhow!("tenant {tid}").into())
+            }
             GetTenantError::NotActive(_) => {
                 // Why is this not `ApiError::NotFound`?
                 // Because we must be careful to never return 404 for a tenant if it does
@@ -387,6 +390,16 @@ impl From for ApiError {
     }
 }
 
+impl From<crate::tenant::secondary::SecondaryTenantError> for ApiError {
+    fn from(ste: crate::tenant::secondary::SecondaryTenantError) -> ApiError {
+        use crate::tenant::secondary::SecondaryTenantError;
+        match ste {
+            SecondaryTenantError::GetTenant(gte) => gte.into(),
+            SecondaryTenantError::ShuttingDown => ApiError::ShuttingDown,
+        }
+    }
+}
+
 // Helper function to construct a TimelineInfo struct for a timeline
 async fn build_timeline_info(
     timeline: &Arc<Timeline>,
@@ -1047,9 +1060,11 @@ async fn timeline_delete_handler(
             match e {
                 // GetTenantError has a built-in conversion to ApiError, but in this context we don't
                 // want to treat missing tenants as 404, to avoid ambiguity with successful deletions.
-                GetTenantError::NotFound(_) => ApiError::PreconditionFailed(
-                    "Requested tenant is missing".to_string().into_boxed_str(),
-                ),
+                GetTenantError::NotFound(_) | GetTenantError::ShardNotFound(_) => {
+                    ApiError::PreconditionFailed(
+                        "Requested tenant is missing".to_string().into_boxed_str(),
+                    )
+                }
                 e => e.into(),
             }
         })?;
@@ -2462,8 +2477,7 @@ async fn secondary_upload_handler(
     state
         .secondary_controller
         .upload_tenant(tenant_shard_id)
-        .await
-        .map_err(ApiError::InternalServerError)?;
+        .await?;
 
     json_response(StatusCode::OK, ())
 }
@@ -2578,7 +2592,7 @@ async fn secondary_download_handler(
         // Edge case: downloads aren't usually fallible: things like a missing heatmap are considered
         // okay. We could get an error here in the unlikely edge case that the tenant
         // was detached between our check above and executing the download job.
-        Ok(Err(e)) => return Err(ApiError::InternalServerError(e)),
+        Ok(Err(e)) => return Err(e.into()),
         // A timeout is not an error: we have started the download, we're just not done
        // yet. The caller will get a response body indicating status.
         Err(_) => StatusCode::ACCEPTED,
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index ada5c4a97705..5a9e398586f6 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3422,7 +3422,7 @@ impl Tenant {
                 r.map_err(
                     |_e: tokio::sync::watch::error::RecvError|
                         // Tenant existed but was dropped: report it as non-existent
-                        GetActiveTenantError::NotFound(GetTenantError::NotFound(self.tenant_shard_id.tenant_id))
+                        GetActiveTenantError::NotFound(GetTenantError::ShardNotFound(self.tenant_shard_id))
                 )?
             }
             Err(TimeoutCancellableError::Cancelled) => {
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 45481c4ed44e..e8b0d1d4dd64 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -894,7 +894,7 @@ impl TenantManager {
             Some(TenantSlot::Attached(tenant)) => Ok(Arc::clone(tenant)),
             Some(TenantSlot::InProgress(_)) => Err(GetTenantError::NotActive(tenant_shard_id)),
             None | Some(TenantSlot::Secondary(_)) => {
-                Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
+                Err(GetTenantError::ShardNotFound(tenant_shard_id))
             }
         }
     }
@@ -2258,6 +2258,9 @@ pub(crate) enum GetTenantError {
     #[error("Tenant {0} not found")]
     NotFound(TenantId),
 
+    #[error("Tenant {0} not found")]
+    ShardNotFound(TenantShardId),
+
     #[error("Tenant {0} is not active")]
     NotActive(TenantShardId),
 
diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs
index 3df89a928cb2..4bc208331b35 100644
--- a/pageserver/src/tenant/secondary.rs
+++ b/pageserver/src/tenant/secondary.rs
@@ -22,6 +22,7 @@ use super::{
     mgr::TenantManager,
     span::debug_assert_current_span_has_tenant_id,
     storage_layer::LayerName,
+    GetTenantError,
 };
 
 use crate::metrics::SECONDARY_RESIDENT_PHYSICAL_SIZE;
@@ -66,7 +67,21 @@ struct CommandRequest<T> {
 }
 
 struct CommandResponse {
-    result: anyhow::Result<()>,
+    result: Result<(), SecondaryTenantError>,
+}
+
+#[derive(thiserror::Error, Debug)]
+pub(crate) enum SecondaryTenantError {
+    #[error("{0}")]
+    GetTenant(GetTenantError),
+    #[error("shutting down")]
+    ShuttingDown,
+}
+
+impl From<GetTenantError> for SecondaryTenantError {
+    fn from(gte: GetTenantError) -> Self {
+        Self::GetTenant(gte)
+    }
 }
 
 // Whereas [`Tenant`] represents an attached tenant, this type represents the work
@@ -285,7 +300,7 @@ impl SecondaryController {
         &self,
         queue: &tokio::sync::mpsc::Sender<CommandRequest<T>>,
         payload: T,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), SecondaryTenantError> {
         let (response_tx, response_rx) = tokio::sync::oneshot::channel();
 
         queue
@@ -294,20 +309,26 @@ impl SecondaryController {
                 response_tx,
             })
             .await
-            .map_err(|_| anyhow::anyhow!("Receiver shut down"))?;
+            .map_err(|_| SecondaryTenantError::ShuttingDown)?;
         let response = response_rx
             .await
-            .map_err(|_| anyhow::anyhow!("Request dropped"))?;
+            .map_err(|_| SecondaryTenantError::ShuttingDown)?;
 
         response.result
     }
 
-    pub async fn upload_tenant(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
+    pub(crate) async fn upload_tenant(
+        &self,
+        tenant_shard_id: TenantShardId,
+    ) -> Result<(), SecondaryTenantError> {
         self.dispatch(&self.upload_req_tx, UploadCommand::Upload(tenant_shard_id))
             .await
     }
 
-    pub async fn download_tenant(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
+    pub(crate) async fn download_tenant(
+        &self,
+        tenant_shard_id: TenantShardId,
+    ) -> Result<(), SecondaryTenantError> {
         self.dispatch(
             &self.download_req_tx,
             DownloadCommand::Download(tenant_shard_id),
diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs
index 8d771dc40535..701e4cf04b49 100644
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -35,7 +35,7 @@ use super::{
         self, period_jitter, period_warmup, Completion, JobGenerator, SchedulingResult,
         TenantBackgroundJobs,
     },
-    SecondaryTenant,
+    GetTenantError, SecondaryTenant, SecondaryTenantError,
 };
 
 use crate::tenant::{
@@ -470,15 +470,16 @@ impl JobGenerator
-    fn on_command(&mut self, command: DownloadCommand) -> anyhow::Result<PendingDownload> {
+    fn on_command(
+        &mut self,
+        command: DownloadCommand,
+    ) -> Result<PendingDownload, SecondaryTenantError> {
         let tenant_shard_id = command.get_tenant_shard_id();
 
         let tenant = self
             .tenant_manager
-            .get_secondary_tenant_shard(*tenant_shard_id);
-        let Some(tenant) = tenant else {
-            return Err(anyhow::anyhow!("Not found or not in Secondary mode"));
-        };
+            .get_secondary_tenant_shard(*tenant_shard_id)
+            .ok_or(GetTenantError::ShardNotFound(*tenant_shard_id))?;
 
         Ok(PendingDownload {
             target_time: None,
diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs
index e680fd705b42..c5e5e0494571 100644
--- a/pageserver/src/tenant/secondary/heatmap_uploader.rs
+++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs
@@ -28,7 +28,7 @@ use super::{
         self, period_jitter, period_warmup, JobGenerator, RunningJob, SchedulingResult,
         TenantBackgroundJobs,
     },
-    CommandRequest, UploadCommand,
+    CommandRequest, SecondaryTenantError, UploadCommand,
 };
 use tokio_util::sync::CancellationToken;
 use tracing::{info_span, instrument, Instrument};
@@ -279,7 +279,10 @@ impl JobGenerator
         }.instrument(info_span!(parent: None, "heatmap_upload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))))
     }
 
-    fn on_command(&mut self, command: UploadCommand) -> anyhow::Result {
+    fn on_command(
+        &mut self,
+        command: UploadCommand,
+    ) -> Result {
         let tenant_shard_id = command.get_tenant_shard_id();
 
         tracing::info!(
@@ -287,8 +290,7 @@ impl JobGenerator
             "Starting heatmap write on command");
         let tenant = self
             .tenant_manager
-            .get_attached_tenant_shard(*tenant_shard_id)
-            .map_err(|e| anyhow::anyhow!(e))?;
+            .get_attached_tenant_shard(*tenant_shard_id)?;
         if !tenant.is_active() {
             return Err(GetTenantError::NotActive(*tenant_shard_id).into());
         }
diff --git a/pageserver/src/tenant/secondary/scheduler.rs b/pageserver/src/tenant/secondary/scheduler.rs
index 28cf2125dfd0..e963c722b97a 100644
--- a/pageserver/src/tenant/secondary/scheduler.rs
+++ b/pageserver/src/tenant/secondary/scheduler.rs
@@ -12,7 +12,7 @@ use tokio::task::JoinSet;
 use tokio_util::sync::CancellationToken;
 use utils::{completion::Barrier, yielding_loop::yielding_loop};
 
-use super::{CommandRequest, CommandResponse};
+use super::{CommandRequest, CommandResponse, SecondaryTenantError};
 
 /// Scheduling interval is the time between calls to JobGenerator::schedule.
 /// When we schedule jobs, the job generator may provide a hint of its preferred
@@ -112,7 +112,7 @@ where
 
     /// Called when a command is received. A job will be spawned immediately if the return
     /// value is Some, ignoring concurrency limits and the pending queue.
-    fn on_command(&mut self, cmd: CMD) -> anyhow::Result;
+    fn on_command(&mut self, cmd: CMD) -> Result;
 }
 
 /// [`JobGenerator`] returns this to provide pending jobs, and hints about scheduling
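
---

For reference, the mapping this patch establishes is: a missing tenant shard becomes 404, while an inactive tenant or a shutting-down scheduler becomes 503 instead of the previous blanket 500. The sketch below illustrates that pattern in isolation; it is not pageserver code. The simplified enums, the `String` shard id, and the `status_code` helper are hypothetical stand-ins for `GetTenantError`, `SecondaryTenantError`, and the `ApiError`-to-status conversion, and it assumes the `thiserror` crate, which the patch itself already uses.

```rust
// Standalone sketch, not pageserver code: hypothetical stand-ins showing how
// typed errors propagate into distinct HTTP status codes via `From` and `?`.
use thiserror::Error;

#[derive(Error, Debug)]
enum GetTenantError {
    #[error("tenant shard {0} not found")]
    ShardNotFound(String), // real code uses TenantShardId; String keeps this self-contained
    #[error("tenant {0} is not active")]
    NotActive(String),
}

#[derive(Error, Debug)]
enum SecondaryTenantError {
    #[error("{0}")]
    GetTenant(#[from] GetTenantError), // `?` on a GetTenantError converts automatically
    #[error("shutting down")]
    ShuttingDown,
}

/// Hypothetical stand-in for the ApiError conversion: pick a status per variant.
fn status_code(err: &SecondaryTenantError) -> u16 {
    match err {
        // Genuinely absent shard: 404.
        SecondaryTenantError::GetTenant(GetTenantError::ShardNotFound(_)) => 404,
        // Transient states (tenant not yet active during a deploy, pageserver
        // shutting down): 503 so the caller retries, rather than a misleading 500.
        SecondaryTenantError::GetTenant(GetTenantError::NotActive(_)) => 503,
        SecondaryTenantError::ShuttingDown => 503,
    }
}

fn main() {
    let errors: Vec<SecondaryTenantError> = vec![
        GetTenantError::ShardNotFound("example-shard".into()).into(),
        GetTenantError::NotActive("example-shard".into()).into(),
        SecondaryTenantError::ShuttingDown,
    ];
    for err in &errors {
        println!("{err} -> HTTP {}", status_code(err));
    }
}
```

Centralizing the mapping in `From` impls, as `routes.rs` does in the diff above, means handlers only need `?`: switching a handler's error type from `anyhow::Error` to the typed error is enough for it to start returning the appropriate status.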