Skip to content

Commit

Permalink
pageserver: apply 5 fails -> 1hr wait circuit breaker to compaction
Browse files Browse the repository at this point in the history
  • Loading branch information
jcsp committed Jul 11, 2024
1 parent 415b543 commit 3dfe1e5
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 2 deletions.
16 changes: 16 additions & 0 deletions pageserver/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -569,6 +569,22 @@ static VALID_LSN_LEASE_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
.expect("failed to define a metric")
});

pub(crate) static CIRCUIT_BREAKERS_BROKEN: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_circuit_breaker_broken",
"How many times a circuit breaker has broken"
)
.expect("failed to define a metric")
});

pub(crate) static CIRCUIT_BREAKERS_UNBROKEN: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_circuit_breaker_unbroken",
"How many times a circuit breaker has been un-broken (recovered)"
)
.expect("failed to define a metric")
});

pub(crate) mod initial_logical_size {
use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
use once_cell::sync::Lazy;
Expand Down
36 changes: 34 additions & 2 deletions pageserver/src/tenant.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ use tokio::task::JoinSet;
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::backoff;
use utils::circuit_breaker::CircuitBreaker;
use utils::completion;
use utils::crashsafe::path_with_suffix_extension;
use utils::failpoint_support;
Expand Down Expand Up @@ -76,7 +77,8 @@ use crate::is_uninit_mark;
use crate::l0_flush::L0FlushGlobalState;
use crate::metrics::TENANT;
use crate::metrics::{
remove_tenant_metrics, BROKEN_TENANTS_SET, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC,
remove_tenant_metrics, BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN,
TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC,
};
use crate::repository::GcResult;
use crate::task_mgr;
Expand Down Expand Up @@ -276,6 +278,10 @@ pub struct Tenant {

eviction_task_tenant_state: tokio::sync::Mutex<EvictionTaskTenantState>,

/// Track repeated failures to compact, so that we can back off.
/// Overhead of mutex is acceptable because compaction is done with a multi-second period.
compaction_circuit_breaker: std::sync::Mutex<CircuitBreaker>,

/// If the tenant is in Activating state, notify this to encourage it
/// to proceed to Active as soon as possible, rather than waiting for lazy
/// background warmup.
Expand Down Expand Up @@ -1641,13 +1647,31 @@ impl Tenant {
timelines_to_compact
};

// Before doing any I/O work, check our circuit breaker
if self.compaction_circuit_breaker.lock().unwrap().is_broken() {
info!("Skipping compaction due to previous failures");
return Ok(());
}

for (timeline_id, timeline) in &timelines_to_compact {
timeline
.compact(cancel, EnumSet::empty(), ctx)
.instrument(info_span!("compact_timeline", %timeline_id))
.await?;
.await
.map_err(|e| {
self.compaction_circuit_breaker
.lock()
.unwrap()
.fail(&CIRCUIT_BREAKERS_BROKEN, &e);
e
})?;
}

self.compaction_circuit_breaker
.lock()
.unwrap()
.success(&CIRCUIT_BREAKERS_UNBROKEN);

Ok(())
}

Expand Down Expand Up @@ -2570,6 +2594,14 @@ impl Tenant {
cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()),
cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()),
compaction_circuit_breaker: std::sync::Mutex::new(CircuitBreaker::new(
format!("compaction-{tenant_shard_id}"),
5,
// Compaction can be a very expensive operation, and might leak disk space. It also ought
// to be infallible, as long as remote storage is available. So if it repeatedly fails,
// use an extremely long backoff.
Some(Duration::from_secs(3600 * 24)),
)),
activate_now_sem: tokio::sync::Semaphore::new(0),
cancel: CancellationToken::default(),
gate: Gate::default(),
Expand Down

0 comments on commit 3dfe1e5

Please sign in to comment.