Skip to content

Commit

Permalink
storcon: add metric for AZ scheduling violations
Browse files Browse the repository at this point in the history
  • Loading branch information
jcsp committed Nov 29, 2024
1 parent 1d642d6 commit 509d032
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 0 deletions.
6 changes: 6 additions & 0 deletions storage_controller/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,12 @@ pub(crate) struct StorageControllerMetricGroup {
/// Count of how many times we make an optimization change to a tenant's scheduling
pub(crate) storage_controller_schedule_optimization: measured::Counter,

/// How many shards are not scheduled into their preferred AZ
pub(crate) storage_controller_schedule_az_violation: measured::Gauge,

/// How many shards would like to reconcile but were blocked by concurrency limits
pub(crate) storage_controller_pending_reconciles: measured::Gauge,

/// HTTP request status counters for handled requests
pub(crate) storage_controller_http_request_status:
measured::CounterVec<HttpRequestStatusLabelGroupSet>,
Expand Down
32 changes: 32 additions & 0 deletions storage_controller/src/service.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6013,30 +6013,62 @@ impl Service {

let mut schedule_context = ScheduleContext::default();

// This function is an efficient place to update lazy statistics, since we are walking
// all tenants.
let mut pending_reconciles = 0;
let mut az_violations = 0;

let mut reconciles_spawned = 0;
for (tenant_shard_id, shard) in tenants.iter_mut() {
if tenant_shard_id.is_shard_zero() {
schedule_context = ScheduleContext::default();
}

// Accumulate scheduling statistics
if let (Some(attached), Some(preferred)) =
(shard.intent.get_attached(), shard.preferred_az())
{
let node_az = nodes
.get(attached)
.expect("Nodes exist if referenced")
.get_availability_zone_id();
if node_az != preferred {
az_violations += 1;
}
}

// Skip checking if this shard is already enqueued for reconciliation
if shard.delayed_reconcile && self.reconciler_concurrency.available_permits() == 0 {
// If there is something delayed, then return a nonzero count so that
// callers like reconcile_all_now do not incorrectly get the impression
// that the system is in a quiescent state.
reconciles_spawned = std::cmp::max(1, reconciles_spawned);
pending_reconciles += 1;
continue;
}

// Eventual consistency: if an earlier reconcile job failed, and the shard is still
// dirty, spawn another one
if self.maybe_reconcile_shard(shard, &pageservers).is_some() {
reconciles_spawned += 1;
} else if shard.delayed_reconcile {
// Shard wanted to reconcile but for some reason couldn't.
pending_reconciles += 1;
}

schedule_context.avoid(&shard.intent.all_pageservers());
}

metrics::METRICS_REGISTRY
.metrics_group
.storage_controller_schedule_az_violation
.set(az_violations as i64);

metrics::METRICS_REGISTRY
.metrics_group
.storage_controller_pending_reconciles
.set(pending_reconciles as i64);

reconciles_spawned
}

Expand Down

0 comments on commit 509d032

Please sign in to comment.