diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs
index 2414d95eb89b..d7cc6db61bee 100644
--- a/storage_controller/src/scheduler.rs
+++ b/storage_controller/src/scheduler.rs
@@ -206,6 +206,10 @@ pub(crate) struct NodeSecondarySchedulingScore {
     /// The number of shards belonging to the tenant currently being
     /// scheduled that are attached to this node.
     affinity_score: AffinityScore,
+    /// Size of [`ScheduleContext::secondary_nodes`] for the current node.
+    /// This tracks the number of secondary shards belonging to the
+    /// tenant being scheduled that are already on this node.
+    secondary_shards_in_context: usize,
     /// Utilisation score that combines shard count and disk utilisation
     utilization_score: u64,
     /// Total number of shards attached to this node. When nodes have identical utilisation, this
@@ -231,6 +235,7 @@ impl NodeSchedulingScore for NodeSecondarySchedulingScore {
 
         Some(Self {
             az_match: SecondaryAzMatch(AzMatch::new(&node.az, preferred_az.as_ref())),
+            secondary_shards_in_context: context.secondary_nodes.get(node_id).copied().unwrap_or(0),
             affinity_score: context
                 .nodes
                 .get(node_id)
@@ -327,6 +332,9 @@ pub(crate) struct ScheduleContext {
     /// Specifically how many _attached_ locations are on each node
     pub(crate) attached_nodes: HashMap<NodeId, usize>,
 
+    /// Specifically how many _secondary_ locations are on each node
+    pub(crate) secondary_nodes: HashMap<NodeId, usize>,
+
     pub(crate) mode: ScheduleMode,
 }
 
@@ -345,6 +353,11 @@ impl ScheduleContext {
         *entry += 1;
     }
 
+    pub(crate) fn push_secondary(&mut self, node_id: NodeId) {
+        let entry = self.secondary_nodes.entry(node_id).or_default();
+        *entry += 1;
+    }
+
     pub(crate) fn get_node_affinity(&self, node_id: NodeId) -> AffinityScore {
         self.nodes
             .get(&node_id)
@@ -786,7 +799,14 @@ pub(crate) mod test_utils {
 
 #[cfg(test)]
 mod tests {
-    use pageserver_api::{controller_api::NodeAvailability, models::utilization::test_utilization};
+    use pageserver_api::{
+        controller_api::NodeAvailability, models::utilization::test_utilization,
+        shard::{ShardIdentity, TenantShardId},
+    };
+    use utils::{
+        id::TenantId,
+        shard::{ShardCount, ShardNumber},
+    };
 
     use super::*;
 
@@ -1074,4 +1094,176 @@ mod tests {
             intent.clear(&mut scheduler);
         }
     }
+
+    // Schedule eight shards of one tenant across five nodes in a single AZ,
+    // then verify that optimize_attachment proposes no further moves once
+    // secondary locations are counted in the schedule context.
+    #[test]
+    fn schedule_with_secondaries_is_stable() {
+        let az_tag = AvailabilityZone("az-a".to_string());
+
+        let nodes = test_utils::make_test_nodes(
+            5,
+            &[
+                az_tag.clone(),
+                az_tag.clone(),
+                az_tag.clone(),
+                az_tag.clone(),
+                az_tag.clone(),
+            ],
+        );
+        let mut scheduler = Scheduler::new(nodes.values());
+
+        // Need to keep these alive because they contribute to shard counts via RAII
+        let mut scheduled_shards = Vec::new();
+
+        let mut context = ScheduleContext::default();
+
+        fn schedule_shard(
+            tenant_shard_id: TenantShardId,
+            expect_attached: NodeId,
+            expect_secondary: NodeId,
+            scheduled_shards: &mut Vec<TenantShard>,
+            scheduler: &mut Scheduler,
+            context: &mut ScheduleContext,
+        ) {
+            let shard_identity = ShardIdentity::new(
+                tenant_shard_id.shard_number,
+                tenant_shard_id.shard_count,
+                pageserver_api::shard::ShardStripeSize(1),
+            )
+            .unwrap();
+            let mut shard = TenantShard::new(
+                tenant_shard_id,
+                shard_identity,
+                pageserver_api::controller_api::PlacementPolicy::Attached(1),
+            );
+
+            shard.schedule(scheduler, context).unwrap();
+
+            assert_eq!(shard.intent.get_attached().unwrap(), expect_attached);
+            assert_eq!(
+                shard.intent.get_secondary().first().unwrap(),
+                &expect_secondary
+            );
+
+            scheduled_shards.push(shard);
+        }
+
+        let tenant_id = TenantId::generate();
+
+        // Expect attached locations to round-robin over nodes 1,3,5,2,4 and
+        // each secondary to land on the node after its attachment (wrapping 5->1).
+        schedule_shard(
+            TenantShardId {
+                tenant_id,
+                shard_number: ShardNumber(0),
+                shard_count: ShardCount(8),
+            },
+            NodeId(1),
+            NodeId(2),
+            &mut scheduled_shards,
+            &mut scheduler,
+            &mut context,
+        );
+
+        schedule_shard(
+            TenantShardId {
+                tenant_id,
+                shard_number: ShardNumber(1),
+                shard_count: ShardCount(8),
+            },
+            NodeId(3),
+            NodeId(4),
+            &mut scheduled_shards,
+            &mut scheduler,
+            &mut context,
+        );
+
+        schedule_shard(
+            TenantShardId {
+                tenant_id,
+                shard_number: ShardNumber(2),
+                shard_count: ShardCount(8),
+            },
+            NodeId(5),
+            NodeId(1),
+            &mut scheduled_shards,
+            &mut scheduler,
+            &mut context,
+        );
+
+        schedule_shard(
+            TenantShardId {
+                tenant_id,
+                shard_number: ShardNumber(3),
+                shard_count: ShardCount(8),
+            },
+            NodeId(2),
+            NodeId(3),
+            &mut scheduled_shards,
+            &mut scheduler,
+            &mut context,
+        );
+
+        schedule_shard(
+            TenantShardId {
+                tenant_id,
+                shard_number: ShardNumber(4),
+                shard_count: ShardCount(8),
+            },
+            NodeId(4),
+            NodeId(5),
+            &mut scheduled_shards,
+            &mut scheduler,
+            &mut context,
+        );
+
+        schedule_shard(
+            TenantShardId {
+                tenant_id,
+                shard_number: ShardNumber(5),
+                shard_count: ShardCount(8),
+            },
+            NodeId(1),
+            NodeId(2),
+            &mut scheduled_shards,
+            &mut scheduler,
+            &mut context,
+        );
+
+        schedule_shard(
+            TenantShardId {
+                tenant_id,
+                shard_number: ShardNumber(6),
+                shard_count: ShardCount(8),
+            },
+            NodeId(3),
+            NodeId(4),
+            &mut scheduled_shards,
+            &mut scheduler,
+            &mut context,
+        );
+
+        schedule_shard(
+            TenantShardId {
+                tenant_id,
+                shard_number: ShardNumber(7),
+                shard_count: ShardCount(8),
+            },
+            NodeId(5),
+            NodeId(1),
+            &mut scheduled_shards,
+            &mut scheduler,
+            &mut context,
+        );
+
+        for shard in &scheduled_shards {
+            assert_eq!(shard.optimize_attachment(&nodes, &context), None);
+        }
+
+        for mut shard in scheduled_shards {
+            shard.intent.clear(&mut scheduler);
+        }
+    }
 }
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index bd5759422ce6..c3c9f3f9f9b1 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -5858,10 +5858,7 @@ impl Service {
 
                 // Accumulate the schedule context for all the shards in a tenant: we must have
                 // the total view of all shards before we can try to optimize any of them.
-                schedule_context.avoid(&shard.intent.all_pageservers());
-                if let Some(attached) = shard.intent.get_attached() {
-                    schedule_context.push_attached(*attached);
-                }
+                shard.populate_context(&mut schedule_context);
                 tenant_shards.push(shard);
 
                 // Once we have seen the last shard in the tenant, proceed to search across all shards
diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs
index 953c73119bbb..fd189e495183 100644
--- a/storage_controller/src/tenant_shard.rs
+++ b/storage_controller/src/tenant_shard.rs
@@ -566,10 +566,7 @@ impl TenantShard {
     ) -> Result<(), ScheduleError> {
         let r = self.do_schedule(scheduler, context);
 
-        context.avoid(&self.intent.all_pageservers());
-        if let Some(attached) = self.intent.get_attached() {
-            context.push_attached(*attached);
-        }
+        self.populate_context(context);
 
         r
     }
@@ -676,6 +673,28 @@ impl TenantShard {
         Ok(())
     }
 
+    /// When building the ScheduleContext of a tenant, call this on each shard to
+    /// add its contribution to the context.
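+    ///
+    /// All shards of a tenant should contribute before any shard is
+    /// optimised, e.g.:
+    /// ```ignore
+    /// let mut schedule_context = ScheduleContext::default();
+    /// for shard in &tenant_shards {
+    ///     shard.populate_context(&mut schedule_context);
+    /// }
+    /// ```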
+    pub(crate) fn populate_context(&self, context: &mut ScheduleContext) {
+        context.avoid(&self.intent.all_pageservers());
+
+        if let Some(attached) = self.intent.get_attached() {
+            context.push_attached(*attached);
+        }
+        for secondary in self.intent.get_secondary() {
+            context.push_secondary(*secondary);
+        }
+    }
+
     /// Reschedule this tenant shard to one of its secondary locations. Returns a scheduling error
     /// if the swap is not possible and leaves the intent state in its original state.
     ///
@@ -1632,10 +1642,8 @@ pub(crate) mod tests {
         shard_b.intent.push_secondary(&mut scheduler, NodeId(3));
 
         let mut schedule_context = ScheduleContext::default();
-        schedule_context.avoid(&shard_a.intent.all_pageservers());
-        schedule_context.push_attached(shard_a.intent.get_attached().unwrap());
-        schedule_context.avoid(&shard_b.intent.all_pageservers());
-        schedule_context.push_attached(shard_b.intent.get_attached().unwrap());
+        shard_a.populate_context(&mut schedule_context);
+        shard_b.populate_context(&mut schedule_context);
 
         let optimization_a = shard_a.optimize_attachment(&nodes, &schedule_context);
 
@@ -1699,10 +1707,8 @@ pub(crate) mod tests {
         shard_b.intent.push_secondary(&mut scheduler, NodeId(3));
 
         let mut schedule_context = ScheduleContext::default();
-        schedule_context.avoid(&shard_a.intent.all_pageservers());
-        schedule_context.push_attached(shard_a.intent.get_attached().unwrap());
-        schedule_context.avoid(&shard_b.intent.all_pageservers());
-        schedule_context.push_attached(shard_b.intent.get_attached().unwrap());
+        shard_a.populate_context(&mut schedule_context);
+        shard_b.populate_context(&mut schedule_context);
 
         let optimization_a = shard_a.optimize_secondary(&mut scheduler, &schedule_context);
 
@@ -1744,10 +1750,7 @@ pub(crate) mod tests {
 
         let mut any_changed = false;
         for shard in shards.iter() {
-            schedule_context.avoid(&shard.intent.all_pageservers());
-            if let Some(attached) = shard.intent.get_attached() {
-                schedule_context.push_attached(*attached);
-            }
+            shard.populate_context(&mut schedule_context);
        }
 
         for shard in shards.iter_mut() {
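
The bookkeeping these hunks introduce is easiest to see in isolation. Below is a minimal, self-contained sketch using simplified stand-ins for `NodeId`, the intent state, and `ScheduleContext` (the real type also tracks affinity scores and a `ScheduleMode`); it illustrates the idea rather than the storage controller's actual types:

```rust
use std::collections::HashMap;

// Simplified stand-in for the controller's NodeId newtype.
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
struct NodeId(u64);

// Cut-down ScheduleContext: just the two per-node counters from the diff.
#[derive(Default, Debug)]
struct ScheduleContext {
    attached_nodes: HashMap<NodeId, usize>,  // attached locations per node
    secondary_nodes: HashMap<NodeId, usize>, // secondary locations per node
}

impl ScheduleContext {
    fn push_attached(&mut self, node_id: NodeId) {
        *self.attached_nodes.entry(node_id).or_default() += 1;
    }

    // The new counter: mirrors push_attached, but for secondary locations.
    fn push_secondary(&mut self, node_id: NodeId) {
        *self.secondary_nodes.entry(node_id).or_default() += 1;
    }
}

// Stand-in for a shard's intent: one optional attachment plus secondaries.
struct Intent {
    attached: Option<NodeId>,
    secondary: Vec<NodeId>,
}

impl Intent {
    // Mirrors TenantShard::populate_context: the shard adds its whole
    // footprint to the context in one place, so callers can no longer
    // update the attached map and forget the secondary one.
    fn populate_context(&self, context: &mut ScheduleContext) {
        if let Some(attached) = self.attached {
            context.push_attached(attached);
        }
        for secondary in &self.secondary {
            context.push_secondary(*secondary);
        }
    }
}

fn main() {
    // Two shards of the same tenant sharing node 2 as a secondary.
    let shards = [
        Intent { attached: Some(NodeId(1)), secondary: vec![NodeId(2)] },
        Intent { attached: Some(NodeId(3)), secondary: vec![NodeId(2)] },
    ];

    let mut context = ScheduleContext::default();
    for shard in &shards {
        shard.populate_context(&mut context);
    }

    // A secondary-aware score (like secondary_shards_in_context) can now
    // see that node 2 already holds two of this tenant's secondaries and
    // steer the next secondary elsewhere.
    assert_eq!(context.secondary_nodes[&NodeId(2)], 2);
    assert_eq!(context.attached_nodes[&NodeId(1)], 1);
    println!("{context:?}");
}
```

Funnelling every caller through a single `populate_context` is what makes the new `secondary_shards_in_context` score trustworthy: the scheduling path (`TenantShard::schedule`), the optimisation pass in `Service`, and the tests all count secondaries the same way.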