diff --git a/Cargo.lock b/Cargo.lock
index cde9aa7a77be..fc19a5b3a576 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -82,12 +82,27 @@ dependencies = [
  "anstyle",
  "anstyle-parse",
  "anstyle-query",
- "anstyle-wincon",
+ "anstyle-wincon 1.0.1",
  "colorchoice",
  "is-terminal",
  "utf8parse",
 ]
 
+[[package]]
+name = "anstream"
+version = "0.6.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "64e15c1ab1f89faffbf04a634d5e1962e9074f2741eef6d97f3c4e322426d526"
+dependencies = [
+ "anstyle",
+ "anstyle-parse",
+ "anstyle-query",
+ "anstyle-wincon 3.0.4",
+ "colorchoice",
+ "is_terminal_polyfill",
+ "utf8parse",
+]
+
 [[package]]
 name = "anstyle"
 version = "1.0.8"
@@ -122,6 +137,16 @@ dependencies = [
  "windows-sys 0.48.0",
 ]
 
+[[package]]
+name = "anstyle-wincon"
+version = "3.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5bf74e1b6e971609db8ca7a9ce79fd5768ab6ae46441c572e46cf596f59e57f8"
+dependencies = [
+ "anstyle",
+ "windows-sys 0.52.0",
+]
+
 [[package]]
 name = "anyhow"
 version = "1.0.71"
@@ -1185,7 +1210,7 @@ version = "4.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4f423e341edefb78c9caba2d9c7f7687d0e72e89df3ce3394554754393ac3990"
 dependencies = [
- "anstream",
+ "anstream 0.3.2",
  "anstyle",
  "bitflags 1.3.2",
  "clap_lex",
@@ -1924,6 +1949,15 @@ dependencies = [
  "syn 2.0.52",
 ]
 
+[[package]]
+name = "env_filter"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4f2c92ceda6ceec50f43169f9ee8424fe2db276791afde7b2cd8bc084cb376ab"
+dependencies = [
+ "log",
+]
+
 [[package]]
 name = "env_logger"
 version = "0.10.2"
@@ -1937,6 +1971,18 @@ dependencies = [
  "termcolor",
 ]
 
+[[package]]
+name = "env_logger"
+version = "0.11.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6c012a26a7f605efc424dd53697843a72be7dc86ad2d01f7814337794a12231d"
+dependencies = [
+ "anstream 0.6.15",
+ "anstyle",
+ "env_filter",
+ "log",
+]
+
 [[package]]
 name = "equivalent"
 version = "1.0.1"
@@ -2811,6 +2857,12 @@ dependencies = [
  "windows-sys 0.52.0",
 ]
 
+[[package]]
+name = "is_terminal_polyfill"
+version = "1.70.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
+
 [[package]]
 name = "itertools"
 version = "0.10.5"
@@ -3254,6 +3306,16 @@ dependencies = [
  "winapi",
 ]
 
+[[package]]
+name = "nu-ansi-term"
+version = "0.46.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84"
+dependencies = [
+ "overload",
+ "winapi",
+]
+
 [[package]]
 name = "num"
 version = "0.4.1"
@@ -3537,6 +3599,12 @@ version = "0.5.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a"
 
+[[package]]
+name = "overload"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
+
 [[package]]
 name = "p256"
 version = "0.11.1"
@@ -4119,7 +4187,7 @@ dependencies = [
  "bindgen",
  "bytes",
  "crc32c",
- "env_logger",
+ "env_logger 0.10.2",
  "log",
  "memoffset 0.9.0",
  "once_cell",
@@ -4312,7 +4380,7 @@ dependencies = [
  "consumption_metrics",
  "dashmap",
  "ecdsa 0.16.9",
- "env_logger",
+ "env_logger 0.10.2",
  "fallible-iterator",
  "framed-websockets",
  "futures",
@@ -5787,6 +5855,7 @@ dependencies = [
  "serde_json",
  "strum",
"strum_macros", + "test-log", "thiserror", "tokio", "tokio-util", @@ -6039,6 +6108,28 @@ dependencies = [ "syn 2.0.52", ] +[[package]] +name = "test-log" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3dffced63c2b5c7be278154d76b479f9f9920ed34e7574201407f0b14e2bbb93" +dependencies = [ + "env_logger 0.11.2", + "test-log-macros", + "tracing-subscriber", +] + +[[package]] +name = "test-log-macros" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5999e24eaa32083191ba4e425deb75cdf25efefabe5aaccb7446dd0d4122a3f5" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.52", +] + [[package]] name = "thiserror" version = "1.0.57" @@ -6570,6 +6661,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ad0f048c97dbd9faa9b7df56362b8ebcaa52adb06b498c050d2f4e32f90a7a8b" dependencies = [ "matchers", + "nu-ansi-term", "once_cell", "regex", "serde", @@ -6874,7 +6966,7 @@ dependencies = [ "anyhow", "camino-tempfile", "clap", - "env_logger", + "env_logger 0.10.2", "log", "postgres", "postgres_ffi", diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index 9ed0501026dc..4255fc8d1b3e 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -56,3 +56,6 @@ utils = { path = "../libs/utils/" } metrics = { path = "../libs/metrics/" } control_plane = { path = "../control_plane" } workspace_hack = { version = "0.1", path = "../workspace_hack" } + +[dev-dependencies] +test-log = "*" \ No newline at end of file diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index 2414d95eb89b..d7cc6db61bee 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -206,6 +206,10 @@ pub(crate) struct NodeSecondarySchedulingScore { /// The number of shards belonging to the tenant currently being /// scheduled that are attached to this node. affinity_score: AffinityScore, + /// Size of [`ScheduleContext::attached_nodes`] for the current node. + /// This normally tracks the number of attached shards belonging to the + /// tenant being scheduled that are already on this node. + secondary_shards_in_context: usize, /// Utilisation score that combines shard count and disk utilisation utilization_score: u64, /// Total number of shards attached to this node. 
When nodes have identical utilisation, this @@ -231,6 +235,7 @@ impl NodeSchedulingScore for NodeSecondarySchedulingScore { Some(Self { az_match: SecondaryAzMatch(AzMatch::new(&node.az, preferred_az.as_ref())), + secondary_shards_in_context: context.secondary_nodes.get(node_id).copied().unwrap_or(0), affinity_score: context .nodes .get(node_id) @@ -327,6 +332,9 @@ pub(crate) struct ScheduleContext { /// Specifically how many _attached_ locations are on each node pub(crate) attached_nodes: HashMap, + /// Specifically how many _secondary_ locations are on each node + pub(crate) secondary_nodes: HashMap, + pub(crate) mode: ScheduleMode, } @@ -345,6 +353,11 @@ impl ScheduleContext { *entry += 1; } + pub(crate) fn push_secondary(&mut self, node_id: NodeId) { + let entry = self.secondary_nodes.entry(node_id).or_default(); + *entry += 1; + } + pub(crate) fn get_node_affinity(&self, node_id: NodeId) -> AffinityScore { self.nodes .get(&node_id) @@ -786,7 +799,14 @@ pub(crate) mod test_utils { #[cfg(test)] mod tests { - use pageserver_api::{controller_api::NodeAvailability, models::utilization::test_utilization}; + use pageserver_api::{ + controller_api::NodeAvailability, models::utilization::test_utilization, + shard::ShardIdentity, shard::TenantShardId, + }; + use utils::{ + id::TenantId, + shard::{ShardCount, ShardNumber}, + }; use super::*; @@ -1074,4 +1094,171 @@ mod tests { intent.clear(&mut scheduler); } } + + #[test] + fn repro_foo() { + let az_tag = AvailabilityZone("az-a".to_string()); + + let nodes = test_utils::make_test_nodes( + 5, + &[ + az_tag.clone(), + az_tag.clone(), + az_tag.clone(), + az_tag.clone(), + az_tag.clone(), + ], + ); + let mut scheduler = Scheduler::new(nodes.values()); + + // Need to keep these alive because they contribute to shard counts via RAII + let mut scheduled_shards = Vec::new(); + + let mut context = ScheduleContext::default(); + + fn schedule_shard( + tenant_shard_id: TenantShardId, + expect_attached: NodeId, + expect_secondary: NodeId, + scheduled_shards: &mut Vec, + scheduler: &mut Scheduler, + context: &mut ScheduleContext, + ) { + let shard_identity = ShardIdentity::new( + tenant_shard_id.shard_number, + tenant_shard_id.shard_count, + pageserver_api::shard::ShardStripeSize(1), + ) + .unwrap(); + let mut shard = TenantShard::new( + tenant_shard_id, + shard_identity, + pageserver_api::controller_api::PlacementPolicy::Attached(1), + ); + + shard.schedule(scheduler, context).unwrap(); + + assert_eq!(shard.intent.get_attached().unwrap(), expect_attached); + assert_eq!( + shard.intent.get_secondary().first().unwrap(), + &expect_secondary + ); + + scheduled_shards.push(shard); + } + + let tenant_id = TenantId::generate(); + + schedule_shard( + TenantShardId { + tenant_id, + shard_number: ShardNumber(0), + shard_count: ShardCount(8), + }, + NodeId(1), + NodeId(2), + &mut scheduled_shards, + &mut scheduler, + &mut context, + ); + + schedule_shard( + TenantShardId { + tenant_id, + shard_number: ShardNumber(1), + shard_count: ShardCount(8), + }, + NodeId(3), + NodeId(4), + &mut scheduled_shards, + &mut scheduler, + &mut context, + ); + + schedule_shard( + TenantShardId { + tenant_id, + shard_number: ShardNumber(2), + shard_count: ShardCount(8), + }, + NodeId(5), + NodeId(1), + &mut scheduled_shards, + &mut scheduler, + &mut context, + ); + + schedule_shard( + TenantShardId { + tenant_id, + shard_number: ShardNumber(3), + shard_count: ShardCount(8), + }, + NodeId(2), + NodeId(3), + &mut scheduled_shards, + &mut scheduler, + &mut context, + ); + + schedule_shard( + 
+            TenantShardId {
+                tenant_id,
+                shard_number: ShardNumber(4),
+                shard_count: ShardCount(8),
+            },
+            NodeId(4),
+            NodeId(5),
+            &mut scheduled_shards,
+            &mut scheduler,
+            &mut context,
+        );
+
+        schedule_shard(
+            TenantShardId {
+                tenant_id,
+                shard_number: ShardNumber(5),
+                shard_count: ShardCount(8),
+            },
+            NodeId(1),
+            NodeId(2),
+            &mut scheduled_shards,
+            &mut scheduler,
+            &mut context,
+        );
+
+        schedule_shard(
+            TenantShardId {
+                tenant_id,
+                shard_number: ShardNumber(6),
+                shard_count: ShardCount(8),
+            },
+            NodeId(3),
+            NodeId(4),
+            &mut scheduled_shards,
+            &mut scheduler,
+            &mut context,
+        );
+
+        schedule_shard(
+            TenantShardId {
+                tenant_id,
+                shard_number: ShardNumber(7),
+                shard_count: ShardCount(8),
+            },
+            NodeId(5),
+            NodeId(1),
+            &mut scheduled_shards,
+            &mut scheduler,
+            &mut context,
+        );
+
+        for shard in &scheduled_shards {
+            assert_eq!(shard.optimize_attachment(&nodes, &context), None);
+        }
+
+        for mut shard in scheduled_shards {
+            shard.intent.clear(&mut scheduler);
+        }
+    }
 }
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index bd5759422ce6..c3c9f3f9f9b1 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -5858,10 +5858,7 @@ impl Service {
             // Accumulate the schedule context for all the shards in a tenant: we must have
             // the total view of all shards before we can try to optimize any of them.
-            schedule_context.avoid(&shard.intent.all_pageservers());
-            if let Some(attached) = shard.intent.get_attached() {
-                schedule_context.push_attached(*attached);
-            }
+            shard.populate_context(&mut schedule_context);
             tenant_shards.push(shard);
 
             // Once we have seen the last shard in the tenant, proceed to search across all shards
diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs
index 953c73119bbb..bde6aa488b2d 100644
--- a/storage_controller/src/tenant_shard.rs
+++ b/storage_controller/src/tenant_shard.rs
@@ -566,10 +566,7 @@ impl TenantShard {
     ) -> Result<(), ScheduleError> {
         let r = self.do_schedule(scheduler, context);
 
-        context.avoid(&self.intent.all_pageservers());
-        if let Some(attached) = self.intent.get_attached() {
-            context.push_attached(*attached);
-        }
+        self.populate_context(context);
 
         r
     }
@@ -676,6 +673,19 @@ impl TenantShard {
         Ok(())
     }
 
+    /// When building the ScheduleContext of a tenant, call this on each shard to
+    /// add its contribution to the context.
+    pub(crate) fn populate_context(&self, context: &mut ScheduleContext) {
+        context.avoid(&self.intent.all_pageservers());
+
+        if let Some(attached) = self.intent.get_attached() {
+            context.push_attached(*attached);
+        }
+        for secondary in self.intent.get_secondary() {
+            context.push_secondary(*secondary);
+        }
+    }
+
     /// Reschedule this tenant shard to one of its secondary locations. Returns a scheduling error
     /// if the swap is not possible and leaves the intent state in its original state.
     ///
@@ -823,10 +833,13 @@ impl TenantShard {
                 continue;
             };
 
+            // TODO: make this AZ aware: secondary should be chosen "As if I am an attachment, but
+            // in a different AZ to my actual preferred AZ"
+
             // Let the scheduler suggest a node, where it would put us if we were scheduling afresh
             // This implicitly limits the choice to nodes that are available, and prefers nodes
             // with lower utilization.
-            let Ok(candidate_node) = scheduler.schedule_shard::(
+            let Ok(candidate_node) = scheduler.schedule_shard::(
                 &self.intent.all_pageservers(),
                 &self.preferred_az_id,
                 schedule_context,
@@ -1632,10 +1645,8 @@ pub(crate) mod tests {
         shard_b.intent.push_secondary(&mut scheduler, NodeId(3));
 
         let mut schedule_context = ScheduleContext::default();
-        schedule_context.avoid(&shard_a.intent.all_pageservers());
-        schedule_context.push_attached(shard_a.intent.get_attached().unwrap());
-        schedule_context.avoid(&shard_b.intent.all_pageservers());
-        schedule_context.push_attached(shard_b.intent.get_attached().unwrap());
+        shard_a.populate_context(&mut schedule_context);
+        shard_b.populate_context(&mut schedule_context);
 
         let optimization_a = shard_a.optimize_attachment(&nodes, &schedule_context);
 
@@ -1699,10 +1710,8 @@
         shard_b.intent.push_secondary(&mut scheduler, NodeId(3));
 
         let mut schedule_context = ScheduleContext::default();
-        schedule_context.avoid(&shard_a.intent.all_pageservers());
-        schedule_context.push_attached(shard_a.intent.get_attached().unwrap());
-        schedule_context.avoid(&shard_b.intent.all_pageservers());
-        schedule_context.push_attached(shard_b.intent.get_attached().unwrap());
+        shard_a.populate_context(&mut schedule_context);
+        shard_b.populate_context(&mut schedule_context);
 
         let optimization_a = shard_a.optimize_secondary(&mut scheduler, &schedule_context);
 
@@ -1744,10 +1753,7 @@
         let mut any_changed = false;
 
         for shard in shards.iter() {
-            schedule_context.avoid(&shard.intent.all_pageservers());
-            if let Some(attached) = shard.intent.get_attached() {
-                schedule_context.push_attached(*attached);
-            }
+            shard.populate_context(&mut schedule_context);
         }
 
         for shard in shards.iter_mut() {