storage controller: fix a disagreement between schedule + optimize
Closes: #8969
jcsp committed Oct 3, 2024
1 parent dbef1b0 commit b66bf89
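
In outline, this change makes scheduling and the optimization pass build a tenant's ScheduleContext the same way: rather than each call site repeating avoid() plus push_attached(), shards now report their locations through a shared TenantShard::populate_context(), and the context also tracks secondary locations (secondary_nodes, filled by push_secondary), which the secondary scheduling score reads as secondary_shards_in_context. Below is a minimal sketch of the per-tenant pattern the diff converges on; it is illustrative only, the function name is hypothetical, and the types are the ones defined in this repository.

// Illustrative sketch, not code from this commit: the per-tenant pattern shared
// by scheduling and optimization after this change. `shards` holds every
// TenantShard of one tenant; `nodes` is the pageserver map.
fn build_context_and_optimize(
    shards: &mut [TenantShard],
    nodes: &HashMap<NodeId, Node>,
    scheduler: &mut Scheduler,
) {
    let mut context = ScheduleContext::default();

    // First pass: every shard contributes both its attached and its secondary
    // locations via the new TenantShard::populate_context().
    for shard in shards.iter() {
        shard.populate_context(&mut context);
    }

    // Second pass: optimizations are evaluated against the same totals that
    // schedule() saw, so the two can no longer disagree about placement.
    for shard in shards.iter_mut() {
        if let Some(optimization) = shard.optimize_attachment(nodes, &context) {
            // apply the optimization (elided)
            let _ = optimization;
        }
        if let Some(optimization) = shard.optimize_secondary(scheduler, &context) {
            // apply the optimization (elided)
            let _ = optimization;
        }
    }
}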
Showing 3 changed files with 208 additions and 21 deletions.
189 changes: 188 additions & 1 deletion storage_controller/src/scheduler.rs
@@ -206,6 +206,10 @@ pub(crate) struct NodeSecondarySchedulingScore {
/// The number of shards belonging to the tenant currently being
/// scheduled that are attached to this node.
affinity_score: AffinityScore,
/// Size of [`ScheduleContext::secondary_nodes`] for the current node.
/// This tracks the number of secondary locations belonging to the
/// tenant being scheduled that are already on this node.
secondary_shards_in_context: usize,
/// Utilisation score that combines shard count and disk utilisation
utilization_score: u64,
/// Total number of shards attached to this node. When nodes have identical utilisation, this
@@ -231,6 +235,7 @@ impl NodeSchedulingScore for NodeSecondarySchedulingScore {

Some(Self {
az_match: SecondaryAzMatch(AzMatch::new(&node.az, preferred_az.as_ref())),
secondary_shards_in_context: context.secondary_nodes.get(node_id).copied().unwrap_or(0),
affinity_score: context
.nodes
.get(node_id)
@@ -327,6 +332,9 @@ pub(crate) struct ScheduleContext {
/// Specifically how many _attached_ locations are on each node
pub(crate) attached_nodes: HashMap<NodeId, usize>,

/// Specifically how many _secondary_ locations are on each node
pub(crate) secondary_nodes: HashMap<NodeId, usize>,

pub(crate) mode: ScheduleMode,
}

@@ -345,6 +353,11 @@ impl ScheduleContext {
*entry += 1;
}

pub(crate) fn push_secondary(&mut self, node_id: NodeId) {
let entry = self.secondary_nodes.entry(node_id).or_default();
*entry += 1;
}

pub(crate) fn get_node_affinity(&self, node_id: NodeId) -> AffinityScore {
self.nodes
.get(&node_id)
@@ -786,7 +799,14 @@ pub(crate) mod test_utils {

#[cfg(test)]
mod tests {
use pageserver_api::{controller_api::NodeAvailability, models::utilization::test_utilization};
use pageserver_api::{
controller_api::NodeAvailability, models::utilization::test_utilization,
shard::ShardIdentity, shard::TenantShardId,
};
use utils::{
id::TenantId,
shard::{ShardCount, ShardNumber},
};

use super::*;

@@ -1074,4 +1094,171 @@ mod tests {
intent.clear(&mut scheduler);
}
}

#[test]
fn repro_foo() {
let az_tag = AvailabilityZone("az-a".to_string());

let nodes = test_utils::make_test_nodes(
5,
&[
az_tag.clone(),
az_tag.clone(),
az_tag.clone(),
az_tag.clone(),
az_tag.clone(),
],
);
let mut scheduler = Scheduler::new(nodes.values());

// Need to keep these alive because they contribute to shard counts via RAII
let mut scheduled_shards = Vec::new();

let mut context = ScheduleContext::default();

fn schedule_shard(
tenant_shard_id: TenantShardId,
expect_attached: NodeId,
expect_secondary: NodeId,
scheduled_shards: &mut Vec<TenantShard>,
scheduler: &mut Scheduler,
context: &mut ScheduleContext,
) {
let shard_identity = ShardIdentity::new(
tenant_shard_id.shard_number,
tenant_shard_id.shard_count,
pageserver_api::shard::ShardStripeSize(1),
)
.unwrap();
let mut shard = TenantShard::new(
tenant_shard_id,
shard_identity,
pageserver_api::controller_api::PlacementPolicy::Attached(1),
);

shard.schedule(scheduler, context).unwrap();

assert_eq!(shard.intent.get_attached().unwrap(), expect_attached);
assert_eq!(
shard.intent.get_secondary().first().unwrap(),
&expect_secondary
);

scheduled_shards.push(shard);
}

let tenant_id = TenantId::generate();

schedule_shard(
TenantShardId {
tenant_id,
shard_number: ShardNumber(0),
shard_count: ShardCount(8),
},
NodeId(1),
NodeId(2),
&mut scheduled_shards,
&mut scheduler,
&mut context,
);

schedule_shard(
TenantShardId {
tenant_id,
shard_number: ShardNumber(1),
shard_count: ShardCount(8),
},
NodeId(3),
NodeId(4),
&mut scheduled_shards,
&mut scheduler,
&mut context,
);

schedule_shard(
TenantShardId {
tenant_id,
shard_number: ShardNumber(2),
shard_count: ShardCount(8),
},
NodeId(5),
NodeId(1),
&mut scheduled_shards,
&mut scheduler,
&mut context,
);

schedule_shard(
TenantShardId {
tenant_id,
shard_number: ShardNumber(3),
shard_count: ShardCount(8),
},
NodeId(2),
NodeId(3),
&mut scheduled_shards,
&mut scheduler,
&mut context,
);

schedule_shard(
TenantShardId {
tenant_id,
shard_number: ShardNumber(4),
shard_count: ShardCount(8),
},
NodeId(4),
NodeId(5),
&mut scheduled_shards,
&mut scheduler,
&mut context,
);

schedule_shard(
TenantShardId {
tenant_id,
shard_number: ShardNumber(5),
shard_count: ShardCount(8),
},
NodeId(1),
NodeId(2),
&mut scheduled_shards,
&mut scheduler,
&mut context,
);

schedule_shard(
TenantShardId {
tenant_id,
shard_number: ShardNumber(6),
shard_count: ShardCount(8),
},
NodeId(3),
NodeId(4),
&mut scheduled_shards,
&mut scheduler,
&mut context,
);

schedule_shard(
TenantShardId {
tenant_id,
shard_number: ShardNumber(7),
shard_count: ShardCount(8),
},
NodeId(5),
NodeId(1),
&mut scheduled_shards,
&mut scheduler,
&mut context,
);

for shard in &scheduled_shards {
assert_eq!(shard.optimize_attachment(&nodes, &context), None);
}

for mut shard in scheduled_shards {
shard.intent.clear(&mut scheduler);
}
}
}
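
For reference, the secondary_nodes map and push_secondary() introduced above are what the new secondary_shards_in_context score component reads: a node that already holds one of the tenant's secondaries should rank behind an otherwise identical node when the next secondary is placed. A minimal illustration of the bookkeeping, assumed rather than taken from this commit:

// Illustrative only: the counts that NodeSecondarySchedulingScore consumes.
let mut context = ScheduleContext::default();
context.push_secondary(NodeId(2)); // the tenant already has a secondary on node 2

// The score reads context.secondary_nodes.get(node_id), so node 2 now carries
// secondary_shards_in_context = 1 while node 3 carries 0; assuming lower scores
// are preferred, as elsewhere in this scheduler, node 3 wins all else being equal.
assert_eq!(context.secondary_nodes.get(&NodeId(2)).copied().unwrap_or(0), 1);
assert_eq!(context.secondary_nodes.get(&NodeId(3)).copied().unwrap_or(0), 0);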
5 changes: 1 addition & 4 deletions storage_controller/src/service.rs
@@ -5858,10 +5858,7 @@ impl Service {

// Accumulate the schedule context for all the shards in a tenant: we must have
// the total view of all shards before we can try to optimize any of them.
schedule_context.avoid(&shard.intent.all_pageservers());
if let Some(attached) = shard.intent.get_attached() {
schedule_context.push_attached(*attached);
}
shard.populate_context(&mut schedule_context);
tenant_shards.push(shard);

// Once we have seen the last shard in the tenant, proceed to search across all shards
35 changes: 19 additions & 16 deletions storage_controller/src/tenant_shard.rs
@@ -566,10 +566,7 @@ impl TenantShard {
) -> Result<(), ScheduleError> {
let r = self.do_schedule(scheduler, context);

context.avoid(&self.intent.all_pageservers());
if let Some(attached) = self.intent.get_attached() {
context.push_attached(*attached);
}
self.populate_context(context);

r
}
@@ -676,6 +673,19 @@ impl TenantShard {
Ok(())
}

/// When building the ScheduleContext of a tenant, call this on each shard to
/// add its contribution to the context.
pub(crate) fn populate_context(&self, context: &mut ScheduleContext) {
context.avoid(&self.intent.all_pageservers());

if let Some(attached) = self.intent.get_attached() {
context.push_attached(*attached);
}
for secondary in self.intent.get_secondary() {
context.push_secondary(*secondary);
}
}

/// Reschedule this tenant shard to one of its secondary locations. Returns a scheduling error
/// if the swap is not possible and leaves the intent state in its original state.
///
@@ -1632,10 +1642,8 @@ pub(crate) mod tests {
shard_b.intent.push_secondary(&mut scheduler, NodeId(3));

let mut schedule_context = ScheduleContext::default();
schedule_context.avoid(&shard_a.intent.all_pageservers());
schedule_context.push_attached(shard_a.intent.get_attached().unwrap());
schedule_context.avoid(&shard_b.intent.all_pageservers());
schedule_context.push_attached(shard_b.intent.get_attached().unwrap());
shard_a.populate_context(&mut schedule_context);
shard_b.populate_context(&mut schedule_context);

let optimization_a = shard_a.optimize_attachment(&nodes, &schedule_context);

@@ -1699,10 +1707,8 @@ pub(crate) mod tests {
shard_b.intent.push_secondary(&mut scheduler, NodeId(3));

let mut schedule_context = ScheduleContext::default();
schedule_context.avoid(&shard_a.intent.all_pageservers());
schedule_context.push_attached(shard_a.intent.get_attached().unwrap());
schedule_context.avoid(&shard_b.intent.all_pageservers());
schedule_context.push_attached(shard_b.intent.get_attached().unwrap());
shard_a.populate_context(&mut schedule_context);
shard_b.populate_context(&mut schedule_context);

let optimization_a = shard_a.optimize_secondary(&mut scheduler, &schedule_context);

@@ -1744,10 +1750,7 @@ pub(crate) mod tests {
let mut any_changed = false;

for shard in shards.iter() {
schedule_context.avoid(&shard.intent.all_pageservers());
if let Some(attached) = shard.intent.get_attached() {
schedule_context.push_attached(*attached);
}
shard.populate_context(&mut schedule_context);
}

for shard in shards.iter_mut() {
