storcon: rework preferred secondary selection
We prefer secondaries outside our preferred AZ, as
these are not the temporary secondaries used during
optimization, which are likely to have cold caches.

Move it into TenantShard because its behavior is closely
aligned with TenantShard's logic in optimize_attachment
jcsp committed Dec 19, 2024
1 parent 25ae0ad commit 3ee0050
Showing 3 changed files with 62 additions and 32 deletions.
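As a rough, self-contained sketch of the selection rule the commit message describes (using hypothetical plain types for illustration, not the storage controller's actual Scheduler/TenantShard API), the preference might be modelled like this:

use std::collections::HashMap;

// Hypothetical stand-ins for illustration only; the real change reads this state
// from the storage controller's Scheduler and TenantShard types.
type NodeId = u64;
type Az = String;

struct NodeInfo {
    az: Az,
    schedulable: bool,
}

/// Pick a secondary to promote to attached: drop unschedulable nodes, then prefer
/// nodes *outside* the preferred AZ, since long-lived secondaries (with warm caches)
/// live there, while secondaries in the preferred AZ are usually short-lived
/// migration targets that are not warmed up yet.
fn pick_promotion_target(
    secondaries: &[NodeId],
    nodes: &HashMap<NodeId, NodeInfo>,
    preferred_az: Option<&Az>,
) -> Option<NodeId> {
    let mut candidates: Vec<(u8, NodeId)> = secondaries
        .iter()
        .filter_map(|id| {
            let info = nodes.get(id)?;
            if !info.schedulable {
                return None;
            }
            let in_preferred_az = preferred_az == Some(&info.az);
            Some((if in_preferred_az { 1 } else { 0 }, *id))
        })
        .collect();
    // Tuples sort lexicographically: AZ priority first, NodeId as a deterministic tie-break.
    candidates.sort();
    candidates.first().map(|c| c.1)
}

fn main() {
    let nodes = HashMap::from([
        (1, NodeInfo { az: "az-a".to_string(), schedulable: true }),
        (2, NodeInfo { az: "az-b".to_string(), schedulable: true }),
    ]);
    let preferred = "az-a".to_string();
    // Node 1 is in the preferred AZ, so the long-lived secondary on node 2 wins.
    assert_eq!(pick_promotion_target(&[1, 2], &nodes, Some(&preferred)), Some(2));
}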
2 changes: 1 addition & 1 deletion storage_controller/src/drain_utils.rs
@@ -112,7 +112,7 @@ impl TenantShardDrain {
}
}

match scheduler.node_preferred(tenant_shard.intent.get_secondary()) {
match tenant_shard.preferred_secondary(scheduler) {
Some(node) => Some(node),
None => {
tracing::warn!(
51 changes: 22 additions & 29 deletions storage_controller/src/scheduler.rs
@@ -636,35 +636,6 @@ impl Scheduler {
}
}

/// Where we have several nodes to choose from, for example when picking a secondary location
/// to promote to an attached location, this method may be used to pick the best choice based
/// on the scheduler's knowledge of utilization and availability.
///
/// If the input is empty, or all the nodes are not eligible for scheduling, return None: the
/// caller can pick a node some other way.
pub(crate) fn node_preferred(&self, nodes: &[NodeId]) -> Option<NodeId> {
if nodes.is_empty() {
return None;
}

// TODO: When the utilization score returned by the pageserver becomes meaningful,
// schedule based on that instead of the shard count.
let node = nodes
.iter()
.map(|node_id| {
let may_schedule = self
.nodes
.get(node_id)
.map(|n| !matches!(n.may_schedule, MaySchedule::No))
.unwrap_or(false);
(*node_id, may_schedule)
})
.max_by_key(|(_n, may_schedule)| *may_schedule);

// If even the preferred node has may_schedule==false, return None
node.and_then(|(node_id, may_schedule)| if may_schedule { Some(node_id) } else { None })
}

/// Calculate a single node's score, used in optimizer logic to compare specific
/// nodes' scores.
pub(crate) fn compute_node_score<Score>(
Expand Down Expand Up @@ -835,6 +806,28 @@ impl Scheduler {
self.nodes.get(node_id).map(|n| n.az.clone())
}

/// For use when choosing a preferred secondary location: filter out nodes that are not
/// available, and gather their AZs.
pub(crate) fn filter_usable_nodes(
&self,
nodes: &[NodeId],
) -> Vec<(NodeId, Option<AvailabilityZone>)> {
nodes
.iter()
.filter_map(|node_id| {
let node = self
.nodes
.get(node_id)
.expect("Referenced nodes always exist");
if matches!(node.may_schedule, MaySchedule::Yes(_)) {
Some((*node_id, Some(node.az.clone())))
} else {
None
}
})
.collect()
}

/// Unit test access to internal state
#[cfg(test)]
pub(crate) fn get_node_shard_count(&self, node_id: NodeId) -> usize {
41 changes: 39 additions & 2 deletions storage_controller/src/tenant_shard.rs
@@ -568,7 +568,7 @@ impl TenantShard {
return Ok((false, node_id));
}

if let Some(promote_secondary) = scheduler.node_preferred(&self.intent.secondary) {
if let Some(promote_secondary) = self.preferred_secondary(scheduler) {
// Promote a secondary
tracing::debug!("Promoted secondary {} to attached", promote_secondary);
self.intent.promote_attached(scheduler, promote_secondary);
Expand Down Expand Up @@ -703,7 +703,7 @@ impl TenantShard {
) -> Result<(), ScheduleError> {
let promote_to = match promote_to {
Some(node) => node,
None => match scheduler.node_preferred(self.intent.get_secondary()) {
None => match self.preferred_secondary(scheduler) {
Some(node) => node,
None => {
return Err(ScheduleError::ImpossibleConstraint);
Expand Down Expand Up @@ -1088,6 +1088,43 @@ impl TenantShard {
true
}

/// When a shard has several secondary locations, we need to pick one in situations where
/// we promote one of them to an attached location:
/// - When draining a node for restart
/// - When responding to a node failure
///
/// In this context, 'preferred' does not mean the node with the best scheduling score: instead
/// we want to pick the node which is best for use _temporarily_ while the previous attached location
/// is unavailable (e.g. because it's down or deploying). That means we prefer to use secondary
/// locations in a non-preferred AZ, as they're more likely to have a warm cache than a temporary
/// secondary in the preferred AZ (such secondaries are usually only created for migrations, and if
/// they exist they're probably not warmed up yet).
///
/// If the input is empty, or none of the nodes are eligible for scheduling, return None: the
/// caller needs to pick a node some other way.
pub(crate) fn preferred_secondary(&self, scheduler: &Scheduler) -> Option<NodeId> {
let candidates = scheduler.filter_usable_nodes(&self.intent.secondary);

// We will sort candidates to prefer nodes which are _not_ in our preferred AZ, i.e. we prefer
// to migrate to a long-lived secondary location (which would have been scheduled in a non-preferred AZ),
// rather than a short-lived secondary location being used for optimization/migration (which would have
// been scheduled in our preferred AZ).
let mut candidates = candidates
.iter()
.map(|(node_id, node_az)| {
if node_az == &self.preferred_az_id {
(1, *node_id)
} else {
(0, *node_id)
}
})
.collect::<Vec<_>>();

candidates.sort();

candidates.first().map(|i| i.1)
}

/// Query whether the tenant's observed state for attached node matches its intent state, and if so,
/// yield the node ID. This is appropriate for emitting compute hook notifications: we are checking that
/// the node in question is not only where we intend to attach, but that the tenant is indeed already attached there.
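One small note on the sort key in `preferred_secondary`: Rust tuples compare lexicographically, so the 0/1 AZ component dominates and the `NodeId` acts as a deterministic tie-break. A tiny standalone demonstration (illustrative values only):

fn main() {
    // (az_priority, node_id): 0 = secondary outside the preferred AZ, 1 = inside it.
    let mut candidates = vec![(1u8, 3u64), (0u8, 7u64), (0u8, 2u64)];
    candidates.sort();
    // Non-preferred-AZ secondaries sort first; the lowest NodeId breaks the tie.
    assert_eq!(candidates, vec![(0, 2), (0, 7), (1, 3)]);
    assert_eq!(candidates.first().map(|c| c.1), Some(2));
}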
