storcon: track preferred AZ for each tenant shard (#8937)
## Problem
We want to do AZ aware scheduling, but don't have enough metadata.

## Summary of changes
Introduce a `preferred_az_id` concept for each managed tenant shard.

In a future PR, the scheduler will use this as a soft preference: the idea is to keep shard attachments within the preferred AZ. Under the assumption that the compute was placed in the same AZ, this reduces the chances of cross-AZ traffic between compute and pageserver.

In terms of code changes we:
1. Add a new nullable `preferred_az_id` column to the `tenant_shards`
table. Also include an in-memory counterpart.
2. Populate the preferred AZ on tenant creation and shard splits.
3. Add an endpoint which allows bulk-setting preferred AZs.

(3) gives us the migration path. I'll write a script which queries the cplane db in the region and sets the preferred AZ of all shards with an active compute to the AZ of said compute. For shards without an active compute, I'll use the AZ of the currently attached pageserver, since this is what cplane currently uses to schedule computes.
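
For illustration, a request to the new bulk-set endpoint from (3) (a `PUT` to `/control/v1/preferred_azs`, admin scope) might look as follows. The body is a bare map because the request type flattens a `HashMap<TenantShardId, String>`; the shard IDs and AZ names below are made up, and the string form of the keys is assumed to be the usual `<tenant_id>-<shard_number><shard_count>` rendering of `TenantShardId`:

```json
{
  "1f359dd625e519a1a4e8d7509690f6fc-0002": "eu-west-1a",
  "1f359dd625e519a1a4e8d7509690f6fc-0102": "eu-west-1b"
}
```

The response lists only the shards whose rows were actually updated, e.g. `{"updated": ["1f359dd625e519a1a4e8d7509690f6fc-0002", "1f359dd625e519a1a4e8d7509690f6fc-0102"]}`.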
VladLazar authored Sep 6, 2024
1 parent a132323 commit e86fef0
Showing 10 changed files with 380 additions and 91 deletions.
15 changes: 14 additions & 1 deletion libs/pageserver_api/src/controller_api.rs
@@ -1,4 +1,4 @@
use std::collections::HashSet;
use std::collections::{HashMap, HashSet};
use std::str::FromStr;
use std::time::{Duration, Instant};

@@ -74,6 +74,17 @@ pub struct TenantPolicyRequest {
pub scheduling: Option<ShardSchedulingPolicy>,
}

#[derive(Serialize, Deserialize)]
pub struct ShardsPreferredAzsRequest {
#[serde(flatten)]
pub preferred_az_ids: HashMap<TenantShardId, String>,
}

#[derive(Serialize, Deserialize)]
pub struct ShardsPreferredAzsResponse {
pub updated: Vec<TenantShardId>,
}

#[derive(Serialize, Deserialize, Debug)]
pub struct TenantLocateResponseShard {
pub shard_id: TenantShardId,
@@ -132,6 +143,8 @@ pub struct TenantDescribeResponseShard {
pub is_splitting: bool,

pub scheduling_policy: ShardSchedulingPolicy,

pub preferred_az_id: Option<String>,
}

/// Explicitly migrating a particular shard is a low level operation
New migration (down), reverting the column:
@@ -0,0 +1 @@
ALTER TABLE tenant_shards DROP preferred_az_id;
New migration (up), adding the column:
@@ -0,0 +1 @@
ALTER TABLE tenant_shards ADD preferred_az_id VARCHAR;
21 changes: 20 additions & 1 deletion storage_controller/src/http.rs
@@ -14,7 +14,7 @@ use metrics::{BuildInfo, NeonMetrics};
use pageserver_api::controller_api::{
MetadataHealthListOutdatedRequest, MetadataHealthListOutdatedResponse,
MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse,
TenantCreateRequest,
ShardsPreferredAzsRequest, TenantCreateRequest,
};
use pageserver_api::models::{
TenantConfigRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
@@ -688,6 +688,18 @@ async fn handle_tenant_update_policy(mut req: Request<Body>) -> Result<Response<
)
}

async fn handle_update_preferred_azs(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;

let azs_req = json_request::<ShardsPreferredAzsRequest>(&mut req).await?;
let state = get_state(&req);

json_response(
StatusCode::OK,
state.service.update_shards_preferred_azs(azs_req).await?,
)
}

async fn handle_step_down(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;

@@ -1174,6 +1186,13 @@ pub fn make_router(
RequestName("control_v1_tenant_policy"),
)
})
.put("/control/v1/preferred_azs", |r| {
named_request_span(
r,
handle_update_preferred_azs,
RequestName("control_v1_preferred_azs"),
)
})
.put("/control/v1/step_down", |r| {
named_request_span(r, handle_step_down, RequestName("control_v1_step_down"))
})
33 changes: 33 additions & 0 deletions storage_controller/src/persistence.rs
@@ -105,6 +105,7 @@ pub(crate) enum DatabaseOperation {
ListMetadataHealthOutdated,
GetLeader,
UpdateLeader,
SetPreferredAzs,
}

#[must_use]
@@ -664,6 +665,33 @@ impl Persistence {
Ok(())
}

pub(crate) async fn set_tenant_shard_preferred_azs(
&self,
preferred_azs: Vec<(TenantShardId, String)>,
) -> DatabaseResult<Vec<(TenantShardId, String)>> {
use crate::schema::tenant_shards::dsl::*;

self.with_measured_conn(DatabaseOperation::SetPreferredAzs, move |conn| {
let mut shards_updated = Vec::default();

for (tenant_shard_id, preferred_az) in preferred_azs.iter() {
let updated = diesel::update(tenant_shards)
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
.filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
.set(preferred_az_id.eq(preferred_az))
.execute(conn)?;

if updated == 1 {
shards_updated.push((*tenant_shard_id, preferred_az.clone()));
}
}

Ok(shards_updated)
})
.await
}

pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
use crate::schema::tenant_shards::dsl::*;
self.with_measured_conn(DatabaseOperation::Detach, move |conn| {
@@ -1050,6 +1078,11 @@ pub(crate) struct TenantShardPersistence {
pub(crate) config: String,
#[serde(default)]
pub(crate) scheduling_policy: String,

// Hint that we should attempt to schedule this tenant shard in the given
// availability zone in order to minimise the chances of cross-AZ communication
// with compute.
pub(crate) preferred_az_id: Option<String>,
}

impl TenantShardPersistence {
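The service-layer method invoked by the HTTP handler (`update_shards_preferred_azs`) lives in `storage_controller/src/service.rs`, whose diff is not shown in this view. Below is a rough sketch of how it might combine the persistence call above with the in-memory counterpart mentioned in the summary of changes; the lock and field names are assumptions, not the actual implementation:

```rust
// Hypothetical sketch: `self.inner`, `locked.tenants`, and a public
// `preferred_az_id` field on the in-memory shard are assumed names.
pub(crate) async fn update_shards_preferred_azs(
    &self,
    req: ShardsPreferredAzsRequest,
) -> Result<ShardsPreferredAzsResponse, ApiError> {
    // Persist first; only rows that matched an existing shard are reported back.
    let updated = self
        .persistence
        .set_tenant_shard_preferred_azs(req.preferred_az_ids.into_iter().collect())
        .await
        .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;

    // Mirror the change into the in-memory shard state so the scheduler can
    // consume it without a database round trip.
    let mut locked = self.inner.write().unwrap();
    for (tenant_shard_id, az_id) in &updated {
        if let Some(shard) = locked.tenants.get_mut(tenant_shard_id) {
            shard.preferred_az_id = Some(az_id.clone());
        }
    }

    Ok(ShardsPreferredAzsResponse {
        updated: updated.into_iter().map(|(id, _)| id).collect(),
    })
}
```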
1 change: 1 addition & 0 deletions storage_controller/src/schema.rs
@@ -41,6 +41,7 @@ diesel::table! {
splitting -> Int2,
config -> Text,
scheduling_policy -> Varchar,
preferred_az_id -> Nullable<Varchar>,
}
}

(Diffs for the remaining changed files are not shown in this view.)

1 comment on commit e86fef0

@github-actions


3924 tests run: 3800 passed, 0 failed, 124 skipped (full report)


Flaky tests (5): grouped under Postgres 16, Postgres 15, and Postgres 14 (test names not shown here).

Code coverage* (full report)

  • functions: 31.9% (7345 of 23027 functions)
  • lines: 50.0% (59472 of 118898 lines)

* collected from Rust tests only


The comment gets automatically updated with the latest test results.
e86fef0 at 2024-09-06T13:59:54.364Z :recycle:
