Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

pageserver: add supplementary branch usage stats #8131

Merged
merged 1 commit into from
Jul 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions libs/pageserver_api/src/models.rs
Original file line number Diff line number Diff line change
Expand Up @@ -661,6 +661,16 @@ pub struct TimelineInfo {
pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
pub current_logical_size_non_incremental: Option<u64>,

/// How many bytes of WAL are within this branch's pitr_interval. If the pitr_interval goes
/// beyond the branch's branch point, we only count up to the branch point.
pub pitr_history_size: u64,

/// Whether this branch's branch point is within its ancestor's PITR interval (i.e. any
/// ancestor data used by this branch would have been retained anyway). If this is false, then
/// this branch may be imposing a cost on the ancestor by causing it to retain layers that it would
/// otherwise be able to GC.
pub within_ancestor_pitr: bool,

pub timeline_dir_layer_file_size_sum: Option<u64>,

pub wal_source_connstr: Option<String>,
Expand Down
4 changes: 4 additions & 0 deletions pageserver/src/http/routes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -406,6 +406,8 @@ async fn build_timeline_info_common(

let walreceiver_status = timeline.walreceiver_status();

let (pitr_history_size, within_ancestor_pitr) = timeline.get_pitr_history_stats();

let info = TimelineInfo {
tenant_id: timeline.tenant_shard_id,
timeline_id: timeline.timeline_id,
Expand All @@ -426,6 +428,8 @@ async fn build_timeline_info_common(
directory_entries_counts: timeline.get_directory_metrics().to_vec(),
current_physical_size,
current_logical_size_non_incremental: None,
pitr_history_size,
within_ancestor_pitr,
timeline_dir_layer_file_size_sum: None,
wal_source_connstr,
last_received_msg_lsn,
Expand Down
35 changes: 35 additions & 0 deletions pageserver/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -464,6 +464,24 @@ static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
.expect("failed to define a metric")
});

static PITR_HISTORY_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_pitr_history_size",
"Data written since PITR cutoff on this timeline",
&["tenant_id", "shard_id", "timeline_id"]
)
.expect("failed to define a metric")
});

static TIMELINE_ARCHIVE_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_archive_size",
"Timeline's logical size if it is considered eligible for archival (outside PITR window), else zero",
&["tenant_id", "shard_id", "timeline_id"]
)
.expect("failed to define a metric")
});

static STANDBY_HORIZON: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
"pageserver_standby_horizon",
Expand Down Expand Up @@ -2102,6 +2120,8 @@ pub(crate) struct TimelineMetrics {
pub garbage_collect_histo: StorageTimeMetrics,
pub find_gc_cutoffs_histo: StorageTimeMetrics,
pub last_record_gauge: IntGauge,
pub pitr_history_size: UIntGauge,
pub archival_size: UIntGauge,
pub standby_horizon_gauge: IntGauge,
pub resident_physical_size_gauge: UIntGauge,
/// copy of LayeredTimeline.current_logical_size
Expand Down Expand Up @@ -2175,6 +2195,15 @@ impl TimelineMetrics {
let last_record_gauge = LAST_RECORD_LSN
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();

let pitr_history_size = PITR_HISTORY_SIZE
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();

let archival_size = TIMELINE_ARCHIVE_SIZE
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();

let standby_horizon_gauge = STANDBY_HORIZON
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
Expand Down Expand Up @@ -2227,6 +2256,8 @@ impl TimelineMetrics {
find_gc_cutoffs_histo,
load_layer_map_histo,
last_record_gauge,
pitr_history_size,
archival_size,
standby_horizon_gauge,
resident_physical_size_gauge,
current_logical_size_gauge,
Expand Down Expand Up @@ -2284,6 +2315,10 @@ impl TimelineMetrics {
if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) {
let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]);
}

let _ = TIMELINE_ARCHIVE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = PITR_HISTORY_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);

let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = VALID_LSN_LEASE_COUNT.remove_label_values(&[tenant_id, shard_id, timeline_id]);
Expand Down
27 changes: 27 additions & 0 deletions pageserver/src/tenant.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2868,6 +2868,7 @@ impl Tenant {
{
let mut target = timeline.gc_info.write().unwrap();

// Cull any expired leases
let now = SystemTime::now();
target.leases.retain(|_, lease| !lease.is_expired(&now));

Expand All @@ -2876,6 +2877,31 @@ impl Tenant {
.valid_lsn_lease_count_gauge
.set(target.leases.len() as u64);

// Look up parent's PITR cutoff to update the child's knowledge of whether it is within parent's PITR
if let Some(ancestor_id) = timeline.get_ancestor_timeline_id() {
if let Some(ancestor_gc_cutoffs) = gc_cutoffs.get(&ancestor_id) {
target.within_ancestor_pitr =
timeline.get_ancestor_lsn() >= ancestor_gc_cutoffs.pitr;
}
}

// Update metrics that depend on GC state
timeline
.metrics
.archival_size
.set(if target.within_ancestor_pitr {
timeline.metrics.current_logical_size_gauge.get()
} else {
0
});
timeline.metrics.pitr_history_size.set(
timeline
.get_last_record_lsn()
.checked_sub(target.cutoffs.pitr)
jcsp marked this conversation as resolved.
Show resolved Hide resolved
.unwrap_or(Lsn(0))
.0,
);

match gc_cutoffs.remove(&timeline.timeline_id) {
Some(cutoffs) => {
target.retain_lsns = branchpoints;
Expand Down Expand Up @@ -7046,6 +7072,7 @@ mod tests {
horizon: Lsn(0x30),
},
leases: Default::default(),
within_ancestor_pitr: false,
};
}

Expand Down
15 changes: 15 additions & 0 deletions pageserver/src/tenant/timeline.rs
Original file line number Diff line number Diff line change
Expand Up @@ -463,6 +463,9 @@ pub(crate) struct GcInfo {

/// Leases granted to particular LSNs.
pub(crate) leases: BTreeMap<Lsn, LsnLease>,

/// Whether our branch point is within our ancestor's PITR interval (for cost estimation)
pub(crate) within_ancestor_pitr: bool,
}

impl GcInfo {
Expand Down Expand Up @@ -851,6 +854,18 @@ impl Timeline {
.map(|ancestor| ancestor.timeline_id)
}

/// Get the bytes written since the PITR cutoff on this branch, and
/// whether this branch's ancestor_lsn is within its parent's PITR.
pub(crate) fn get_pitr_history_stats(&self) -> (u64, bool) {
let gc_info = self.gc_info.read().unwrap();
let history = self
.get_last_record_lsn()
.checked_sub(gc_info.cutoffs.pitr)
.unwrap_or(Lsn(0))
.0;
(history, gc_info.within_ancestor_pitr)
}

/// Lock and get timeline's GC cutoff
pub(crate) fn get_latest_gc_cutoff_lsn(&self) -> RcuReadGuard<Lsn> {
self.latest_gc_cutoff_lsn.read()
Expand Down
2 changes: 2 additions & 0 deletions test_runner/fixtures/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,8 @@ def histogram(prefix_without_trailing_underscore: str) -> List[str]:
"pageserver_smgr_query_seconds_bucket",
"pageserver_smgr_query_seconds_count",
"pageserver_smgr_query_seconds_sum",
"pageserver_archive_size",
"pageserver_pitr_history_size",
"pageserver_storage_operations_seconds_count_total",
"pageserver_storage_operations_seconds_sum_total",
"pageserver_evictions_total",
Expand Down
Loading