Skip to content

Commit

Permalink
metrics for resident & remote physical size without tenant/timeline d…
Browse files Browse the repository at this point in the history
…imension (#5389)

So that we can compute worst-case /storage size dashboard panel more
cheaply.
  • Loading branch information
problame authored Sep 27, 2023
1 parent 2cced77 commit 090a644
Show file tree
Hide file tree
Showing 4 changed files with 94 additions and 22 deletions.
97 changes: 87 additions & 10 deletions pageserver/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,14 @@ static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
.expect("failed to define a metric")
});

pub(crate) static RESIDENT_PHYSICAL_SIZE_GLOBAL: Lazy<UIntGauge> = Lazy::new(|| {
register_uint_gauge!(
"pageserver_resident_physical_size_global",
"Like `pageserver_resident_physical_size`, but without tenant/timeline dimensions."
)
.expect("failed to define a metric")
});

static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_remote_physical_size",
Expand All @@ -301,6 +309,14 @@ static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
.expect("failed to define a metric")
});

static REMOTE_PHYSICAL_SIZE_GLOBAL: Lazy<UIntGauge> = Lazy::new(|| {
register_uint_gauge!(
"pageserver_remote_physical_size_global",
"Like `pageserver_remote_physical_size`, but without tenant/timeline dimensions."
)
.expect("failed to define a metric")
});

pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_remote_ondemand_downloaded_layers_total",
Expand Down Expand Up @@ -1209,7 +1225,7 @@ pub struct TimelineMetrics {
pub load_layer_map_histo: StorageTimeMetrics,
pub garbage_collect_histo: StorageTimeMetrics,
pub last_record_gauge: IntGauge,
pub resident_physical_size_gauge: UIntGauge,
resident_physical_size_gauge: UIntGauge,
/// copy of LayeredTimeline.current_logical_size
pub current_logical_size_gauge: UIntGauge,
pub num_persistent_files_created: IntCounter,
Expand Down Expand Up @@ -1287,18 +1303,40 @@ impl TimelineMetrics {
}

pub fn record_new_file_metrics(&self, sz: u64) {
self.resident_physical_size_gauge.add(sz);
self.resident_physical_size_add(sz);
self.num_persistent_files_created.inc_by(1);
self.persistent_bytes_written.inc_by(sz);
}

pub fn resident_physical_size_sub(&self, sz: u64) {
self.resident_physical_size_gauge.sub(sz);
crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(sz);
}

pub fn resident_physical_size_add(&self, sz: u64) {
self.resident_physical_size_gauge.add(sz);
crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.add(sz);
}

pub fn resident_physical_size_set(&self, sz: u64) {
self.resident_physical_size_gauge.set(sz);
crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.set(sz);
}

pub fn resident_physical_size_get(&self) -> u64 {
self.resident_physical_size_gauge.get()
}
}

impl Drop for TimelineMetrics {
fn drop(&mut self) {
let tenant_id = &self.tenant_id;
let timeline_id = &self.timeline_id;
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
{
RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
}
let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]);
let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]);
Expand Down Expand Up @@ -1352,10 +1390,43 @@ use std::time::{Duration, Instant};
use crate::context::{PageContentKind, RequestContext};
use crate::task_mgr::TaskKind;

/// Maintain a per timeline gauge in addition to the global gauge.
struct PerTimelineRemotePhysicalSizeGauge {
last_set: u64,
gauge: UIntGauge,
}

impl PerTimelineRemotePhysicalSizeGauge {
fn new(per_timeline_gauge: UIntGauge) -> Self {
Self {
last_set: per_timeline_gauge.get(),
gauge: per_timeline_gauge,
}
}
fn set(&mut self, sz: u64) {
self.gauge.set(sz);
if sz < self.last_set {
REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set - sz);
} else {
REMOTE_PHYSICAL_SIZE_GLOBAL.add(sz - self.last_set);
};
self.last_set = sz;
}
fn get(&self) -> u64 {
self.gauge.get()
}
}

impl Drop for PerTimelineRemotePhysicalSizeGauge {
fn drop(&mut self) {
REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set);
}
}

pub struct RemoteTimelineClientMetrics {
tenant_id: String,
timeline_id: String,
remote_physical_size_gauge: Mutex<Option<UIntGauge>>,
remote_physical_size_gauge: Mutex<Option<PerTimelineRemotePhysicalSizeGauge>>,
calls_unfinished_gauge: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
bytes_started_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
bytes_finished_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
Expand All @@ -1373,18 +1444,24 @@ impl RemoteTimelineClientMetrics {
}
}

pub fn remote_physical_size_gauge(&self) -> UIntGauge {
pub(crate) fn remote_physical_size_set(&self, sz: u64) {
let mut guard = self.remote_physical_size_gauge.lock().unwrap();
guard
.get_or_insert_with(|| {
let gauge = guard.get_or_insert_with(|| {
PerTimelineRemotePhysicalSizeGauge::new(
REMOTE_PHYSICAL_SIZE
.get_metric_with_label_values(&[
&self.tenant_id.to_string(),
&self.timeline_id.to_string(),
])
.unwrap()
})
.clone()
.unwrap(),
)
});
gauge.set(sz);
}

pub(crate) fn remote_physical_size_get(&self) -> u64 {
let guard = self.remote_physical_size_gauge.lock().unwrap();
guard.as_ref().map(|gauge| gauge.get()).unwrap_or(0)
}

pub fn remote_operation_time(
Expand Down
4 changes: 2 additions & 2 deletions pageserver/src/tenant/remote_timeline_client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -453,11 +453,11 @@ impl RemoteTimelineClient {
} else {
0
};
self.metrics.remote_physical_size_gauge().set(size);
self.metrics.remote_physical_size_set(size);
}

pub fn get_remote_physical_size(&self) -> u64 {
self.metrics.remote_physical_size_gauge().get()
self.metrics.remote_physical_size_get()
}

//
Expand Down
13 changes: 4 additions & 9 deletions pageserver/src/tenant/timeline.rs
Original file line number Diff line number Diff line change
Expand Up @@ -559,7 +559,7 @@ impl Timeline {
}

pub fn resident_physical_size(&self) -> u64 {
self.metrics.resident_physical_size_gauge.get()
self.metrics.resident_physical_size_get()
}

///
Expand Down Expand Up @@ -1309,10 +1309,7 @@ impl Timeline {
// will treat the file as a local layer again, count it towards resident size,
// and it'll be like the layer removal never happened.
// The bump in resident size is perhaps unexpected but overall a robust behavior.
self.metrics
.resident_physical_size_gauge
.sub(layer_file_size);

self.metrics.resident_physical_size_sub(layer_file_size);
self.metrics.evictions.inc();

if let Some(delta) = local_layer_residence_duration {
Expand Down Expand Up @@ -1846,9 +1843,7 @@ impl Timeline {
"loaded layer map with {} layers at {}, total physical size: {}",
num_layers, disk_consistent_lsn, total_physical_size
);
self.metrics
.resident_physical_size_gauge
.set(total_physical_size);
self.metrics.resident_physical_size_set(total_physical_size);

timer.stop_and_record();
Ok(())
Expand Down Expand Up @@ -4398,7 +4393,7 @@ impl Timeline {

// XXX the temp file is still around in Err() case
// and consumes space until we clean up upon pageserver restart.
self_clone.metrics.resident_physical_size_gauge.add(*size);
self_clone.metrics.resident_physical_size_add(*size);

// Download complete. Replace the RemoteLayer with the corresponding
// Delta- or ImageLayer in the layer map.
Expand Down
2 changes: 1 addition & 1 deletion pageserver/src/tenant/timeline/layer_manager.rs
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,7 @@ impl LayerManager {
let desc = layer.layer_desc();
if !layer.is_remote_layer() {
layer.delete_resident_layer_file()?;
metrics.resident_physical_size_gauge.sub(desc.file_size);
metrics.resident_physical_size_sub(desc.file_size);
}

// TODO Removing from the bottom of the layer map is expensive.
Expand Down

1 comment on commit 090a644

@github-actions
Copy link

@github-actions github-actions bot commented on 090a644 Sep 27, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

2588 tests run: 2466 passed, 0 failed, 122 skipped (full report)


Flaky tests (5)

Postgres 16

  • test_pageserver_recovery: debug
  • test_tenant_detach_smoke: debug
  • test_get_tenant_size_with_multiple_branches: debug
  • test_delete_timeline_exercise_crash_safety_failpoints[Check.RETRY_WITHOUT_RESTART-mock_s3-timeline-delete-before-index-delete]: debug

Postgres 14

  • test_get_tenant_size_with_multiple_branches: debug

Code coverage (full report)

  • functions: 53.0% (8042 of 15164 functions)
  • lines: 81.3% (47096 of 57957 lines)

The comment gets automatically updated with the latest test results
090a644 at 2023-09-27T15:16:25.635Z :recycle:

Please sign in to comment.