Skip to content

Commit

Permalink
report to prometheus
Browse files Browse the repository at this point in the history
Signed-off-by: Alex Chi Z <[email protected]>
  • Loading branch information
skyzh committed May 7, 2024
1 parent 7b6636b commit 87dbd04
Show file tree
Hide file tree
Showing 5 changed files with 41 additions and 16 deletions.
18 changes: 12 additions & 6 deletions pageserver/src/aux_file.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
use std::sync::{
atomic::{AtomicIsize, AtomicUsize},
Arc,
};
use std::sync::Arc;

use ::metrics::IntGauge;
use bytes::{Buf, BufMut, Bytes};
use pageserver_api::key::{Key, AUX_KEY_PREFIX, METADATA_KEY_SIZE};
use tracing::warn;
Expand Down Expand Up @@ -147,43 +145,51 @@ pub fn encode_file_value(files: &[(&str, &[u8])]) -> anyhow::Result<Vec<u8>> {

/// An estimation of the size of aux files.
pub struct AuxFileSizeEstimator {
aux_file_size_gauge: IntGauge,
size: Arc<std::sync::Mutex<Option<isize>>>,
}

impl AuxFileSizeEstimator {
pub fn new() -> Self {
pub fn new(aux_file_size_gauge: IntGauge) -> Self {
Self {
aux_file_size_gauge,
size: Arc::new(std::sync::Mutex::new(None)),
}
}

pub fn on_base_backup(&self, new_size: usize) {
let mut guard = self.size.lock().unwrap();
*guard = Some(new_size as isize);
self.report(new_size as isize);
}

pub fn on_add(&self, file_size: usize) {
let mut guard = self.size.lock().unwrap();
if let Some(size) = &mut *guard {
*size += file_size as isize;
self.report(*size);
}
}

pub fn on_remove(&self, file_size: usize) {
let mut guard = self.size.lock().unwrap();
if let Some(size) = &mut *guard {
*size -= file_size as isize;
self.report(*size);
}
}

pub fn on_update(&self, old_size: usize, new_size: usize) {
let mut guard = self.size.lock().unwrap();
if let Some(size) = &mut *guard {
*size += new_size as isize - old_size as isize;
self.report(*size);
}
}

pub fn report(&self) {}
pub fn report(&self, size: isize) {
self.aux_file_size_gauge.set(size as i64);
}
}

#[cfg(test)]
Expand Down
15 changes: 15 additions & 0 deletions pageserver/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -585,6 +585,15 @@ static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
.expect("failed to define current logical size metric")
});

static AUX_FILE_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
"pageserver_aux_file_estimated_size",
"The size of all aux files for a timeline in aux file v2 store.",
&["tenant_id", "shard_id", "timeline_id"]
)
.expect("failed to define a metric")
});

pub(crate) mod initial_logical_size {
use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
use once_cell::sync::Lazy;
Expand Down Expand Up @@ -2115,6 +2124,7 @@ pub(crate) struct TimelineMetrics {
resident_physical_size_gauge: UIntGauge,
/// copy of LayeredTimeline.current_logical_size
pub current_logical_size_gauge: UIntGauge,
pub aux_file_size_gauge: IntGauge,
pub directory_entries_count_gauge: Lazy<UIntGauge, Box<dyn Send + Fn() -> UIntGauge>>,
pub evictions: IntCounter,
pub evictions_with_low_residence_duration: std::sync::RwLock<EvictionsWithLowResidenceDuration>,
Expand Down Expand Up @@ -2187,6 +2197,9 @@ impl TimelineMetrics {
let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
let aux_file_size_gauge = AUX_FILE_SIZE
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
// TODO use impl Trait syntax here once we have ability to use it: https://github.com/rust-lang/rust/issues/63065
let directory_entries_count_gauge_closure = {
let tenant_shard_id = *tenant_shard_id;
Expand Down Expand Up @@ -2224,6 +2237,7 @@ impl TimelineMetrics {
last_record_gauge,
resident_physical_size_gauge,
current_logical_size_gauge,
aux_file_size_gauge,
directory_entries_count_gauge,
evictions,
evictions_with_low_residence_duration: std::sync::RwLock::new(
Expand Down Expand Up @@ -2264,6 +2278,7 @@ impl TimelineMetrics {
let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]);
}
let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);

self.evictions_with_low_residence_duration
.write()
Expand Down
2 changes: 1 addition & 1 deletion pageserver/src/pgdatadir_mapping.rs
Original file line number Diff line number Diff line change
Expand Up @@ -704,9 +704,9 @@ impl Timeline {
let v = v.context("get value")?;
let v = aux_file::decode_file_value_bytes(&v).context("value decode")?;
for (fname, content) in v {
result.insert(fname, content);
sz += fname.len();
sz += content.len();
result.insert(fname, content);
}
}
self.aux_file_size_estimator.on_base_backup(sz);
Expand Down
21 changes: 12 additions & 9 deletions pageserver/src/tenant/timeline.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2160,6 +2160,16 @@ impl Timeline {
};

Arc::new_cyclic(|myself| {
let metrics = TimelineMetrics::new(
&tenant_shard_id,
&timeline_id,
crate::metrics::EvictionsWithLowResidenceDurationBuilder::new(
"mtime",
evictions_low_residence_duration_metric_threshold,
),
);
let aux_file_metrics = metrics.aux_file_size_gauge.clone();

let mut result = Timeline {
conf,
tenant_conf,
Expand Down Expand Up @@ -2191,14 +2201,7 @@ impl Timeline {
ancestor_timeline: ancestor,
ancestor_lsn: metadata.ancestor_lsn(),

metrics: TimelineMetrics::new(
&tenant_shard_id,
&timeline_id,
crate::metrics::EvictionsWithLowResidenceDurationBuilder::new(
"mtime",
evictions_low_residence_duration_metric_threshold,
),
),
metrics,

query_metrics: crate::metrics::SmgrQueryTimePerTimeline::new(
&tenant_shard_id,
Expand Down Expand Up @@ -2263,7 +2266,7 @@ impl Timeline {
n_deltas: 0,
}),

aux_file_size_estimator: AuxFileSizeEstimator::new(),
aux_file_size_estimator: AuxFileSizeEstimator::new(aux_file_metrics),
};
result.repartition_threshold =
result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
Expand Down
1 change: 1 addition & 0 deletions test_runner/fixtures/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,7 @@ def histogram(prefix_without_trailing_underscore: str) -> List[str]:
"pageserver_storage_operations_seconds_sum_total",
"pageserver_evictions_total",
"pageserver_evictions_with_low_residence_duration_total",
"pageserver_aux_file_estimated_size",
*PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
# "pageserver_directory_entries_count", -- only used if above a certain threshold
# "pageserver_broken_tenants_count" -- used only for broken
Expand Down

0 comments on commit 87dbd04

Please sign in to comment.