diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs index 92979d609e49..fa6ee90b6627 100644 --- a/storage_scrubber/src/main.rs +++ b/storage_scrubber/src/main.rs @@ -86,6 +86,8 @@ enum Command { /// For safekeeper node_kind only, json list of timelines and their lsn info #[arg(long, default_value = None)] timeline_lsns: Option<String>, + #[arg(long, default_value_t = false)] + verbose: bool, }, TenantSnapshot { #[arg(long = "tenant-id")] @@ -166,6 +168,7 @@ async fn main() -> anyhow::Result<()> { dump_db_connstr, dump_db_table, timeline_lsns, + verbose, } => { if let NodeKind::Safekeeper = node_kind { let db_or_list = match (timeline_lsns, dump_db_connstr) { @@ -203,6 +206,7 @@ async fn main() -> anyhow::Result<()> { tenant_ids, json, post_to_storcon, + verbose, cli.exit_code, ) .await @@ -313,6 +317,7 @@ pub async fn run_cron_job( Vec::new(), true, post_to_storcon, + false, // default to non-verbose mode exit_code, ) .await?; @@ -362,12 +367,13 @@ pub async fn scan_pageserver_metadata_cmd( tenant_shard_ids: Vec<TenantShardId>, json: bool, post_to_storcon: bool, + verbose: bool, exit_code: bool, ) -> anyhow::Result<()> { if controller_client.is_none() && post_to_storcon { return Err(anyhow!("Posting pageserver scan health status to storage controller requires `--controller-api` and `--controller-jwt` to run")); } - match scan_pageserver_metadata(bucket_config.clone(), tenant_shard_ids).await { + match scan_pageserver_metadata(bucket_config.clone(), tenant_shard_ids, verbose).await { Err(e) => { tracing::error!("Failed: {e}"); Err(e) diff --git a/storage_scrubber/src/scan_pageserver_metadata.rs b/storage_scrubber/src/scan_pageserver_metadata.rs index cb3299d413b1..c8de6e46b3de 100644 --- a/storage_scrubber/src/scan_pageserver_metadata.rs +++ b/storage_scrubber/src/scan_pageserver_metadata.rs @@ -21,8 +21,12 @@ pub struct MetadataSummary { tenant_count: usize, timeline_count: usize, timeline_shard_count: usize, - with_errors: HashSet<TenantShardTimelineId>, - with_warnings: HashSet<TenantShardTimelineId>, + /// 
Tenant-shard timeline (key) mapping to errors. The key has to be a string because it will be serialized to a JSON. + /// The key is generated using `TenantShardTimelineId::to_string()`. + with_errors: HashMap<String, Vec<String>>, + /// Tenant-shard timeline (key) mapping to warnings. The key has to be a string because it will be serialized to a JSON. + /// The key is generated using `TenantShardTimelineId::to_string()`. + with_warnings: HashMap<String, Vec<String>>, with_orphans: HashSet<TenantShardTimelineId>, indices_by_version: HashMap<usize, usize>, @@ -52,7 +56,12 @@ impl MetadataSummary { } } - fn update_analysis(&mut self, id: &TenantShardTimelineId, analysis: &TimelineAnalysis) { + fn update_analysis( + &mut self, + id: &TenantShardTimelineId, + analysis: &TimelineAnalysis, + verbose: bool, + ) { if analysis.is_healthy() { self.healthy_tenant_shards.insert(id.tenant_shard_id); } else { @@ -61,11 +70,17 @@ impl MetadataSummary { } if !analysis.errors.is_empty() { - self.with_errors.insert(*id); + let entry = self.with_errors.entry(id.to_string()).or_default(); + if verbose { + entry.extend(analysis.errors.iter().cloned()); + } } if !analysis.warnings.is_empty() { - self.with_warnings.insert(*id); + let entry = self.with_warnings.entry(id.to_string()).or_default(); + if verbose { + entry.extend(analysis.warnings.iter().cloned()); + } } } @@ -120,6 +135,7 @@ Index versions: {version_summary} pub async fn scan_pageserver_metadata( bucket_config: BucketConfig, tenant_ids: Vec<TenantShardId>, + verbose: bool, ) -> anyhow::Result<MetadataSummary> { let (remote_client, target) = init_remote(bucket_config, NodeKind::Pageserver).await?; @@ -164,6 +180,7 @@ pub async fn scan_pageserver_metadata( mut tenant_objects: TenantObjectListing, timelines: Vec<(TenantShardTimelineId, RemoteTimelineBlobData)>, highest_shard_count: ShardCount, + verbose: bool, ) { summary.tenant_count += 1; @@ -203,7 +220,7 @@ pub async fn scan_pageserver_metadata( Some(data), ) .await; - summary.update_analysis(&ttid, &analysis); + summary.update_analysis(&ttid, &analysis, verbose); 
timeline_ids.insert(ttid.timeline_id); } else { @@ -271,10 +288,6 @@ pub async fn scan_pageserver_metadata( summary.update_data(&data); match tenant_id { - None => { - tenant_id = Some(ttid.tenant_shard_id.tenant_id); - highest_shard_count = highest_shard_count.max(ttid.tenant_shard_id.shard_count); - } Some(prev_tenant_id) => { if prev_tenant_id != ttid.tenant_shard_id.tenant_id { // New tenant: analyze this tenant's timelines, clear accumulated tenant_timeline_results @@ -287,6 +300,7 @@ pub async fn scan_pageserver_metadata( tenant_objects, timelines, highest_shard_count, + verbose, ) .instrument(info_span!("analyze-tenant", tenant = %prev_tenant_id)) .await; @@ -296,6 +310,10 @@ pub async fn scan_pageserver_metadata( highest_shard_count = highest_shard_count.max(ttid.tenant_shard_id.shard_count); } } + None => { + tenant_id = Some(ttid.tenant_shard_id.tenant_id); + highest_shard_count = highest_shard_count.max(ttid.tenant_shard_id.shard_count); + } } match &data.blob_data { @@ -326,6 +344,7 @@ pub async fn scan_pageserver_metadata( tenant_objects, tenant_timeline_results, highest_shard_count, + verbose, ) .instrument(info_span!("analyze-tenant", tenant = %tenant_id)) .await; diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 60c4a2393609..8354432c0c06 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -4556,6 +4556,7 @@ class StorageScrubber: def __init__(self, env: NeonEnv, log_dir: Path): self.env = env self.log_dir = log_dir + self.allowed_errors: list[str] = [] def scrubber_cli( self, args: list[str], timeout, extra_env: dict[str, str] | None = None @@ -4633,19 +4634,70 @@ def scan_metadata( if timeline_lsns is not None: args.append("--timeline-lsns") args.append(json.dumps(timeline_lsns)) + if node_kind == NodeKind.PAGESERVER: + args.append("--verbose") stdout = self.scrubber_cli(args, timeout=30, extra_env=extra_env) try: summary = json.loads(stdout) - # summary 
does not contain "with_warnings" if node_kind is the safekeeper - no_warnings = "with_warnings" not in summary or not summary["with_warnings"] - healthy = not summary["with_errors"] and no_warnings + healthy = self._check_run_healthy(summary) return healthy, summary except: log.error("Failed to decode JSON output from `scan-metadata`. Dumping stdout:") log.error(stdout) raise + def _check_line_allowed(self, line: str) -> bool: + for a in self.allowed_errors: + try: + if re.match(a, line): + return True + except re.error: + log.error(f"Invalid regex: '{a}'") + raise + return False + + def _check_line_list_allowed(self, lines: list[str]) -> bool: + for line in lines: + if not self._check_line_allowed(line): + return False + return True + + def _check_run_healthy(self, summary: dict[str, Any]) -> bool: + # summary does not contain "with_warnings" if node_kind is the safekeeper + healthy = True + with_warnings = summary.get("with_warnings", None) + if with_warnings is not None: + if isinstance(with_warnings, list): + if len(with_warnings) > 0: + # safekeeper scan_metadata output is a list of tenants + healthy = False + else: + for _, warnings in with_warnings.items(): + assert ( + len(warnings) > 0 + ), "with_warnings value should not be empty, running without verbose mode?" + if not self._check_line_list_allowed(warnings): + healthy = False + break + if not healthy: + return healthy + with_errors = summary.get("with_errors", None) + if with_errors is not None: + if isinstance(with_errors, list): + if len(with_errors) > 0: + # safekeeper scan_metadata output is a list of tenants + healthy = False + else: + for _, errors in with_errors.items(): + assert ( + len(errors) > 0 + ), "with_errors value should not be empty, running without verbose mode?" 
+ if not self._check_line_list_allowed(errors): + healthy = False + break + return healthy + def tenant_snapshot(self, tenant_id: TenantId, output_path: Path): stdout = self.scrubber_cli( ["tenant-snapshot", "--tenant-id", str(tenant_id), "--output-path", str(output_path)], diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index b16dc54c248b..198e4f046074 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -572,4 +572,10 @@ def test_scrubber_scan_pageserver_metadata( unhealthy = env.storage_controller.metadata_health_list_unhealthy()["unhealthy_tenant_shards"] assert len(unhealthy) == 1 and unhealthy[0] == str(tenant_shard_id) - neon_env_builder.disable_scrub_on_exit() + healthy, _ = env.storage_scrubber.scan_metadata() + assert not healthy + env.storage_scrubber.allowed_errors.append(".*not present in remote storage.*") + healthy, _ = env.storage_scrubber.scan_metadata() + assert healthy + + neon_env_builder.disable_scrub_on_exit() # We already ran scrubber, no need to do an extra run