scrubber: clean up scan_metadata before prod (#8565)
Part of #8128.

## Problem
Currently, the scrubber `scan_metadata` command returns an error code if the
metadata on remote storage is corrupted with fatal errors. To safely deploy
this command in a cron job, we want to differentiate between failures of the
scrubber command itself and erroneous metadata. At the same time, we also want
our regression tests to catch corrupted metadata using the scrubber command.

## Summary of changes

- Return an error code only when the scrubber command itself fails (see the
sketch after this list).
- Use explicit checks on errors and warnings to determine metadata health in
regression tests.
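
For the first point, here is a minimal, self-contained sketch of the intended exit-code policy; the `ScanSummary` and `report` names are illustrative stand-ins, and the real change is in `storage_scrubber/src/main.rs` (see its diff below).

```rust
// Self-contained sketch: fatal metadata findings are logged, not returned as
// an error, so a cron wrapper sees a non-zero exit code only when the
// scrubber command itself fails.
struct ScanSummary {
    with_errors: Vec<String>,
    timeline_count: usize,
}

impl ScanSummary {
    fn is_fatal(&self) -> bool {
        !self.with_errors.is_empty()
    }

    fn is_empty(&self) -> bool {
        self.timeline_count == 0
    }
}

fn report(summary: &ScanSummary) -> Result<(), String> {
    if summary.is_fatal() {
        eprintln!("Fatal scrub errors detected");
    } else if summary.is_empty() {
        eprintln!("No timelines found; check the bucket/prefix configuration");
    }
    // The scan itself ran to completion, so it succeeds regardless of findings.
    Ok(())
}

fn main() {
    let summary = ScanSummary {
        with_errors: vec![],
        timeline_count: 3,
    };
    // Exit non-zero only if the scrubber command itself failed.
    std::process::exit(if report(&summary).is_err() { 1 } else { 0 });
}
```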

**Resolving a conflict with the `tenant-snapshot` command (after a shard split):**
[`test_scrubber_tenant_snapshot`](https://github.com/neondatabase/neon/blob/yuchen/scrubber-scan-cleanup-before-prod/test_runner/regress/test_storage_scrubber.py#L23)
failed before applying 422a844:
- When taking a snapshot, the old `index_part.json` in the unsharded
tenant directory is not kept.
- The current `list_timeline_blobs` implementation considers a missing
`index_part.json` to be a parse error.
- During the scan, we only analyze shards with the highest shard count, so we
will not get a parse error, but we still need to add the layers to the tenant
object listing; otherwise we get an "index references a layer that is not in
remote storage" error.
- **Action:** Add `s3_layers` from `list_timeline_blobs` regardless of parse
errors (see the sketch after this list).
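
To make the effect of that action concrete, a minimal, self-contained sketch follows; the `collect_layers` helper and the `LayerName`/`Generation` aliases are illustrative stand-ins, and the actual handling is in the `scan_pageserver_metadata.rs` and `checks.rs` diffs below.

```rust
// Self-contained sketch with stand-in types; the real handling lives in
// storage_scrubber (scan_pageserver_metadata.rs / checks.rs).
use std::collections::HashSet;

type LayerName = String; // stand-in for the pageserver's LayerName
type Generation = u32; // stand-in for the pageserver's Generation

enum BlobDataParseResult {
    Parsed {
        s3_layers: HashSet<(LayerName, Generation)>,
    },
    /// The remains of a deleted timeline.
    Relic,
    Incorrect {
        errors: Vec<String>,
        s3_layers: HashSet<(LayerName, Generation)>,
    },
}

fn collect_layers(
    blob_data: &BlobDataParseResult,
    tenant_layers: &mut HashSet<(LayerName, Generation)>,
) {
    match blob_data {
        BlobDataParseResult::Parsed { s3_layers } => {
            tenant_layers.extend(s3_layers.iter().cloned());
        }
        BlobDataParseResult::Relic => {}
        // Even when index_part.json fails to parse, the listed layer objects
        // still count toward the tenant's object listing, so cross-shard
        // references can be resolved after a shard split.
        BlobDataParseResult::Incorrect { errors, s3_layers } => {
            for error in errors {
                eprintln!("parse error: {error}");
            }
            tenant_layers.extend(s3_layers.iter().cloned());
        }
    }
}

fn main() {
    let mut tenant_layers = HashSet::new();
    let shards = vec![
        BlobDataParseResult::Parsed {
            s3_layers: HashSet::from([("shard-0-layer".to_string(), 1)]),
        },
        BlobDataParseResult::Relic,
        BlobDataParseResult::Incorrect {
            errors: vec!["no index_part.json found".to_string()],
            s3_layers: HashSet::from([("orphan-layer".to_string(), 1)]),
        },
    ];
    for shard in &shards {
        collect_layers(shard, &mut tenant_layers);
    }
    assert_eq!(tenant_layers.len(), 2);
}
```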

Signed-off-by: Yuchen Liang <[email protected]>
yliang412 authored Aug 6, 2024
1 parent ca5390a commit ed5724d
Showing 12 changed files with 70 additions and 41 deletions.
14 changes: 10 additions & 4 deletions storage_scrubber/src/checks.rs
@@ -172,8 +172,11 @@ pub(crate) async fn branch_cleanup_and_check_errors(
}
}
BlobDataParseResult::Relic => {}
BlobDataParseResult::Incorrect(parse_errors) => result.errors.extend(
parse_errors
BlobDataParseResult::Incorrect {
errors,
s3_layers: _,
} => result.errors.extend(
errors
.into_iter()
.map(|error| format!("parse error: {error}")),
),
@@ -300,7 +303,10 @@ pub(crate) enum BlobDataParseResult {
},
/// The remains of a deleted Timeline (i.e. an initdb archive only)
Relic,
Incorrect(Vec<String>),
Incorrect {
errors: Vec<String>,
s3_layers: HashSet<(LayerName, Generation)>,
},
}

pub(crate) fn parse_layer_object_name(name: &str) -> Result<(LayerName, Generation), String> {
@@ -443,7 +449,7 @@ pub(crate) async fn list_timeline_blobs(
}

Ok(S3TimelineBlobData {
blob_data: BlobDataParseResult::Incorrect(errors),
blob_data: BlobDataParseResult::Incorrect { errors, s3_layers },
unused_index_keys: index_part_keys,
unknown_keys,
})
10 changes: 5 additions & 5 deletions storage_scrubber/src/main.rs
@@ -208,21 +208,21 @@ async fn main() -> anyhow::Result<()> {
}

if summary.is_fatal() {
Err(anyhow::anyhow!("Fatal scrub errors detected"))
tracing::error!("Fatal scrub errors detected");
} else if summary.is_empty() {
// Strictly speaking an empty bucket is a valid bucket, but if someone ran the
// scrubber they were likely expecting to scan something, and if we see no timelines
// at all then it's likely due to some configuration issues like a bad prefix
Err(anyhow::anyhow!(
tracing::error!(
"No timelines found in bucket {} prefix {}",
bucket_config.bucket,
bucket_config
.prefix_in_bucket
.unwrap_or("<none>".to_string())
))
} else {
Ok(())
);
}

Ok(())
}
}
}
14 changes: 10 additions & 4 deletions storage_scrubber/src/pageserver_physical_gc.rs
@@ -389,10 +389,13 @@ async fn gc_ancestor(
// Post-deletion tenant location: don't try and GC it.
continue;
}
BlobDataParseResult::Incorrect(reasons) => {
BlobDataParseResult::Incorrect {
errors,
s3_layers: _, // TODO(yuchen): could still check references to these s3 layers?
} => {
// Our primary purpose isn't to report on bad data, but log this rather than skipping silently
tracing::warn!(
"Skipping ancestor GC for timeline {ttid}, bad metadata: {reasons:?}"
"Skipping ancestor GC for timeline {ttid}, bad metadata: {errors:?}"
);
continue;
}
@@ -518,9 +521,12 @@ pub async fn pageserver_physical_gc(
// Post-deletion tenant location: don't try and GC it.
return Ok(summary);
}
BlobDataParseResult::Incorrect(reasons) => {
BlobDataParseResult::Incorrect {
errors,
s3_layers: _,
} => {
// Our primary purpose isn't to report on bad data, but log this rather than skipping silently
tracing::warn!("Skipping timeline {ttid}, bad metadata: {reasons:?}");
tracing::warn!("Skipping timeline {ttid}, bad metadata: {errors:?}");
return Ok(summary);
}
};
22 changes: 15 additions & 7 deletions storage_scrubber/src/scan_pageserver_metadata.rs
@@ -290,13 +290,21 @@ pub async fn scan_metadata(
}
}

if let BlobDataParseResult::Parsed {
index_part: _index_part,
index_part_generation: _index_part_generation,
s3_layers,
} = &data.blob_data
{
tenant_objects.push(ttid, s3_layers.clone());
match &data.blob_data {
BlobDataParseResult::Parsed {
index_part: _index_part,
index_part_generation: _index_part_generation,
s3_layers,
} => {
tenant_objects.push(ttid, s3_layers.clone());
}
BlobDataParseResult::Relic => (),
BlobDataParseResult::Incorrect {
errors: _,
s3_layers,
} => {
tenant_objects.push(ttid, s3_layers.clone());
}
}
tenant_timeline_results.push((ttid, data));
}
2 changes: 1 addition & 1 deletion storage_scrubber/src/tenant_snapshot.rs
@@ -269,7 +269,7 @@ impl SnapshotDownloader {
.context("Downloading timeline")?;
}
BlobDataParseResult::Relic => {}
BlobDataParseResult::Incorrect(_) => {
BlobDataParseResult::Incorrect { .. } => {
tracing::error!("Bad metadata in timeline {ttid}");
}
};
14 changes: 11 additions & 3 deletions test_runner/fixtures/neon_fixtures.py
@@ -978,7 +978,10 @@ def __exit__(
and self.enable_scrub_on_exit
):
try:
self.env.storage_scrubber.scan_metadata()
healthy, _ = self.env.storage_scrubber.scan_metadata()
if not healthy:
e = Exception("Remote storage metadata corrupted")
cleanup_error = e
except Exception as e:
log.error(f"Error during remote storage scrub: {e}")
cleanup_error = e
@@ -4411,14 +4414,19 @@ def scrubber_cli(self, args: list[str], timeout) -> str:
assert stdout is not None
return stdout

def scan_metadata(self, post_to_storage_controller: bool = False) -> Any:
def scan_metadata(self, post_to_storage_controller: bool = False) -> Tuple[bool, Any]:
"""
Returns the health status and the metadata summary.
"""
args = ["scan-metadata", "--node-kind", "pageserver", "--json"]
if post_to_storage_controller:
args.append("--post")
stdout = self.scrubber_cli(args, timeout=30)

try:
return json.loads(stdout)
summary = json.loads(stdout)
healthy = not summary["with_errors"] and not summary["with_warnings"]
return healthy, summary
except:
log.error("Failed to decode JSON output from `scan-metadata`. Dumping stdout:")
log.error(stdout)
5 changes: 2 additions & 3 deletions test_runner/regress/test_compatibility.py
@@ -496,11 +496,10 @@ def test_historic_storage_formats(
# Check the scrubber handles this old data correctly (can read it and doesn't consider it corrupt)
#
# Do this _before_ importing to the pageserver, as that import may start writing immediately
metadata_summary = env.storage_scrubber.scan_metadata()
healthy, metadata_summary = env.storage_scrubber.scan_metadata()
assert healthy
assert metadata_summary["tenant_count"] >= 1
assert metadata_summary["timeline_count"] >= 1
assert not metadata_summary["with_errors"]
assert not metadata_summary["with_warnings"]

env.neon_cli.import_tenant(dataset.tenant_id)

5 changes: 2 additions & 3 deletions test_runner/regress/test_pageserver_generations.py
@@ -214,12 +214,11 @@ def parse_generation_suffix(key):

# Having written a mixture of generation-aware and legacy index_part.json,
# ensure the scrubber handles the situation as expected.
metadata_summary = env.storage_scrubber.scan_metadata()
healthy, metadata_summary = env.storage_scrubber.scan_metadata()
assert metadata_summary["tenant_count"] == 1 # Scrubber should have seen our timeline
assert metadata_summary["timeline_count"] == 1
assert metadata_summary["timeline_shard_count"] == 1
assert not metadata_summary["with_errors"]
assert not metadata_summary["with_warnings"]
assert healthy


def test_deferred_deletion(neon_env_builder: NeonEnvBuilder):
3 changes: 2 additions & 1 deletion test_runner/regress/test_pageserver_secondary.py
@@ -593,7 +593,8 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
# Scrub the remote storage
# ========================
# This confirms that the scrubber isn't upset by the presence of the heatmap
env.storage_scrubber.scan_metadata()
healthy, _ = env.storage_scrubber.scan_metadata()
assert healthy

# Detach secondary and delete tenant
# ===================================
3 changes: 2 additions & 1 deletion test_runner/regress/test_sharding.py
@@ -124,7 +124,8 @@ def get_sizes():

# Check the scrubber isn't confused by sharded content, then disable
# it during teardown because we'll have deleted by then
env.storage_scrubber.scan_metadata()
healthy, _ = env.storage_scrubber.scan_metadata()
assert healthy

env.storage_controller.pageserver_api().tenant_delete(tenant_id)
assert_prefix_empty(
11 changes: 6 additions & 5 deletions test_runner/regress/test_storage_scrubber.py
@@ -516,9 +516,8 @@ def test_scrubber_scan_pageserver_metadata(
assert len(index.layer_metadata) > 0
it = iter(index.layer_metadata.items())

scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True)
assert not scan_summary["with_warnings"]
assert not scan_summary["with_errors"]
healthy, scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True)
assert healthy

assert env.storage_controller.metadata_health_is_healthy()

@@ -532,16 +531,18 @@ def test_scrubber_scan_pageserver_metadata(
log.info(f"delete response: {delete_response}")

# Check scan summary without posting to storage controller. Expect it to be a L0 layer so only emit warnings.
scan_summary = env.storage_scrubber.scan_metadata()
_, scan_summary = env.storage_scrubber.scan_metadata()
log.info(f"{pprint.pformat(scan_summary)}")
assert len(scan_summary["with_warnings"]) > 0

assert env.storage_controller.metadata_health_is_healthy()

# Now post to storage controller, expect seeing one unhealthy health record
scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True)
_, scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True)
log.info(f"{pprint.pformat(scan_summary)}")
assert len(scan_summary["with_warnings"]) > 0

unhealthy = env.storage_controller.metadata_health_list_unhealthy()["unhealthy_tenant_shards"]
assert len(unhealthy) == 1 and unhealthy[0] == str(tenant_shard_id)

neon_env_builder.disable_scrub_on_exit()
8 changes: 4 additions & 4 deletions test_runner/regress/test_tenant_delete.py
@@ -341,13 +341,13 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder)
wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn)
env.stop()

result = env.storage_scrubber.scan_metadata()
assert result["with_warnings"] == []
healthy, _ = env.storage_scrubber.scan_metadata()
assert healthy

env.start()
ps_http = env.pageserver.http_client()
ps_http.tenant_delete(tenant_id)
env.stop()

env.storage_scrubber.scan_metadata()
assert result["with_warnings"] == []
healthy, _ = env.storage_scrubber.scan_metadata()
assert healthy

1 comment on commit ed5724d

@github-actions

2200 tests run: 2119 passed, 0 failed, 81 skipped (full report)


Code coverage* (full report)

  • functions: 32.8% (7153 of 21803 functions)
  • lines: 50.5% (57744 of 114305 lines)

* collected from Rust tests only


The comment gets automatically updated with the latest test results
ed5724d at 2024-08-06T19:48:55.775Z :recycle:
