Skip to content

Commit

Permalink
pageserver: fix read path max lsn bug (#7007)
Browse files Browse the repository at this point in the history
## Summary of changes
The problem it fixes is when `request_lsn` is `u64::MAX-1` the
`cont_lsn` becomes `u64::MAX` which is the same as `prev_lsn` which
stops the loop.

Closes #6812
  • Loading branch information
jbajic authored Mar 12, 2024
1 parent 7ae8364 commit bac06ea
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 17 deletions.
24 changes: 20 additions & 4 deletions pageserver/src/tenant.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4625,10 +4625,7 @@ mod tests {
drop(guard);

// Pick a big LSN such that we query over all the changes.
// Technically, u64::MAX - 1 is the largest LSN supported by the read path,
// but there seems to be a bug on the non-vectored search path which surfaces
// in that case.
let reads_lsn = Lsn(u64::MAX - 1000);
let reads_lsn = Lsn(u64::MAX - 1);

for read in reads {
info!("Doing vectored read on {:?}", read);
Expand Down Expand Up @@ -5145,4 +5142,23 @@ mod tests {

Ok(())
}

#[tokio::test]
async fn test_read_at_max_lsn() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_read_at_max_lsn")?;
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
.await?;

let lsn = Lsn(0x10);
bulk_insert_compact_gc(tline.clone(), &ctx, lsn, 50, 10000).await?;

let test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
let read_lsn = Lsn(u64::MAX - 1);

assert!(tline.get(test_key, read_lsn, &ctx).await.is_ok());

Ok(())
}
}
28 changes: 15 additions & 13 deletions pageserver/src/tenant/timeline.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2478,7 +2478,7 @@ impl Timeline {
// 'prev_lsn' tracks the last LSN that we were at in our search. It's used
// to check that each iteration make some progress, to break infinite
// looping if something goes wrong.
let mut prev_lsn = Lsn(u64::MAX);
let mut prev_lsn = None;

let mut result = ValueReconstructResult::Continue;
let mut cont_lsn = Lsn(request_lsn.0 + 1);
Expand All @@ -2498,18 +2498,20 @@ impl Timeline {
MATERIALIZED_PAGE_CACHE_HIT.inc_by(1);
return Ok(traversal_path);
}
if prev_lsn <= cont_lsn {
// Didn't make any progress in last iteration. Error out to avoid
// getting stuck in the loop.
return Err(layer_traversal_error(format!(
"could not find layer with more data for key {} at LSN {}, request LSN {}, ancestor {}",
key,
Lsn(cont_lsn.0 - 1),
request_lsn,
timeline.ancestor_lsn
), traversal_path));
if let Some(prev) = prev_lsn {
if prev <= cont_lsn {
// Didn't make any progress in last iteration. Error out to avoid
// getting stuck in the loop.
return Err(layer_traversal_error(format!(
"could not find layer with more data for key {} at LSN {}, request LSN {}, ancestor {}",
key,
Lsn(cont_lsn.0 - 1),
request_lsn,
timeline.ancestor_lsn
), traversal_path));
}
}
prev_lsn = cont_lsn;
prev_lsn = Some(cont_lsn);
}
ValueReconstructResult::Missing => {
return Err(layer_traversal_error(
Expand Down Expand Up @@ -2539,7 +2541,7 @@ impl Timeline {

timeline_owned = timeline.get_ready_ancestor_timeline(ctx).await?;
timeline = &*timeline_owned;
prev_lsn = Lsn(u64::MAX);
prev_lsn = None;
continue 'outer;
}

Expand Down

1 comment on commit bac06ea

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

2588 tests run: 2449 passed, 2 failed, 137 skipped (full report)


Failures on Postgres 16

  • test_deletion_queue_recovery[no-validate-lose]: debug

Failures on Postgres 14

  • test_bulk_insert[neon-github-actions-selfhosted]: release
# Run all failed tests locally:
scripts/pytest -vv -n $(nproc) -k "test_bulk_insert[neon-release-pg14-github-actions-selfhosted] or test_deletion_queue_recovery[debug-pg16-no-validate-lose]"
Flaky tests (2)

Postgres 16

  • test_vm_bit_clear_on_heap_lock: debug

Postgres 14

  • test_timeline_size_quota_on_startup: release

Test coverage report is not available

The comment gets automatically updated with the latest test results
bac06ea at 2024-03-12T17:17:40.471Z :recycle:

Please sign in to comment.