Skip to content

Commit

Permalink
Proxy: remove fail fast logic to connect to compute (#6759)
Browse files Browse the repository at this point in the history
## Problem

Flaky tests

## Summary of changes

Remove failfast logic
  • Loading branch information
khanova authored Feb 14, 2024
1 parent a2d0d44 commit c7538a2
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 55 deletions.
37 changes: 18 additions & 19 deletions proxy/src/proxy/connect_compute.rs
Original file line number Diff line number Diff line change
Expand Up @@ -122,25 +122,24 @@ where

error!(error = ?err, "could not connect to compute node");

let node_info =
if err.get_error_kind() == crate::error::ErrorKind::Postgres || !node_info.cached() {
// If the error is Postgres, that means that we managed to connect to the compute node, but there was an error.
// Do not need to retrieve a new node_info, just return the old one.
if !err.should_retry(num_retries) {
return Err(err.into());
}
node_info
} else {
// if we failed to connect, it's likely that the compute node was suspended, wake a new compute node
info!("compute node's state has likely changed; requesting a wake-up");
ctx.latency_timer.cache_miss();
let old_node_info = invalidate_cache(node_info);
let mut node_info = wake_compute(&mut num_retries, ctx, user_info).await?;
node_info.reuse_settings(old_node_info);

mechanism.update_connect_config(&mut node_info.config);
node_info
};
let node_info = if !node_info.cached() {
// If we just recieved this from cplane and dodn't get it from cache, we shouldn't retry.
// Do not need to retrieve a new node_info, just return the old one.
if !err.should_retry(num_retries) {
return Err(err.into());
}
node_info
} else {
// if we failed to connect, it's likely that the compute node was suspended, wake a new compute node
info!("compute node's state has likely changed; requesting a wake-up");
ctx.latency_timer.cache_miss();
let old_node_info = invalidate_cache(node_info);
let mut node_info = wake_compute(&mut num_retries, ctx, user_info).await?;
node_info.reuse_settings(old_node_info);

mechanism.update_connect_config(&mut node_info.config);
node_info
};

// now that we have a new node, try connect to it repeatedly.
// this can error for a few reasons, for instance:
Expand Down
36 changes: 0 additions & 36 deletions proxy/src/proxy/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -375,8 +375,6 @@ enum ConnectAction {
Connect,
Retry,
Fail,
RetryPg,
FailPg,
}

#[derive(Clone)]
Expand Down Expand Up @@ -466,14 +464,6 @@ impl ConnectMechanism for TestConnectMechanism {
retryable: false,
kind: ErrorKind::Compute,
}),
ConnectAction::FailPg => Err(TestConnectError {
retryable: false,
kind: ErrorKind::Postgres,
}),
ConnectAction::RetryPg => Err(TestConnectError {
retryable: true,
kind: ErrorKind::Postgres,
}),
x => panic!("expecting action {:?}, connect is called instead", x),
}
}
Expand Down Expand Up @@ -572,32 +562,6 @@ async fn connect_to_compute_retry() {
mechanism.verify();
}

#[tokio::test]
async fn connect_to_compute_retry_pg() {
let _ = env_logger::try_init();
use ConnectAction::*;
let mut ctx = RequestMonitoring::test();
let mechanism = TestConnectMechanism::new(vec![Wake, RetryPg, Connect]);
let user_info = helper_create_connect_info(&mechanism);
connect_to_compute(&mut ctx, &mechanism, &user_info, false)
.await
.unwrap();
mechanism.verify();
}

#[tokio::test]
async fn connect_to_compute_fail_pg() {
let _ = env_logger::try_init();
use ConnectAction::*;
let mut ctx = RequestMonitoring::test();
let mechanism = TestConnectMechanism::new(vec![Wake, FailPg]);
let user_info = helper_create_connect_info(&mechanism);
connect_to_compute(&mut ctx, &mechanism, &user_info, false)
.await
.unwrap_err();
mechanism.verify();
}

/// Test that we don't retry if the error is not retryable.
#[tokio::test]
async fn connect_to_compute_non_retry_1() {
Expand Down

1 comment on commit c7538a2

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

2511 tests run: 2382 passed, 2 failed, 127 skipped (full report)


Failures on Postgres 14

  • test_pageserver_max_throughput_getpage_at_latest_lsn[1-6-30]: release
  • test_pageserver_max_throughput_getpage_at_latest_lsn[1-13-30]: release
# Run all failed tests locally:
scripts/pytest -vv -n $(nproc) -k "test_pageserver_max_throughput_getpage_at_latest_lsn[1-6-30] or test_pageserver_max_throughput_getpage_at_latest_lsn[1-13-30]"
Flaky tests (5)

Postgres 16

  • test_vm_bit_clear_on_heap_lock: debug

Postgres 14

  • test_create_snapshot: release, debug
  • test_partial_evict_tenant[relative_spare]: release
  • test_timeline_size_quota_on_startup: release

Code coverage (full report)

  • functions: 56.0% (12854 of 22970 functions)
  • lines: 82.6% (69457 of 84138 lines)

The comment gets automatically updated with the latest test results
c7538a2 at 2024-02-14T19:39:03.606Z :recycle:

Please sign in to comment.