diff --git a/pageserver/src/deletion_queue/validator.rs b/pageserver/src/deletion_queue/validator.rs
index bf06c78e673f..d215fd2b7d2d 100644
--- a/pageserver/src/deletion_queue/validator.rs
+++ b/pageserver/src/deletion_queue/validator.rs
@@ -190,7 +190,7 @@ where
             }
         } else {
             // If we failed validation, then do not apply any of the projected updates
-            warn!("Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", tenant_lsn_state.generation);
+            info!("Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", tenant_lsn_state.generation);
             metrics::DELETION_QUEUE.dropped_lsn_updates.inc();
         }
     }
@@ -225,7 +225,7 @@ where
             && (tenant.generation == *validated_generation);
 
         if !this_list_valid {
-            warn!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation);
+            info!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation);
             metrics::DELETION_QUEUE.keys_dropped.inc_by(tenant.len() as u64);
             mutated = true;
         } else {
diff --git a/test_runner/fixtures/pageserver/many_tenants.py b/test_runner/fixtures/pageserver/many_tenants.py
index 8730d8ef751d..c437258c6f87 100644
--- a/test_runner/fixtures/pageserver/many_tenants.py
+++ b/test_runner/fixtures/pageserver/many_tenants.py
@@ -42,10 +42,6 @@ def single_timeline(
 
     log.info("detach template tenant form pageserver")
     env.pageserver.tenant_detach(template_tenant)
-    env.pageserver.allowed_errors.append(
-        # tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely
-        ".*Dropped remote consistent LSN updates.*",
-    )
 
     log.info(f"duplicating template tenant {ncopies} times in S3")
     tenants = fixtures.pageserver.remote_storage.duplicate_tenant(env, template_tenant, ncopies)
diff --git a/test_runner/performance/pageserver/interactive/test_many_small_tenants.py b/test_runner/performance/pageserver/interactive/test_many_small_tenants.py
index 0ff9c8fdaa98..33848b06d35c 100644
--- a/test_runner/performance/pageserver/interactive/test_many_small_tenants.py
+++ b/test_runner/performance/pageserver/interactive/test_many_small_tenants.py
@@ -55,10 +55,6 @@ def setup_template(env: NeonEnv):
     }
     template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True)
     env.pageserver.tenant_detach(template_tenant)
-    env.pageserver.allowed_errors.append(
-        # tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely
-        ".*Dropped remote consistent LSN updates.*",
-    )
     env.pageserver.tenant_attach(template_tenant, config)
     ep = env.endpoints.create_start("main", tenant_id=template_tenant)
     ep.safe_psql("create table foo(b text)")
diff --git a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
index b66db4d0ab72..b41ae601975f 100644
--- a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
+++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
@@ -86,10 +86,6 @@ def setup_tenant_template(env: NeonEnv, n_txns: int):
 
     template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True)
     env.pageserver.tenant_detach(template_tenant)
-    env.pageserver.allowed_errors.append(
-        # tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely
-        ".*Dropped remote consistent LSN updates.*",
-    )
     env.pageserver.tenant_attach(template_tenant, config)
 
     ps_http = env.pageserver.http_client()
diff --git a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
index a8f48fe675c6..60861cf939b8 100644
--- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
+++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
@@ -164,10 +164,6 @@ def setup_tenant_template(env: NeonEnv, pg_bin: PgBin, scale: int):
     }
     template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True)
     env.pageserver.tenant_detach(template_tenant)
-    env.pageserver.allowed_errors.append(
-        # tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely
-        ".*Dropped remote consistent LSN updates.*",
-    )
     env.pageserver.tenant_attach(template_tenant, config)
     ps_http = env.pageserver.http_client()
     with env.endpoints.create_start("main", tenant_id=template_tenant) as ep:
diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py
index d65a66b01081..3a6113706fa9 100644
--- a/test_runner/performance/test_storage_controller_scale.py
+++ b/test_runner/performance/test_storage_controller_scale.py
@@ -132,14 +132,6 @@ def test_storage_controller_many_tenants(
     )
 
     for ps in env.pageservers:
-        # This can happen because when we do a loop over all pageservers and mark them offline/active,
-        # reconcilers might get cancelled, and the next reconcile can follow a not-so-elegant path of
-        # bumping generation before other attachments are detached.
-        #
-        # We could clean this up by making reconcilers respect the .observed of their predecessor, if
-        # we spawn with a wait for the predecessor.
-        ps.allowed_errors.append(".*Dropped remote consistent LSN updates.*")
-
         # Storage controller is allowed to drop pageserver requests when the cancellation token
         # for a Reconciler fires.
         ps.allowed_errors.append(".*request was dropped before completing.*")
diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py
index e117c2140f5e..f2ee2b70aac6 100644
--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -21,8 +21,6 @@ def positive_env(neon_env_builder: NeonEnvBuilder) -> NeonEnv:
         [
             # eviction might be the first one after an attach to access the layers
             ".*unexpectedly on-demand downloading remote layer .* for task kind Eviction",
-            # detach can happen before we get to validate the generation number
-            ".*deletion backend: Dropped remote consistent LSN updates for tenant.*",
         ]
     )
     assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
@@ -58,10 +56,6 @@ def negative_env(neon_env_builder: NeonEnvBuilder) -> Generator[NegativeTests, N
     env.pageserver.allowed_errors.extend(
         [
-            # This fixture detaches the tenant, and tests using it will tend to re-attach it
-            # shortly after. There may be un-processed deletion_queue validations from the
-            # initial attachment
-            ".*Dropped remote consistent LSN updates.*",
            # This fixture is for tests that will intentionally generate 400 responses
            ".*Error processing HTTP request: Bad request",
        ]
    )
diff --git a/test_runner/regress/test_change_pageserver.py b/test_runner/regress/test_change_pageserver.py
index 97ab69049d00..4d2cdb8e320a 100644
--- a/test_runner/regress/test_change_pageserver.py
+++ b/test_runner/regress/test_change_pageserver.py
@@ -14,11 +14,6 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
     )
     env = neon_env_builder.init_start()
 
-    for pageserver in env.pageservers:
-        # This test dual-attaches a tenant, one of the pageservers will therefore
-        # be running with a stale generation.
-        pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*")
-
     env.neon_cli.create_branch("test_change_pageserver")
 
     endpoint = env.endpoints.create_start("test_change_pageserver")
diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py
index 54d3b2d515c5..3b2218dd9b09 100644
--- a/test_runner/regress/test_layers_from_future.py
+++ b/test_runner/regress/test_layers_from_future.py
@@ -39,9 +39,6 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
 
     env = neon_env_builder.init_configs()
     env.start()
-    env.pageserver.allowed_errors.extend(
-        [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"]
-    )
 
     ps_http = env.pageserver.http_client()
 
diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py
index 696af24e5c0a..7ce38c5c3c82 100644
--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -249,10 +249,6 @@ def test_deferred_deletion(neon_env_builder: NeonEnvBuilder):
         assert timeline["remote_consistent_lsn"] == timeline["remote_consistent_lsn_visible"]
         assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0
 
-    main_pageserver.allowed_errors.extend(
-        [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"]
-    )
-
     # Now advance the generation in the control plane: subsequent validations
     # from the running pageserver will fail. No more deletions should happen.
     env.storage_controller.attach_hook_issue(env.initial_tenant, other_pageserver.id)
@@ -397,8 +393,6 @@ def assert_deletions_submitted(n: int) -> None:
         # validated before restart.
         assert get_deletion_queue_executed(ps_http) == before_restart_depth
     else:
-        main_pageserver.allowed_errors.extend([".*Dropping stale deletions.*"])
-
         # If we lost the attachment, we should have dropped our pre-restart deletions.
         assert get_deletion_queue_dropped(ps_http) == before_restart_depth
 
@@ -553,13 +547,6 @@ def test_multi_attach(
     tenant_id = env.initial_tenant
     timeline_id = env.initial_timeline
 
-    # We will intentionally create situations where stale deletions happen from non-latest-generation
-    # nodes when the tenant is multiply-attached
-    for ps in env.pageservers:
-        ps.allowed_errors.extend(
-            [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"]
-        )
-
     # Initially, the tenant will be attached to the first pageserver (first is default in our test harness)
     wait_until(10, 0.2, lambda: assert_tenant_state(http_clients[0], tenant_id, "Active"))
     _detail = http_clients[0].timeline_detail(tenant_id, timeline_id)
diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py
index 8431840dc069..4c828b86b053 100644
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -83,9 +83,6 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int):
     for ps in env.pageservers:
         ps.allowed_errors.extend(
             [
-                # We will make no effort to avoid stale attachments
-                ".*Dropped remote consistent LSN updates.*",
-                ".*Dropping stale deletions.*",
                 # page_service_conn_main{peer_addr=[::1]:41176}: query handler for 'pagestream 3b19aec5038c796f64b430b30a555121 d07776761d44050b8aab511df1657d83' failed: Tenant 3b19aec5038c796f64b430b30a555121 not found
                 ".*query handler.*Tenant.*not found.*",
                 # page_service_conn_main{peer_addr=[::1]:45552}: query handler for 'pagestream 414ede7ad50f775a8e7d9ba0e43b9efc a43884be16f44b3626482b6981b2c745' failed: Tenant 414ede7ad50f775a8e7d9ba0e43b9efc is not active
diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py
index b26bd3422f30..fac7fe9deef6 100644
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -355,13 +355,6 @@ def churn_while_failpoints_active(result):
     env.pageserver.stop(immediate=True)
     env.endpoints.stop_all()
 
-    # We are about to forcibly drop local dirs. Storage controller will increment generation in re-attach before
-    # we later increment when actually attaching it again, leading to skipping a generation and potentially getting
-    # these warnings if there was a durable but un-executed deletion list at time of restart.
-    env.pageserver.allowed_errors.extend(
-        [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"]
-    )
-
     dir_to_clear = env.pageserver.tenant_dir()
     shutil.rmtree(dir_to_clear)
     os.mkdir(dir_to_clear)
diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index 8267d3f36c0b..d414f986e655 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -1144,10 +1144,6 @@ def test_sharding_split_failures(
     )
 
     for ps in env.pageservers:
-        # When we do node failures and abandon a shard, it will de-facto have old generation and
-        # thereby be unable to publish remote consistent LSN updates
-        ps.allowed_errors.append(".*Dropped remote consistent LSN updates.*")
-
         # If we're using a failure that will panic the storage controller, all background
         # upcalls from the pageserver can fail
         ps.allowed_errors.append(".*calling control plane generation validation API failed.*")
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index a78f566f0e4c..d37f7aae3dfd 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -60,11 +60,6 @@ def test_storage_controller_smoke(
     neon_env_builder.num_pageservers = 3
    env = neon_env_builder.init_configs()
 
-    for pageserver in env.pageservers:
-        # This test detaches tenants during migration, which can race with deletion queue operations,
-        # during detach we only do an advisory flush, we don't wait for it.
-        pageserver.allowed_errors.extend([".*Dropped remote consistent LSN updates.*"])
-
     # Start services by hand so that we can skip a pageserver (this will start + register later)
     env.broker.try_start()
     env.storage_controller.start()
@@ -484,9 +479,6 @@ def handler(request: Request):
     # Start running
     env = neon_env_builder.init_start()
 
-    # We will to an unclean migration, which will result in deletion queue warnings
-    env.pageservers[0].allowed_errors.append(".*Dropped remote consistent LSN updates for tenant.*")
-
     # Initial notification from tenant creation
     assert len(notifications) == 1
     expect: Dict[str, Union[List[Dict[str, int]], str, None, int]] = {
@@ -1054,13 +1046,6 @@ def tenants_placed():
     online_node_ids = set(range(1, len(env.pageservers) + 1)) - offline_node_ids
 
     for node_id in offline_node_ids:
-        env.get_pageserver(node_id).allowed_errors.append(
-            # In the case of the failpoint failure, the impacted pageserver
-            # still believes it has the tenant attached since location
-            # config calls into it will fail due to being marked offline.
-            ".*Dropped remote consistent LSN updates.*",
-        )
-
         if len(offline_node_ids) > 1:
             env.get_pageserver(node_id).allowed_errors.append(
                 ".*Scheduling error when marking pageserver.*offline.*",
diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py
index 2cbb036c0d7c..80fb2b55b8b2 100644
--- a/test_runner/regress/test_tenant_conf.py
+++ b/test_runner/regress/test_tenant_conf.py
@@ -320,10 +320,6 @@ def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder):
 
     assert not config_path.exists(), "detach did not remove config file"
 
-    # The re-attach's increment of the generation number may invalidate deletion queue
-    # updates in flight from the previous attachment.
-    env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*")
-
     env.pageserver.tenant_attach(tenant_id)
     wait_until(
         number_of_iterations=5,
diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py
index 2056840558e6..b165588636c7 100644
--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -76,10 +76,6 @@ def test_tenant_reattach(neon_env_builder: NeonEnvBuilder, mode: str):
 
     env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
 
-    # Our re-attach may race with the deletion queue processing LSN updates
-    # from the original attachment.
-    env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*")
-
     with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
         with endpoint.cursor() as cur:
             cur.execute("CREATE TABLE t(key int primary key, value text)")
@@ -349,10 +345,6 @@ def test_detach_while_attaching(
 
     env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
 
-    # Our re-attach may race with the deletion queue processing LSN updates
-    # from the original attachment.
-    env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*")
-
     # Create table, and insert some rows. Make it big enough that it doesn't fit in
     # shared_buffers, otherwise the SELECT after restart will just return answer
     # from shared_buffers without hitting the page server, which defeats the point
@@ -422,10 +414,6 @@ def test_detach_while_activating(
 
     env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
 
-    # Our re-attach may race with the deletion queue processing LSN updates
-    # from the original attachment.
-    env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*")
-
     data_id = 1
     data_secret = "very secret secret"
     insert_test_data(pageserver_http, tenant_id, timeline_id, data_id, data_secret, endpoint)
diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py
index 9fe732e28806..43e9a0d36e80 100644
--- a/test_runner/regress/test_tenant_relocation.py
+++ b/test_runner/regress/test_tenant_relocation.py
@@ -203,8 +203,6 @@ def test_tenant_relocation(
         [
             # Needed for detach polling on the original pageserver
            f".*NotFound: tenant {tenant_id}.*",
-            # We will dual-attach in this test, so stale generations are expected
-            ".*Dropped remote consistent LSN updates.*",
        ]
    )
 
diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py
index 3705406c2ff9..04b3fdd80fa5 100644
--- a/test_runner/regress/test_tenants.py
+++ b/test_runner/regress/test_tenants.py
@@ -386,10 +386,6 @@ def test_create_churn_during_restart(neon_env_builder: NeonEnvBuilder):
     # generation nubmers out of order.
     env.pageserver.allowed_errors.append(".*Generation .+ is less than existing .+")
 
-    # Our multiple creation requests will advance generation quickly, and when we skip
-    # a generation number we can generate these warnings
-    env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates for tenant .+")
-
     # Timeline::flush_and_shutdown cannot tell if it is hitting a failure because of
     # an incomplete attach, or some other problem. In the field this should be rare,
     # so we allow it to log at WARN, even if it is occasionally a false positive.