Skip to content

Commit

Permalink
pageserver: downgrade stale generation messages to INFO (#8256)
Browse files Browse the repository at this point in the history
## Problem

When generations were new, these messages were an important way of
noticing if something unexpected was going on. We found some real issues
when investigating tests that unexpectedly tripped them.

At time has gone on, this code is now pretty battle-tested, and as we do
more live migrations etc, it's fairly normal to see the occasional
message from a node with a stale generation.

At this point the cognitive load on developers to selectively allow-list
these logs outweighs the benefit of having them at warn severity.

Closes: #8080

## Summary of changes

- Downgrade "Dropped remote consistent LSN updates" and "Dropping stale
deletions" messages to INFO
- Remove all the allow-list entries for these logs.
  • Loading branch information
jcsp authored and VladLazar committed Jul 8, 2024
1 parent 0e48323 commit 7e2a3d2
Show file tree
Hide file tree
Showing 18 changed files with 2 additions and 104 deletions.
4 changes: 2 additions & 2 deletions pageserver/src/deletion_queue/validator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ where
}
} else {
// If we failed validation, then do not apply any of the projected updates
warn!("Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", tenant_lsn_state.generation);
info!("Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", tenant_lsn_state.generation);
metrics::DELETION_QUEUE.dropped_lsn_updates.inc();
}
}
Expand Down Expand Up @@ -225,7 +225,7 @@ where
&& (tenant.generation == *validated_generation);

if !this_list_valid {
warn!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation);
info!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation);
metrics::DELETION_QUEUE.keys_dropped.inc_by(tenant.len() as u64);
mutated = true;
} else {
Expand Down
4 changes: 0 additions & 4 deletions test_runner/fixtures/pageserver/many_tenants.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,6 @@ def single_timeline(

log.info("detach template tenant form pageserver")
env.pageserver.tenant_detach(template_tenant)
env.pageserver.allowed_errors.append(
# tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely
".*Dropped remote consistent LSN updates.*",
)

log.info(f"duplicating template tenant {ncopies} times in S3")
tenants = fixtures.pageserver.remote_storage.duplicate_tenant(env, template_tenant, ncopies)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,6 @@ def setup_template(env: NeonEnv):
}
template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True)
env.pageserver.tenant_detach(template_tenant)
env.pageserver.allowed_errors.append(
# tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely
".*Dropped remote consistent LSN updates.*",
)
env.pageserver.tenant_attach(template_tenant, config)
ep = env.endpoints.create_start("main", tenant_id=template_tenant)
ep.safe_psql("create table foo(b text)")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,10 +86,6 @@ def setup_tenant_template(env: NeonEnv, n_txns: int):

template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True)
env.pageserver.tenant_detach(template_tenant)
env.pageserver.allowed_errors.append(
# tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely
".*Dropped remote consistent LSN updates.*",
)
env.pageserver.tenant_attach(template_tenant, config)

ps_http = env.pageserver.http_client()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -164,10 +164,6 @@ def setup_tenant_template(env: NeonEnv, pg_bin: PgBin, scale: int):
}
template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True)
env.pageserver.tenant_detach(template_tenant)
env.pageserver.allowed_errors.append(
# tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely
".*Dropped remote consistent LSN updates.*",
)
env.pageserver.tenant_attach(template_tenant, config)
ps_http = env.pageserver.http_client()
with env.endpoints.create_start("main", tenant_id=template_tenant) as ep:
Expand Down
8 changes: 0 additions & 8 deletions test_runner/performance/test_storage_controller_scale.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,14 +132,6 @@ def test_storage_controller_many_tenants(
)

for ps in env.pageservers:
# This can happen because when we do a loop over all pageservers and mark them offline/active,
# reconcilers might get cancelled, and the next reconcile can follow a not-so-elegant path of
# bumping generation before other attachments are detached.
#
# We could clean this up by making reconcilers respect the .observed of their predecessor, if
# we spawn with a wait for the predecessor.
ps.allowed_errors.append(".*Dropped remote consistent LSN updates.*")

# Storage controller is allowed to drop pageserver requests when the cancellation token
# for a Reconciler fires.
ps.allowed_errors.append(".*request was dropped before completing.*")
Expand Down
6 changes: 0 additions & 6 deletions test_runner/regress/test_attach_tenant_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,6 @@ def positive_env(neon_env_builder: NeonEnvBuilder) -> NeonEnv:
[
# eviction might be the first one after an attach to access the layers
".*unexpectedly on-demand downloading remote layer .* for task kind Eviction",
# detach can happen before we get to validate the generation number
".*deletion backend: Dropped remote consistent LSN updates for tenant.*",
]
)
assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
Expand Down Expand Up @@ -58,10 +56,6 @@ def negative_env(neon_env_builder: NeonEnvBuilder) -> Generator[NegativeTests, N

env.pageserver.allowed_errors.extend(
[
# This fixture detaches the tenant, and tests using it will tend to re-attach it
# shortly after. There may be un-processed deletion_queue validations from the
# initial attachment
".*Dropped remote consistent LSN updates.*",
# This fixture is for tests that will intentionally generate 400 responses
".*Error processing HTTP request: Bad request",
]
Expand Down
5 changes: 0 additions & 5 deletions test_runner/regress/test_change_pageserver.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,6 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
)
env = neon_env_builder.init_start()

for pageserver in env.pageservers:
# This test dual-attaches a tenant, one of the pageservers will therefore
# be running with a stale generation.
pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*")

env.neon_cli.create_branch("test_change_pageserver")
endpoint = env.endpoints.create_start("test_change_pageserver")

Expand Down
3 changes: 0 additions & 3 deletions test_runner/regress/test_layers_from_future.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,6 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):

env = neon_env_builder.init_configs()
env.start()
env.pageserver.allowed_errors.extend(
[".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"]
)

ps_http = env.pageserver.http_client()

Expand Down
13 changes: 0 additions & 13 deletions test_runner/regress/test_pageserver_generations.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,10 +249,6 @@ def test_deferred_deletion(neon_env_builder: NeonEnvBuilder):
assert timeline["remote_consistent_lsn"] == timeline["remote_consistent_lsn_visible"]
assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0

main_pageserver.allowed_errors.extend(
[".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"]
)

# Now advance the generation in the control plane: subsequent validations
# from the running pageserver will fail. No more deletions should happen.
env.storage_controller.attach_hook_issue(env.initial_tenant, other_pageserver.id)
Expand Down Expand Up @@ -397,8 +393,6 @@ def assert_deletions_submitted(n: int) -> None:
# validated before restart.
assert get_deletion_queue_executed(ps_http) == before_restart_depth
else:
main_pageserver.allowed_errors.extend([".*Dropping stale deletions.*"])

# If we lost the attachment, we should have dropped our pre-restart deletions.
assert get_deletion_queue_dropped(ps_http) == before_restart_depth

Expand Down Expand Up @@ -553,13 +547,6 @@ def test_multi_attach(
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline

# We will intentionally create situations where stale deletions happen from non-latest-generation
# nodes when the tenant is multiply-attached
for ps in env.pageservers:
ps.allowed_errors.extend(
[".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"]
)

# Initially, the tenant will be attached to the first pageserver (first is default in our test harness)
wait_until(10, 0.2, lambda: assert_tenant_state(http_clients[0], tenant_id, "Active"))
_detail = http_clients[0].timeline_detail(tenant_id, timeline_id)
Expand Down
3 changes: 0 additions & 3 deletions test_runner/regress/test_pageserver_secondary.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,9 +83,6 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int):
for ps in env.pageservers:
ps.allowed_errors.extend(
[
# We will make no effort to avoid stale attachments
".*Dropped remote consistent LSN updates.*",
".*Dropping stale deletions.*",
# page_service_conn_main{peer_addr=[::1]:41176}: query handler for 'pagestream 3b19aec5038c796f64b430b30a555121 d07776761d44050b8aab511df1657d83' failed: Tenant 3b19aec5038c796f64b430b30a555121 not found
".*query handler.*Tenant.*not found.*",
# page_service_conn_main{peer_addr=[::1]:45552}: query handler for 'pagestream 414ede7ad50f775a8e7d9ba0e43b9efc a43884be16f44b3626482b6981b2c745' failed: Tenant 414ede7ad50f775a8e7d9ba0e43b9efc is not active
Expand Down
7 changes: 0 additions & 7 deletions test_runner/regress/test_remote_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,13 +355,6 @@ def churn_while_failpoints_active(result):
env.pageserver.stop(immediate=True)
env.endpoints.stop_all()

# We are about to forcibly drop local dirs. Storage controller will increment generation in re-attach before
# we later increment when actually attaching it again, leading to skipping a generation and potentially getting
# these warnings if there was a durable but un-executed deletion list at time of restart.
env.pageserver.allowed_errors.extend(
[".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"]
)

dir_to_clear = env.pageserver.tenant_dir()
shutil.rmtree(dir_to_clear)
os.mkdir(dir_to_clear)
Expand Down
4 changes: 0 additions & 4 deletions test_runner/regress/test_sharding.py
Original file line number Diff line number Diff line change
Expand Up @@ -1144,10 +1144,6 @@ def test_sharding_split_failures(
)

for ps in env.pageservers:
# When we do node failures and abandon a shard, it will de-facto have old generation and
# thereby be unable to publish remote consistent LSN updates
ps.allowed_errors.append(".*Dropped remote consistent LSN updates.*")

# If we're using a failure that will panic the storage controller, all background
# upcalls from the pageserver can fail
ps.allowed_errors.append(".*calling control plane generation validation API failed.*")
Expand Down
15 changes: 0 additions & 15 deletions test_runner/regress/test_storage_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,11 +60,6 @@ def test_storage_controller_smoke(
neon_env_builder.num_pageservers = 3
env = neon_env_builder.init_configs()

for pageserver in env.pageservers:
# This test detaches tenants during migration, which can race with deletion queue operations,
# during detach we only do an advisory flush, we don't wait for it.
pageserver.allowed_errors.extend([".*Dropped remote consistent LSN updates.*"])

# Start services by hand so that we can skip a pageserver (this will start + register later)
env.broker.try_start()
env.storage_controller.start()
Expand Down Expand Up @@ -484,9 +479,6 @@ def handler(request: Request):
# Start running
env = neon_env_builder.init_start()

# We will to an unclean migration, which will result in deletion queue warnings
env.pageservers[0].allowed_errors.append(".*Dropped remote consistent LSN updates for tenant.*")

# Initial notification from tenant creation
assert len(notifications) == 1
expect: Dict[str, Union[List[Dict[str, int]], str, None, int]] = {
Expand Down Expand Up @@ -1054,13 +1046,6 @@ def tenants_placed():
online_node_ids = set(range(1, len(env.pageservers) + 1)) - offline_node_ids

for node_id in offline_node_ids:
env.get_pageserver(node_id).allowed_errors.append(
# In the case of the failpoint failure, the impacted pageserver
# still believes it has the tenant attached since location
# config calls into it will fail due to being marked offline.
".*Dropped remote consistent LSN updates.*",
)

if len(offline_node_ids) > 1:
env.get_pageserver(node_id).allowed_errors.append(
".*Scheduling error when marking pageserver.*offline.*",
Expand Down
4 changes: 0 additions & 4 deletions test_runner/regress/test_tenant_conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,10 +320,6 @@ def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder):

assert not config_path.exists(), "detach did not remove config file"

# The re-attach's increment of the generation number may invalidate deletion queue
# updates in flight from the previous attachment.
env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*")

env.pageserver.tenant_attach(tenant_id)
wait_until(
number_of_iterations=5,
Expand Down
12 changes: 0 additions & 12 deletions test_runner/regress/test_tenant_detach.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,10 +76,6 @@ def test_tenant_reattach(neon_env_builder: NeonEnvBuilder, mode: str):

env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)

# Our re-attach may race with the deletion queue processing LSN updates
# from the original attachment.
env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*")

with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
with endpoint.cursor() as cur:
cur.execute("CREATE TABLE t(key int primary key, value text)")
Expand Down Expand Up @@ -349,10 +345,6 @@ def test_detach_while_attaching(

env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)

# Our re-attach may race with the deletion queue processing LSN updates
# from the original attachment.
env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*")

# Create table, and insert some rows. Make it big enough that it doesn't fit in
# shared_buffers, otherwise the SELECT after restart will just return answer
# from shared_buffers without hitting the page server, which defeats the point
Expand Down Expand Up @@ -422,10 +414,6 @@ def test_detach_while_activating(

env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)

# Our re-attach may race with the deletion queue processing LSN updates
# from the original attachment.
env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*")

data_id = 1
data_secret = "very secret secret"
insert_test_data(pageserver_http, tenant_id, timeline_id, data_id, data_secret, endpoint)
Expand Down
2 changes: 0 additions & 2 deletions test_runner/regress/test_tenant_relocation.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,8 +203,6 @@ def test_tenant_relocation(
[
# Needed for detach polling on the original pageserver
f".*NotFound: tenant {tenant_id}.*",
# We will dual-attach in this test, so stale generations are expected
".*Dropped remote consistent LSN updates.*",
]
)

Expand Down
4 changes: 0 additions & 4 deletions test_runner/regress/test_tenants.py
Original file line number Diff line number Diff line change
Expand Up @@ -386,10 +386,6 @@ def test_create_churn_during_restart(neon_env_builder: NeonEnvBuilder):
# generation nubmers out of order.
env.pageserver.allowed_errors.append(".*Generation .+ is less than existing .+")

# Our multiple creation requests will advance generation quickly, and when we skip
# a generation number we can generate these warnings
env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates for tenant .+")

# Timeline::flush_and_shutdown cannot tell if it is hitting a failure because of
# an incomplete attach, or some other problem. In the field this should be rare,
# so we allow it to log at WARN, even if it is occasionally a false positive.
Expand Down

0 comments on commit 7e2a3d2

Please sign in to comment.