From 587e382d7b6113718d5211ccbd62d154c79fb6ce Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?=
Date: Mon, 13 May 2024 23:36:10 +0200
Subject: [PATCH 01/11] Move generate_uploads_and_deletions to
 neon_fixtures.py

---
 test_runner/fixtures/neon_fixtures.py        | 78 +++++++++++++++++++
 .../regress/test_pageserver_generations.py   | 77 +-----------------
 2 files changed, 79 insertions(+), 76 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 84326553704d..bbfd2706d67f 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -59,6 +59,7 @@
 from fixtures.pageserver.utils import (
     wait_for_last_record_lsn,
     wait_for_upload,
+    wait_for_upload_queue_empty,
 )
 from fixtures.pg_version import PgVersion
 from fixtures.port_distributor import PortDistributor
@@ -79,6 +80,7 @@
     allure_attach_from_dir,
     assert_no_errors,
     get_self_dir,
+    print_gc_result,
     subprocess_capture,
     wait_until,
 )
@@ -4401,3 +4403,79 @@ def parse_project_git_version_output(s: str) -> str:
             return commit
 
     raise ValueError(f"unable to parse --version output: '{s}'")
+
+
+def generate_uploads_and_deletions(
+    env: NeonEnv,
+    *,
+    init: bool = True,
+    tenant_id: Optional[TenantId] = None,
+    timeline_id: Optional[TimelineId] = None,
+    data: Optional[str] = None,
+    pageserver: NeonPageserver,
+):
+    """
+    Using the environment's default tenant + timeline, generate a load pattern
+    that results in some uploads and some deletions to remote storage.
+    """
+
+    if tenant_id is None:
+        tenant_id = env.initial_tenant
+    assert tenant_id is not None
+
+    if timeline_id is None:
+        timeline_id = env.initial_timeline
+    assert timeline_id is not None
+
+    ps_http = pageserver.http_client()
+
+    with env.endpoints.create_start(
+        "main", tenant_id=tenant_id, pageserver_id=pageserver.id
+    ) as endpoint:
+        if init:
+            endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)")
+            last_flush_lsn_upload(
+                env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id
+            )
+
+        def churn(data):
+            endpoint.safe_psql_many(
+                [
+                    f"""
+                INSERT INTO foo (id, val)
+                SELECT g, '{data}'
+                FROM generate_series(1, 200) g
+                ON CONFLICT (id) DO UPDATE
+                SET val = EXCLUDED.val
+                """,
+                    # to ensure that GC can actually remove some layers
+                    "VACUUM foo",
+                ]
+            )
+            assert tenant_id is not None
+            assert timeline_id is not None
+            # We are waiting for uploads as well as local flush, in order to avoid leaving the system
+            # in a state where there are "future layers" in remote storage that will generate deletions
+            # after a restart.
+            last_flush_lsn_upload(
+                env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id
+            )
+
+        # Compaction should generate some GC-eligible layers
+        for i in range(0, 2):
+            churn(f"{i if data is None else data}")
+
+        gc_result = ps_http.timeline_gc(tenant_id, timeline_id, 0)
+        print_gc_result(gc_result)
+        assert gc_result["layers_removed"] > 0
+
+        # Stop endpoint and flush all data to pageserver, then checkpoint it: this
+        # ensures that the pageserver is in a fully idle state: there will be no more
+        # background ingest, no more uploads pending, and therefore no non-determinism
+        # in subsequent actions like pageserver restarts.
+        final_lsn = flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id, pageserver.id)
+        ps_http.timeline_checkpoint(tenant_id, timeline_id)
+        # Finish uploads
+        wait_for_upload(ps_http, tenant_id, timeline_id, final_lsn)
+        # Finish all remote writes (including deletions)
+        wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id)
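With the helper exported from neon_fixtures.py, any test can drive upload/deletion churn
against a chosen pageserver. A minimal sketch of a caller (the test name is illustrative;
neon_env_builder and init_start are the standard fixtures used by the tests below):

    from fixtures.neon_fixtures import NeonEnvBuilder, generate_uploads_and_deletions

    def test_example_churn(neon_env_builder: NeonEnvBuilder):
        env = neon_env_builder.init_start()
        # Churn the initial tenant/timeline; the helper itself runs GC and
        # asserts that at least one layer was removed.
        generate_uploads_and_deletions(env, pageserver=env.pageserver)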
diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py
index a38bcd45da2b..fc231f0592be 100644
--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -25,6 +25,7 @@
     PgBin,
     S3Scrubber,
     flush_ep_to_pageserver,
+    generate_uploads_and_deletions,
     last_flush_lsn_upload,
 )
 from fixtures.pageserver.http import PageserverApiException
@@ -59,82 +60,6 @@
 }
 
 
-def generate_uploads_and_deletions(
-    env: NeonEnv,
-    *,
-    init: bool = True,
-    tenant_id: Optional[TenantId] = None,
-    timeline_id: Optional[TimelineId] = None,
-    data: Optional[str] = None,
-    pageserver: NeonPageserver,
-):
-    """
-    Using the environment's default tenant + timeline, generate a load pattern
-    that results in some uploads and some deletions to remote storage.
-    """
-
-    if tenant_id is None:
-        tenant_id = env.initial_tenant
-    assert tenant_id is not None
-
-    if timeline_id is None:
-        timeline_id = env.initial_timeline
-    assert timeline_id is not None
-
-    ps_http = pageserver.http_client()
-
-    with env.endpoints.create_start(
-        "main", tenant_id=tenant_id, pageserver_id=pageserver.id
-    ) as endpoint:
-        if init:
-            endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)")
-            last_flush_lsn_upload(
-                env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id
-            )
-
-        def churn(data):
-            endpoint.safe_psql_many(
-                [
-                    f"""
-                INSERT INTO foo (id, val)
-                SELECT g, '{data}'
-                FROM generate_series(1, 200) g
-                ON CONFLICT (id) DO UPDATE
-                SET val = EXCLUDED.val
-                """,
-                    # to ensure that GC can actually remove some layers
-                    "VACUUM foo",
-                ]
-            )
-            assert tenant_id is not None
-            assert timeline_id is not None
-            # We are waiting for uploads as well as local flush, in order to avoid leaving the system
-            # in a state where there are "future layers" in remote storage that will generate deletions
-            # after a restart.
-            last_flush_lsn_upload(
-                env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id
-            )
-
-        # Compaction should generate some GC-elegible layers
-        for i in range(0, 2):
-            churn(f"{i if data is None else data}")
-
-        gc_result = ps_http.timeline_gc(tenant_id, timeline_id, 0)
-        print_gc_result(gc_result)
-        assert gc_result["layers_removed"] > 0
-
-        # Stop endpoint and flush all data to pageserver, then checkpoint it: this
-        # ensures that the pageserver is in a fully idle state: there will be no more
-        # background ingest, no more uploads pending, and therefore no non-determinism
-        # in subsequent actions like pageserver restarts.
-        final_lsn = flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id, pageserver.id)
-        ps_http.timeline_checkpoint(tenant_id, timeline_id)
-        # Finish uploads
-        wait_for_upload(ps_http, tenant_id, timeline_id, final_lsn)
-        # Finish all remote writes (including deletions)
-        wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id)
-
-
 def read_all(
     env: NeonEnv, tenant_id: Optional[TenantId] = None, timeline_id: Optional[TimelineId] = None
 ):
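After the move, test modules import the helper from the shared fixtures module instead of
defining it locally; patch 02 below does exactly that:

    from fixtures.neon_fixtures import generate_uploads_and_deletions

The imports that became unused in test_pageserver_generations.py as a result are cleaned
up by ruff in patch 04.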
From 970dc8991561a67590c338bf69f40f3254b5a191 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?=
Date: Mon, 13 May 2024 23:48:09 +0200
Subject: [PATCH 02/11] Add test_uploads_and_deletions test

---
 test_runner/regress/test_compaction.py | 46 +++++++++++++++++++++++++-
 1 file changed, 45 insertions(+), 1 deletion(-)

diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py
index 43a332346226..6c1e70ceb0bc 100644
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -1,10 +1,11 @@
+import enum
 import json
 import os
 from typing import Optional
 
 import pytest
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnvBuilder
+from fixtures.neon_fixtures import NeonEnvBuilder, generate_uploads_and_deletions
 from fixtures.workload import Workload
 
 AGGRESIVE_COMPACTION_TENANT_CONF = {
@@ -190,3 +191,46 @@ def test_sharding_compaction(
 
     # Assert that everything is still readable
     workload.validate()
+
+
+class CompactionAlgorithm(str, enum.Enum):
+    LEGACY = "Legacy"
+    TIERED = "Tiered"
+
+
+@pytest.mark.parametrize(
+    "compaction_algorithm", [CompactionAlgorithm.LEGACY, CompactionAlgorithm.TIERED]
+)
+def test_uploads_and_deletions(
+    neon_env_builder: NeonEnvBuilder,
+    compaction_algorithm: CompactionAlgorithm,
+):
+    """
+    :param compaction_algorithm: the compaction algorithm to use.
+    """
+    neon_env_builder.num_pageservers = 1
+
+    tenant_conf = {
+        # small checkpointing and compaction targets to ensure we generate many upload operations
+        "checkpoint_distance": f"{128 * 1024}",
+        "compaction_threshold": "1",
+        "compaction_target_size": f"{128 * 1024}",
+        # no PITR horizon, we specify the horizon when we request on-demand GC
+        "pitr_interval": "0s",
+        # disable background compaction and GC. We invoke it manually when we want it to happen.
+        "gc_period": "0s",
+        "compaction_period": "0s",
+        # create image layers eagerly, so that GC can remove some layers
+        "image_creation_threshold": "1",
+        "image_layer_creation_check_threshold": "0",
+        "compaction_algorithm": f'{{"kind": "{compaction_algorithm.value}"}}',
+    }
+    env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf)
+
+    # TODO remove these allowed errors
+    # https://github.com/neondatabase/neon/issues/7707
+    env.pageserver.allowed_errors.extend(
+        [".*duplicated L1 layer.*", ".*delta layer created with.*duplicate values.*"]
+    )
+
+    generate_uploads_and_deletions(env, pageserver=env.pageserver)
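The f-string above needs doubled braces to emit literal JSON; patch 03 below switches to
json.dumps, which produces the identical string without the escaping. A quick
self-contained check:

    import json

    kind = "Tiered"
    # Both spellings yield the same tenant-config value:
    assert f'{{"kind": "{kind}"}}' == json.dumps({"kind": kind})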
+ "gc_period": "0s", + "compaction_period": "0s", + # create image layers eagerly, so that GC can remove some layers + "image_creation_threshold": "1", + "image_layer_creation_check_threshold": "0", + "compaction_algorithm": f'{{"kind": "{compaction_algorithm.value}"}}', + } + env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf) + + # TODO remove these allowed errors + # https://github.com/neondatabase/neon/issues/7707 + env.pageserver.allowed_errors.extend( + [".*duplicated L1 layer.*", ".*delta layer created with.*duplicate values.*"] + ) + + generate_uploads_and_deletions(env, pageserver=env.pageserver) From c6a20eb31b40b344c74d1b987d9d1e21ef614e0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 14 May 2024 18:08:22 +0200 Subject: [PATCH 03/11] Review comments --- test_runner/regress/test_compaction.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 6c1e70ceb0bc..012a9360c85e 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -208,7 +208,6 @@ def test_uploads_and_deletions( """ :param compaction_algorithm: the compaction algorithm to use. """ - neon_env_builder.num_pageservers = 1 tenant_conf = { # small checkpointing and compaction targets to ensure we generate many upload operations @@ -223,7 +222,7 @@ def test_uploads_and_deletions( # create image layers eagerly, so that GC can remove some layers "image_creation_threshold": "1", "image_layer_creation_check_threshold": "0", - "compaction_algorithm": f'{{"kind": "{compaction_algorithm.value}"}}', + "compaction_algorithm": json.dumps({"kind": compaction_algorithm.value}), } env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf) @@ -234,3 +233,5 @@ def test_uploads_and_deletions( ) generate_uploads_and_deletions(env, pageserver=env.pageserver) + + env.pageserver.assert_log_contains("duplicated L1 layer") From f5611239b0b31dfde0804ac885f56f675fc09961 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 14 May 2024 18:20:49 +0200 Subject: [PATCH 04/11] ruff check --fix --- test_runner/regress/test_pageserver_generations.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index fc231f0592be..f94a1d4b18e0 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -21,12 +21,9 @@ from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, - NeonPageserver, PgBin, S3Scrubber, - flush_ep_to_pageserver, generate_uploads_and_deletions, - last_flush_lsn_upload, ) from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import ( @@ -34,12 +31,11 @@ list_prefix, wait_for_last_record_lsn, wait_for_upload, - wait_for_upload_queue_empty, ) from fixtures.remote_storage import ( RemoteStorageKind, ) -from fixtures.utils import print_gc_result, wait_until +from fixtures.utils import wait_until from fixtures.workload import Workload # A tenant configuration that is convenient for generating uploads and deletions From d69287cfd162b3849ae7bb0b0c0963a190bcf8d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 14 May 2024 18:22:42 +0200 Subject: [PATCH 05/11] Allow assertion failed error as well --- test_runner/regress/test_compaction.py | 6 +++++- 1 file changed, 5 insertions(+), 1 
From d69287cfd162b3849ae7bb0b0c0963a190bcf8d5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?=
Date: Tue, 14 May 2024 18:22:42 +0200
Subject: [PATCH 05/11] Allow assertion failed error as well

---
 test_runner/regress/test_compaction.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py
index 012a9360c85e..7c3bfee67f41 100644
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -229,7 +229,11 @@ def test_uploads_and_deletions(
     # TODO remove these allowed errors
     # https://github.com/neondatabase/neon/issues/7707
     env.pageserver.allowed_errors.extend(
-        [".*duplicated L1 layer.*", ".*delta layer created with.*duplicate values.*"]
+        [
+            ".*duplicated L1 layer.*",
+            ".*delta layer created with.*duplicate values.*",
+            ".*assertion failed: self.lsn_range.start <= lsn.*",
+        ]
     )
 
     generate_uploads_and_deletions(env, pageserver=env.pageserver)

From ba14a1472247ddd9d8fe295477df1ed54d5f86e0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?=
Date: Tue, 14 May 2024 18:42:04 +0200
Subject: [PATCH 06/11] Add comments

---
 test_runner/regress/test_compaction.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py
index 7c3bfee67f41..4a759cb8f726 100644
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -228,6 +228,7 @@ def test_uploads_and_deletions(
 
     # TODO remove these allowed errors
     # https://github.com/neondatabase/neon/issues/7707
+    # https://github.com/neondatabase/neon/issues/7759
     env.pageserver.allowed_errors.extend(
         [
             ".*duplicated L1 layer.*",
@@ -238,4 +239,5 @@ def test_uploads_and_deletions(
 
     generate_uploads_and_deletions(env, pageserver=env.pageserver)
 
-    env.pageserver.assert_log_contains("duplicated L1 layer")
+    # We would like to assert on "duplicated L1 layer" here, but there are other errors that
+    # might occur flakily. Therefore, don't assert anything.

From 20046c0f870bcd9f2d58d2fb8c875557136ae49c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?=
Date: Wed, 15 May 2024 02:00:45 +0200
Subject: [PATCH 07/11] Allow this error log as well

---
 test_runner/regress/test_compaction.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py
index 4a759cb8f726..769670f113ff 100644
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -234,6 +234,7 @@ def test_uploads_and_deletions(
             ".*duplicated L1 layer.*",
             ".*delta layer created with.*duplicate values.*",
             ".*assertion failed: self.lsn_range.start <= lsn.*",
+            ".*HTTP request handler task panicked: task.*panicked.*",
         ]
     )
 

From cb57e308f28de051aaf12fa2dd3514160b678341 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?=
Date: Wed, 15 May 2024 13:10:26 +0200
Subject: [PATCH 08/11] surround generate_uploads_and_deletions with try

If the "assertion failed: self.lsn_range.start" assertion fires, the
resulting panic in the pageserver makes the HTTP request fail, which
surfaces as a PageserverApiException in the Python test.
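The handling the diff below adds boils down to this pattern (a sketch;
PageserverApiException is imported from fixtures.pageserver.http):

    try:
        generate_uploads_and_deletions(env, pageserver=env.pageserver)
    except PageserverApiException as e:
        # The panic surfaces as an API error; log it instead of failing the test.
        log.info(f"Obtained PageserverApiException: {e}")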
---
 test_runner/regress/test_compaction.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py
index 769670f113ff..0bcb1306c8b0 100644
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -7,6 +7,7 @@
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder, generate_uploads_and_deletions
 from fixtures.workload import Workload
+from fixtures.pageserver.http import PageserverApiException
 
 AGGRESIVE_COMPACTION_TENANT_CONF = {
     # Disable gc and compaction. The test runs compaction manually.
@@ -238,7 +239,10 @@ def test_uploads_and_deletions(
         ]
     )
 
-    generate_uploads_and_deletions(env, pageserver=env.pageserver)
+    try:
+        generate_uploads_and_deletions(env, pageserver=env.pageserver)
+    except PageserverApiException as e:
+        log.info(f"Obtained PageserverApiException: {e}")
 
     # We would like to assert on "duplicated L1 layer" here, but there are other errors that
     # might occur flakily. Therefore, don't assert anything.

From ea88d28ab39102135664aa4793f08e0e51b9e0e3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?=
Date: Wed, 15 May 2024 13:18:28 +0200
Subject: [PATCH 09/11] Require one of the allowed errors

---
 test_runner/regress/test_compaction.py | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py
index 0bcb1306c8b0..c121c1de3c88 100644
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -230,19 +230,20 @@ def test_uploads_and_deletions(
     # TODO remove these allowed errors
     # https://github.com/neondatabase/neon/issues/7707
     # https://github.com/neondatabase/neon/issues/7759
-    env.pageserver.allowed_errors.extend(
-        [
-            ".*duplicated L1 layer.*",
-            ".*delta layer created with.*duplicate values.*",
-            ".*assertion failed: self.lsn_range.start <= lsn.*",
-            ".*HTTP request handler task panicked: task.*panicked.*",
-        ]
-    )
+    allowed_errors = [
+        ".*duplicated L1 layer.*",
+        ".*delta layer created with.*duplicate values.*",
+        ".*assertion failed: self.lsn_range.start <= lsn.*",
+        ".*HTTP request handler task panicked: task.*panicked.*",
+    ]
+    env.pageserver.allowed_errors.extend(allowed_errors)
 
     try:
         generate_uploads_and_deletions(env, pageserver=env.pageserver)
     except PageserverApiException as e:
         log.info(f"Obtained PageserverApiException: {e}")
 
-    # We would like to assert on "duplicated L1 layer" here, but there are other errors that
-    # might occur flakily. Therefore, don't assert anything.
+    # The errors occur flakily and none of them is guaranteed to occur,
+    # but at least one of them should.
+    if not any(env.pageserver.log_contains(e) for e in allowed_errors):
+        raise Exception("None of the allowed_errors occurred in the log")
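The check added above requires at least one allowed pattern to appear. A self-contained
model of the same logic, assuming log_contains(pattern) reports whether the regex matches
some line of pageserver.log (here emulated with re.search over plain strings):

    import re

    log_lines = ["INFO startup complete", "ERROR duplicated L1 layer found"]
    allowed_errors = [".*duplicated L1 layer.*", ".*delta layer created with.*duplicate values.*"]
    # True as soon as any pattern matches any line:
    assert any(re.search(p, line) for p in allowed_errors for line in log_lines)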
From 0ba65f3134936e67d389beaeceb2b2291922ae7c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?=
Date: Wed, 15 May 2024 13:18:59 +0200
Subject: [PATCH 10/11] fix order

---
 test_runner/regress/test_compaction.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py
index c121c1de3c88..d3c28f08cbb6 100644
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -6,8 +6,8 @@
 import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder, generate_uploads_and_deletions
-from fixtures.workload import Workload
 from fixtures.pageserver.http import PageserverApiException
+from fixtures.workload import Workload
 
 AGGRESIVE_COMPACTION_TENANT_CONF = {
     # Disable gc and compaction. The test runs compaction manually.

From 568e2a2f348b371db2aacfbd4787df7c8cf78c9e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?=
Date: Wed, 15 May 2024 14:10:33 +0200
Subject: [PATCH 11/11] This was only meant for tiered compaction mode

---
 test_runner/regress/test_compaction.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py
index d3c28f08cbb6..93a16620a348 100644
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -236,7 +236,8 @@ def test_uploads_and_deletions(
         ".*assertion failed: self.lsn_range.start <= lsn.*",
         ".*HTTP request handler task panicked: task.*panicked.*",
     ]
-    env.pageserver.allowed_errors.extend(allowed_errors)
+    if compaction_algorithm == CompactionAlgorithm.TIERED:
+        env.pageserver.allowed_errors.extend(allowed_errors)
 
     try:
         generate_uploads_and_deletions(env, pageserver=env.pageserver)
@@ -245,5 +246,7 @@ def test_uploads_and_deletions(
 
     # The errors occur flakily and none of them is guaranteed to occur,
     # but at least one of them should.
-    if not any(env.pageserver.log_contains(e) for e in allowed_errors):
-        raise Exception("None of the allowed_errors occurred in the log")
+    if compaction_algorithm == CompactionAlgorithm.TIERED:
+        found_allowed_error = any(env.pageserver.log_contains(e) for e in allowed_errors)
+        if not found_allowed_error:
+            raise Exception("None of the allowed_errors occurred in the log")
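Reassembled from the hunks above, the tail of test_uploads_and_deletions after all eleven
patches reads as follows (a sketch for orientation, not an authoritative listing):

    if compaction_algorithm == CompactionAlgorithm.TIERED:
        env.pageserver.allowed_errors.extend(allowed_errors)

    try:
        generate_uploads_and_deletions(env, pageserver=env.pageserver)
    except PageserverApiException as e:
        log.info(f"Obtained PageserverApiException: {e}")

    # The errors occur flakily and none of them is guaranteed to occur,
    # but at least one of them should.
    if compaction_algorithm == CompactionAlgorithm.TIERED:
        found_allowed_error = any(env.pageserver.log_contains(e) for e in allowed_errors)
        if not found_allowed_error:
            raise Exception("None of the allowed_errors occurred in the log")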