From 1b403cb4f97783f5e48e4067f370f3b807b633be Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 15 Sep 2023 20:52:43 +0000 Subject: [PATCH 1/7] test: disable compaction on timeline_checkpoint --- test_runner/regress/test_ondemand_download.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index 4041b8aab70b..db37df0afda4 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -323,8 +323,8 @@ def test_download_remote_layers_api( "compaction_period": "0s", # small checkpoint distance to create more delta layer files "checkpoint_distance": f"{1 * 1024 ** 2}", # 1 MB - "compaction_threshold": "1", - "image_creation_threshold": "1", + "compaction_threshold": "999999", + "image_creation_threshold": "999999", "compaction_target_size": f"{1 * 1024 ** 2}", # 1 MB } ) From e6925a94a187ae24fc4172483502741beda6eba8 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 15 Sep 2023 20:53:47 +0000 Subject: [PATCH 2/7] test: shutdown safekeepers earlier then compact, upload --- test_runner/regress/test_ondemand_download.py | 22 +++++++++++++------ 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index db37df0afda4..87892123f635 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -357,6 +357,21 @@ def get_resident_physical_size(): tenant_id, timeline_id, "pageserver_resident_physical_size" ) + # Shut down safekeepers before starting the pageserver. + # If we don't, the tenant's walreceiver handler will trigger the + # the logical size computation task, and that downloads layes, + # which makes our assertions on size fail. + for sk in env.safekeepers: + sk.stop() + # sk.stop(immediate=True) + + # it is sad we cannot do a flush inmem layer without compaction, but + # working around with very high layer0 count and image layer creation + # threshold + client.timeline_checkpoint(tenant_id, timeline_id) + + wait_for_upload_queue_empty(client, tenant_id, timeline_id) + filled_current_physical = get_api_current_physical_size() log.info(filled_current_physical) filled_size = get_resident_physical_size() @@ -371,13 +386,6 @@ def get_resident_physical_size(): log.info(f"unlinking layer {layer.name}") layer.unlink() - # Shut down safekeepers before starting the pageserver. - # If we don't, the tenant's walreceiver handler will trigger the - # the logical size computation task, and that downloads layes, - # which makes our assertions on size fail. - for sk in env.safekeepers: - sk.stop(immediate=True) - ##### Second start, restore the data and ensure it's the same env.pageserver.start(extra_env_vars={"FAILPOINTS": "remote-storage-download-pre-rename=return"}) env.pageserver.allowed_errors.extend( From dd2889f2e2ba5b338adb55d673dc05bdb3009287 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 15 Sep 2023 20:54:21 +0000 Subject: [PATCH 3/7] test: less naked log infos --- test_runner/regress/test_ondemand_download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index 87892123f635..47da07786a35 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -373,7 +373,7 @@ def get_resident_physical_size(): wait_for_upload_queue_empty(client, tenant_id, timeline_id) filled_current_physical = get_api_current_physical_size() - log.info(filled_current_physical) + log.info(f"filled_current_physical: {filled_current_physical}") filled_size = get_resident_physical_size() log.info(f"filled_size: {filled_size}") assert filled_current_physical == filled_size, "we don't yet do layer eviction" From a9143249974704138a2196bb12731aafdc85587d Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 15 Sep 2023 20:57:05 +0000 Subject: [PATCH 4/7] revert changes from a55a78a4535 --- test_runner/regress/test_ondemand_download.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index 47da07786a35..c2036e31e653 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -403,13 +403,8 @@ def get_resident_physical_size(): # witnessed for example difference of 29827072 (filled_current_physical) to 29868032 (here) is no good reason to fail a test. this_time = get_api_current_physical_size() assert ( - filled_current_physical <= this_time + filled_current_physical == this_time ), "current_physical_size is sum of loaded layer sizes, independent of whether local or remote" - if filled_current_physical != this_time: - log.info( - f"fixing up filled_current_physical from {filled_current_physical} to {this_time} ({this_time - filled_current_physical})" - ) - filled_current_physical = this_time post_unlink_size = get_resident_physical_size() log.info(f"post_unlink_size: {post_unlink_size}") From 6e132d71b15c32aba2738b4b447be9026b7d2453 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 15 Sep 2023 21:52:01 +0000 Subject: [PATCH 5/7] temp: always fail on test_download_remote_layers_api --- test_runner/regress/test_ondemand_download.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index c2036e31e653..493097dc806c 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -475,6 +475,8 @@ def get_resident_physical_size(): with endpoint_old.cursor() as cur: assert query_scalar(cur, "select count(*) from testtab") == table_len + raise RuntimeError("made it to the end of test_download_remote_layers_api :tada:") + @pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.MOCK_S3]) def test_compaction_downloads_on_demand_without_image_creation( From 260e1b9f37a6ca8c4b099a52e684ec005eff5be1 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Sat, 16 Sep 2023 10:29:21 +0000 Subject: [PATCH 6/7] Revert "temp: always fail on test_download_remote_layers_api" This reverts commit 6e132d71b15c32aba2738b4b447be9026b7d2453. --- test_runner/regress/test_ondemand_download.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index 493097dc806c..c2036e31e653 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -475,8 +475,6 @@ def get_resident_physical_size(): with endpoint_old.cursor() as cur: assert query_scalar(cur, "select count(*) from testtab") == table_len - raise RuntimeError("made it to the end of test_download_remote_layers_api :tada:") - @pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.MOCK_S3]) def test_compaction_downloads_on_demand_without_image_creation( From e249c8a9ad98b78fbb0c77d4706a78a3a3e6a414 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Sat, 16 Sep 2023 14:00:02 +0000 Subject: [PATCH 7/7] cleanup: test_download_remote_layers_api --- test_runner/regress/test_ondemand_download.py | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index c2036e31e653..a38a51710036 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -358,12 +358,9 @@ def get_resident_physical_size(): ) # Shut down safekeepers before starting the pageserver. - # If we don't, the tenant's walreceiver handler will trigger the - # the logical size computation task, and that downloads layes, - # which makes our assertions on size fail. + # If we don't, they might stream us more WAL. for sk in env.safekeepers: sk.stop() - # sk.stop(immediate=True) # it is sad we cannot do a flush inmem layer without compaction, but # working around with very high layer0 count and image layer creation @@ -381,7 +378,6 @@ def get_resident_physical_size(): env.pageserver.stop() # remove all the layer files - # XXX only delete some of the layer files, to show that it really just downloads all the layers for layer in env.pageserver.tenant_dir().glob("*/timelines/*/*-*_*"): log.info(f"unlinking layer {layer.name}") layer.unlink() @@ -399,8 +395,6 @@ def get_resident_physical_size(): ###### Phase 1: exercise download error code path - # comparison here is requiring the size to be at least the previous size, because it's possible received WAL after last_flush_lsn_upload - # witnessed for example difference of 29827072 (filled_current_physical) to 29868032 (here) is no good reason to fail a test. this_time = get_api_current_physical_size() assert ( filled_current_physical == this_time @@ -411,15 +405,11 @@ def get_resident_physical_size(): assert ( post_unlink_size < filled_size ), "we just deleted layers and didn't cause anything to re-download them yet" - assert filled_size - post_unlink_size > 5 * ( - 1024**2 - ), "we may be downloading some layers as part of tenant activation" # issue downloads that we know will fail info = client.timeline_download_remote_layers( tenant_id, timeline_id, - # allow some concurrency to unveil potential concurrency bugs max_concurrent_downloads=10, errors_ok=True, at_least_one_download=False, @@ -428,9 +418,9 @@ def get_resident_physical_size(): assert info["state"] == "Completed" assert info["total_layer_count"] > 0 assert info["successful_download_count"] == 0 - assert ( - info["failed_download_count"] > 0 - ) # can't assert == total_layer_count because attach + tenant status downloads some layers + # can't assert == total_layer_count because timeline_detail also tries to + # download layers for logical size, but this might not always hold. + assert info["failed_download_count"] > 0 assert ( info["total_layer_count"] == info["successful_download_count"] + info["failed_download_count"] @@ -439,7 +429,6 @@ def get_resident_physical_size(): assert ( get_resident_physical_size() == post_unlink_size ), "didn't download anything new due to failpoint" - # would be nice to assert that the layers in the layer map are still RemoteLayer ##### Retry, this time without failpoints client.configure_failpoints(("remote-storage-download-pre-rename", "off"))