From e71d20d3928dd03e1159ec5a08cfa902cf85cd31 Mon Sep 17 00:00:00 2001
From: Matthias van de Meent
Date: Tue, 10 Dec 2024 20:42:52 +0100
Subject: [PATCH] Emit nbtree vacuum cycle id in nbtree xlog through forced FPIs (#9932)

This fixes neondatabase/neon#9929.

## Postgres repo PRs

- PG17: https://github.com/neondatabase/postgres/pull/538
- PG16: https://github.com/neondatabase/postgres/pull/539
- PG15: https://github.com/neondatabase/postgres/pull/540
- PG14: https://github.com/neondatabase/postgres/pull/541

## Problem

See #9929.

## Summary of changes

We update the nbtree page split code to force emission of an FPI whenever the
cycle ID might be relevant to a concurrent btree vacuum (a rough sketch of the
approach is appended after the patch).
---
 .../regress/test_nbtree_pagesplit_cycleid.py | 124 ++++++++++++++++++
 vendor/postgres-v14                          |   2 +-
 vendor/postgres-v15                          |   2 +-
 vendor/postgres-v16                          |   2 +-
 vendor/postgres-v17                          |   2 +-
 vendor/revisions.json                        |   8 +-
 6 files changed, 132 insertions(+), 8 deletions(-)
 create mode 100644 test_runner/regress/test_nbtree_pagesplit_cycleid.py

diff --git a/test_runner/regress/test_nbtree_pagesplit_cycleid.py b/test_runner/regress/test_nbtree_pagesplit_cycleid.py
new file mode 100644
index 000000000000..558557aebad6
--- /dev/null
+++ b/test_runner/regress/test_nbtree_pagesplit_cycleid.py
@@ -0,0 +1,124 @@
+import threading
+import time
+
+from fixtures.neon_fixtures import NeonEnv
+
+BTREE_NUM_CYCLEID_PAGES = """
+    WITH raw_pages AS (
+        SELECT blkno, get_raw_page_at_lsn('t_uidx', 'main', blkno, NULL, NULL) page
+        FROM generate_series(1, pg_relation_size('t_uidx'::regclass) / 8192) blkno
+    ),
+    parsed_pages AS (
+        /* cycle ID is the last 2 bytes of the btree page */
+        SELECT blkno, SUBSTRING(page FROM 8191 FOR 2) as cycle_id
+        FROM raw_pages
+    )
+    SELECT count(*),
+           encode(cycle_id, 'hex')
+    FROM parsed_pages
+    WHERE encode(cycle_id, 'hex') != '0000'
+    GROUP BY encode(cycle_id, 'hex');
+    """
+
+
+def test_nbtree_pagesplit_cycleid(neon_simple_env: NeonEnv):
+    env = neon_simple_env
+    endpoint = env.endpoints.create_start("main")
+
+    ses1 = endpoint.connect().cursor()
+    ses1.execute("ALTER SYSTEM SET autovacuum = off;")
+    ses1.execute("ALTER SYSTEM SET enable_seqscan = off;")
+    ses1.execute("ALTER SYSTEM SET full_page_writes = off;")
+    ses1.execute("SELECT pg_reload_conf();")
+    ses1.execute("CREATE EXTENSION neon_test_utils;")
+    # prepare a large index
+    ses1.execute("CREATE TABLE t(id integer GENERATED ALWAYS AS IDENTITY, txt text);")
+    ses1.execute("CREATE UNIQUE INDEX t_uidx ON t(id);")
+    ses1.execute("INSERT INTO t (txt) SELECT i::text FROM generate_series(1, 2035) i;")
+
+    ses1.execute("SELECT neon_xlogflush();")
+    ses1.execute(BTREE_NUM_CYCLEID_PAGES)
+    pages = ses1.fetchall()
+    assert (
+        len(pages) == 0
+    ), f"0 back splits with cycle ID expected, real {len(pages)} first {pages[0]}"
+    # Delete enough tuples to clear the first index page.
+    # (There are up to 407 rows per 8KiB page; 406 for non-rightmost leaves.)
+ ses1.execute("DELETE FROM t WHERE id <= 406;") + # Make sure the page is cleaned up + ses1.execute("VACUUM (FREEZE, INDEX_CLEANUP ON) t;") + + # Do another delete-then-indexcleanup cycle, to move the pages from + # "dead" to "reusable" + ses1.execute("DELETE FROM t WHERE id <= 446;") + ses1.execute("VACUUM (FREEZE, INDEX_CLEANUP ON) t;") + + # Make sure the vacuum we're about to trigger in s3 has cleanup work to do + ses1.execute("DELETE FROM t WHERE id <= 610;") + + # Flush wal, for checking purposes + ses1.execute("SELECT neon_xlogflush();") + ses1.execute(BTREE_NUM_CYCLEID_PAGES) + pages = ses1.fetchall() + assert len(pages) == 0, f"No back splits with cycle ID expected, got batches of {pages} instead" + + ses2 = endpoint.connect().cursor() + ses3 = endpoint.connect().cursor() + + # Session 2 pins a btree page, which prevents vacuum from processing that + # page, thus allowing us to reliably split pages while a concurrent vacuum + # is running. + ses2.execute("BEGIN;") + ses2.execute( + "DECLARE foo NO SCROLL CURSOR FOR SELECT row_number() over () FROM t ORDER BY id ASC" + ) + ses2.execute("FETCH FROM foo;") # pins the leaf page with id 611 + wait_evt = threading.Event() + + # Session 3 runs the VACUUM command. Note that this will block, and + # therefore must run on another thread. + # We rely on this running quickly enough to hit the pinned page from + # session 2 by the time we start other work again in session 1, but + # technically there is a race where the thread (and/or PostgreSQL process) + # don't get to that pinned page with vacuum until >2s after evt.set() was + # called, and session 1 thus might already have split pages. + def vacuum_freeze_t(ses3, evt: threading.Event): + # Begin parallel vacuum that should hit the index + evt.set() + # this'll hang until s2 fetches enough new data from its cursor. + # this is technically a race with the time.sleep(2) below, but if this + # command doesn't hit + ses3.execute("VACUUM (FREEZE, INDEX_CLEANUP on, DISABLE_PAGE_SKIPPING on) t;") + + ses3t = threading.Thread(target=vacuum_freeze_t, args=(ses3, wait_evt)) + ses3t.start() + wait_evt.wait() + # Make extra sure we got the thread started and vacuum is stuck, by waiting + # some time even after wait_evt got set. This isn't truly reliable (it is + # possible + time.sleep(2) + + # Insert 2 pages worth of new data. + # This should reuse the one empty page, plus another page at the end of + # the index relation; with split ordering + # old_blk -> blkno=1 -> old_blk + 1. + # As this is run while vacuum in session 3 is happening, these splits + # should receive cycle IDs where applicable. 
+ ses1.execute("INSERT INTO t (txt) SELECT i::text FROM generate_series(1, 812) i;") + # unpin the btree page, allowing s3's vacuum to complete + ses2.execute("FETCH ALL FROM foo;") + ses2.execute("ROLLBACK;") + # flush WAL to make sure PS is up-to-date + ses1.execute("SELECT neon_xlogflush();") + # check that our expectations are correct + ses1.execute(BTREE_NUM_CYCLEID_PAGES) + pages = ses1.fetchall() + assert ( + len(pages) == 1 and pages[0][0] == 3 + ), f"3 page splits with cycle ID expected; actual {pages}" + + # final cleanup + ses3t.join() + ses1.close() + ses2.close() + ses3.close() diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 373f9decad93..13ff324150fc 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 373f9decad933d2d46f321231032ae8b0da81acd +Subproject commit 13ff324150fceaac72920e01742addc053db9462 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 972e325e62b4..8736b10c1d93 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 972e325e62b455957adbbdd8580e31275bb5b8c9 +Subproject commit 8736b10c1d93d11b9c0489872dd529c4c0f5338f diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index dff6615a8e48..81428621f7c0 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit dff6615a8e48a10bb17a03fa3c00635f1ace7a92 +Subproject commit 81428621f7c04aed03671cf80a928e0a36d92505 diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index a10d95be6726..471c449ab8f8 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit a10d95be67265e0f10a422ba0457f5a7af01de71 +Subproject commit 471c449ab8f8ff5988b6bfb9eafa0a79772ad562 diff --git a/vendor/revisions.json b/vendor/revisions.json index 8a73e14dcff0..ba0f34e23ec5 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ "17.2", - "a10d95be67265e0f10a422ba0457f5a7af01de71" + "471c449ab8f8ff5988b6bfb9eafa0a79772ad562" ], "v16": [ "16.6", - "dff6615a8e48a10bb17a03fa3c00635f1ace7a92" + "81428621f7c04aed03671cf80a928e0a36d92505" ], "v15": [ "15.10", - "972e325e62b455957adbbdd8580e31275bb5b8c9" + "8736b10c1d93d11b9c0489872dd529c4c0f5338f" ], "v14": [ "14.15", - "373f9decad933d2d46f321231032ae8b0da81acd" + "13ff324150fceaac72920e01742addc053db9462" ] }