From e71d20d3928dd03e1159ec5a08cfa902cf85cd31 Mon Sep 17 00:00:00 2001
From: Matthias van de Meent
Date: Tue, 10 Dec 2024 20:42:52 +0100
Subject: [PATCH] Emit nbtree vacuum cycle id in nbtree xlog through forced FPIs (#9932)

This fixes neondatabase/neon#9929.

## Postgres repo PRs

- PG17: https://github.com/neondatabase/postgres/pull/538
- PG16: https://github.com/neondatabase/postgres/pull/539
- PG15: https://github.com/neondatabase/postgres/pull/540
- PG14: https://github.com/neondatabase/postgres/pull/541

## Problem

See #9929.

## Summary of changes

We update the nbtree page split code to force emission of an FPI whenever the
cycle ID might be relevant to a concurrent btree vacuum (a rough sketch of the
approach is appended after the patch).
---
 .../regress/test_nbtree_pagesplit_cycleid.py | 124 ++++++++++++++++++
 vendor/postgres-v14                          |   2 +-
 vendor/postgres-v15                          |   2 +-
 vendor/postgres-v16                          |   2 +-
 vendor/postgres-v17                          |   2 +-
 vendor/revisions.json                        |   8 +-
 6 files changed, 132 insertions(+), 8 deletions(-)
 create mode 100644 test_runner/regress/test_nbtree_pagesplit_cycleid.py

diff --git a/test_runner/regress/test_nbtree_pagesplit_cycleid.py b/test_runner/regress/test_nbtree_pagesplit_cycleid.py
new file mode 100644
index 000000000000..558557aebad6
--- /dev/null
+++ b/test_runner/regress/test_nbtree_pagesplit_cycleid.py
@@ -0,0 +1,124 @@
+import threading
+import time
+
+from fixtures.neon_fixtures import NeonEnv
+
+BTREE_NUM_CYCLEID_PAGES = """
+    WITH raw_pages AS (
+        SELECT blkno, get_raw_page_at_lsn('t_uidx', 'main', blkno, NULL, NULL) page
+        FROM generate_series(1, pg_relation_size('t_uidx'::regclass) / 8192) blkno
+    ),
+    parsed_pages AS (
+        /* cycle ID is the last 2 bytes of the btree page */
+        SELECT blkno, SUBSTRING(page FROM 8191 FOR 2) as cycle_id
+        FROM raw_pages
+    )
+    SELECT count(*),
+           encode(cycle_id, 'hex')
+    FROM parsed_pages
+    WHERE encode(cycle_id, 'hex') != '0000'
+    GROUP BY encode(cycle_id, 'hex');
+    """
+
+
+def test_nbtree_pagesplit_cycleid(neon_simple_env: NeonEnv):
+    env = neon_simple_env
+    endpoint = env.endpoints.create_start("main")
+
+    ses1 = endpoint.connect().cursor()
+    ses1.execute("ALTER SYSTEM SET autovacuum = off;")
+    ses1.execute("ALTER SYSTEM SET enable_seqscan = off;")
+    ses1.execute("ALTER SYSTEM SET full_page_writes = off;")
+    ses1.execute("SELECT pg_reload_conf();")
+    ses1.execute("CREATE EXTENSION neon_test_utils;")
+    # prepare a large index
+    ses1.execute("CREATE TABLE t(id integer GENERATED ALWAYS AS IDENTITY, txt text);")
+    ses1.execute("CREATE UNIQUE INDEX t_uidx ON t(id);")
+    ses1.execute("INSERT INTO t (txt) SELECT i::text FROM generate_series(1, 2035) i;")
+
+    ses1.execute("SELECT neon_xlogflush();")
+    ses1.execute(BTREE_NUM_CYCLEID_PAGES)
+    pages = ses1.fetchall()
+    assert (
+        len(pages) == 0
+    ), f"0 back splits with cycle ID expected, real {len(pages)} first {pages[0]}"
+    # Delete enough tuples to clear the first index page.
+    # (There are up to 407 rows per 8KiB page; 406 for non-rightmost leaves.)
+ ses1.execute("DELETE FROM t WHERE id <= 406;") + # Make sure the page is cleaned up + ses1.execute("VACUUM (FREEZE, INDEX_CLEANUP ON) t;") + + # Do another delete-then-indexcleanup cycle, to move the pages from + # "dead" to "reusable" + ses1.execute("DELETE FROM t WHERE id <= 446;") + ses1.execute("VACUUM (FREEZE, INDEX_CLEANUP ON) t;") + + # Make sure the vacuum we're about to trigger in s3 has cleanup work to do + ses1.execute("DELETE FROM t WHERE id <= 610;") + + # Flush wal, for checking purposes + ses1.execute("SELECT neon_xlogflush();") + ses1.execute(BTREE_NUM_CYCLEID_PAGES) + pages = ses1.fetchall() + assert len(pages) == 0, f"No back splits with cycle ID expected, got batches of {pages} instead" + + ses2 = endpoint.connect().cursor() + ses3 = endpoint.connect().cursor() + + # Session 2 pins a btree page, which prevents vacuum from processing that + # page, thus allowing us to reliably split pages while a concurrent vacuum + # is running. + ses2.execute("BEGIN;") + ses2.execute( + "DECLARE foo NO SCROLL CURSOR FOR SELECT row_number() over () FROM t ORDER BY id ASC" + ) + ses2.execute("FETCH FROM foo;") # pins the leaf page with id 611 + wait_evt = threading.Event() + + # Session 3 runs the VACUUM command. Note that this will block, and + # therefore must run on another thread. + # We rely on this running quickly enough to hit the pinned page from + # session 2 by the time we start other work again in session 1, but + # technically there is a race where the thread (and/or PostgreSQL process) + # don't get to that pinned page with vacuum until >2s after evt.set() was + # called, and session 1 thus might already have split pages. + def vacuum_freeze_t(ses3, evt: threading.Event): + # Begin parallel vacuum that should hit the index + evt.set() + # this'll hang until s2 fetches enough new data from its cursor. + # this is technically a race with the time.sleep(2) below, but if this + # command doesn't hit + ses3.execute("VACUUM (FREEZE, INDEX_CLEANUP on, DISABLE_PAGE_SKIPPING on) t;") + + ses3t = threading.Thread(target=vacuum_freeze_t, args=(ses3, wait_evt)) + ses3t.start() + wait_evt.wait() + # Make extra sure we got the thread started and vacuum is stuck, by waiting + # some time even after wait_evt got set. This isn't truly reliable (it is + # possible + time.sleep(2) + + # Insert 2 pages worth of new data. + # This should reuse the one empty page, plus another page at the end of + # the index relation; with split ordering + # old_blk -> blkno=1 -> old_blk + 1. + # As this is run while vacuum in session 3 is happening, these splits + # should receive cycle IDs where applicable. 
+ ses1.execute("INSERT INTO t (txt) SELECT i::text FROM generate_series(1, 812) i;") + # unpin the btree page, allowing s3's vacuum to complete + ses2.execute("FETCH ALL FROM foo;") + ses2.execute("ROLLBACK;") + # flush WAL to make sure PS is up-to-date + ses1.execute("SELECT neon_xlogflush();") + # check that our expectations are correct + ses1.execute(BTREE_NUM_CYCLEID_PAGES) + pages = ses1.fetchall() + assert ( + len(pages) == 1 and pages[0][0] == 3 + ), f"3 page splits with cycle ID expected; actual {pages}" + + # final cleanup + ses3t.join() + ses1.close() + ses2.close() + ses3.close() diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 373f9decad93..13ff324150fc 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 373f9decad933d2d46f321231032ae8b0da81acd +Subproject commit 13ff324150fceaac72920e01742addc053db9462 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 972e325e62b4..8736b10c1d93 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 972e325e62b455957adbbdd8580e31275bb5b8c9 +Subproject commit 8736b10c1d93d11b9c0489872dd529c4c0f5338f diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index dff6615a8e48..81428621f7c0 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit dff6615a8e48a10bb17a03fa3c00635f1ace7a92 +Subproject commit 81428621f7c04aed03671cf80a928e0a36d92505 diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index a10d95be6726..471c449ab8f8 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit a10d95be67265e0f10a422ba0457f5a7af01de71 +Subproject commit 471c449ab8f8ff5988b6bfb9eafa0a79772ad562 diff --git a/vendor/revisions.json b/vendor/revisions.json index 8a73e14dcff0..ba0f34e23ec5 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ "17.2", - "a10d95be67265e0f10a422ba0457f5a7af01de71" + "471c449ab8f8ff5988b6bfb9eafa0a79772ad562" ], "v16": [ "16.6", - "dff6615a8e48a10bb17a03fa3c00635f1ace7a92" + "81428621f7c04aed03671cf80a928e0a36d92505" ], "v15": [ "15.10", - "972e325e62b455957adbbdd8580e31275bb5b8c9" + "8736b10c1d93d11b9c0489872dd529c4c0f5338f" ], "v14": [ "14.15", - "373f9decad933d2d46f321231032ae8b0da81acd" + "13ff324150fceaac72920e01742addc053db9462" ] }