-
Notifications
You must be signed in to change notification settings - Fork 456
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Emit nbtree vacuum cycle id in nbtree xlog through forced FPIs (#9932)
This fixes #9929. ## Postgres repo PRs: - PG17: neondatabase/postgres#538 - PG16: neondatabase/postgres#539 - PG15: neondatabase/postgres#540 - PG14: neondatabase/postgres#541 ## Problem see #9929 ## Summary of changes We update the split code to force the code to emit an FPI whenever the cycle ID might be interesting for concurrent btree vacuum.
- Loading branch information
Showing
6 changed files
with
132 additions
and
8 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
import threading | ||
import time | ||
|
||
from fixtures.neon_fixtures import NeonEnv | ||
|
||
# SQL that counts the pages of the t_uidx btree index carrying a non-zero
# vacuum cycle ID, grouped by the hex-encoded cycle ID value. The cycle ID
# is read as the last 2 bytes of each raw 8KiB page. Scanning starts at
# blkno 1, skipping block 0 (presumably the btree metapage — confirm).
# get_raw_page_at_lsn is provided by the neon_test_utils extension created
# by the test below.
BTREE_NUM_CYCLEID_PAGES = """
WITH raw_pages AS (
    SELECT blkno, get_raw_page_at_lsn('t_uidx', 'main', blkno, NULL, NULL) page
    FROM generate_series(1, pg_relation_size('t_uidx'::regclass) / 8192) blkno
),
parsed_pages AS (
    /* cycle ID is the last 2 bytes of the btree page */
    SELECT blkno, SUBSTRING(page FROM 8191 FOR 2) as cycle_id
    FROM raw_pages
)
SELECT count(*),
       encode(cycle_id, 'hex')
FROM parsed_pages
WHERE encode(cycle_id, 'hex') != '0000'
GROUP BY encode(cycle_id, 'hex');
"""
|
||
|
||
def test_nbtree_pagesplit_cycleid(neon_simple_env: NeonEnv):
    """Check that btree page splits performed while a btree vacuum is in
    progress receive that vacuum's cycle ID, and that the cycle ID reaches
    the pageserver through the WAL even with full_page_writes disabled.
    """
    env = neon_simple_env
    endpoint = env.endpoints.create_start("main")

    cur1 = endpoint.connect().cursor()
    # Disable background interference and full-page writes, so any cycle ID
    # observed later can only have arrived via the nbtree xlog path under
    # test; neon_test_utils provides neon_xlogflush / get_raw_page_at_lsn.
    for stmt in (
        "ALTER SYSTEM SET autovacuum = off;",
        "ALTER SYSTEM SET enable_seqscan = off;",
        "ALTER SYSTEM SET full_page_writes = off;",
        "SELECT pg_reload_conf();",
        "CREATE EXTENSION neon_test_utils;",
    ):
        cur1.execute(stmt)

    # Build a table whose unique index spans multiple pages.
    cur1.execute("CREATE TABLE t(id integer GENERATED ALWAYS AS IDENTITY, txt text);")
    cur1.execute("CREATE UNIQUE INDEX t_uidx ON t(id);")
    cur1.execute("INSERT INTO t (txt) SELECT i::text FROM generate_series(1, 2035) i;")

    # No vacuum has run yet, so no index page may carry a cycle ID.
    cur1.execute("SELECT neon_xlogflush();")
    cur1.execute(BTREE_NUM_CYCLEID_PAGES)
    pages = cur1.fetchall()
    assert (
        len(pages) == 0
    ), f"0 back splits with cycle ID expected, real {len(pages)} first {pages[0]}"

    # Delete one leaf page's worth of tuples (up to 407 rows fit on an 8KiB
    # page; 406 on non-rightmost leaves), then vacuum to clean the page up.
    cur1.execute("DELETE FROM t WHERE id <= 406;")
    cur1.execute("VACUUM (FREEZE, INDEX_CLEANUP ON) t;")

    # A second delete-then-index-cleanup cycle moves the emptied page from
    # "dead" to "reusable".
    cur1.execute("DELETE FROM t WHERE id <= 446;")
    cur1.execute("VACUUM (FREEZE, INDEX_CLEANUP ON) t;")

    # Leave garbage behind so the vacuum launched from session 3 below has
    # actual index cleanup work to do.
    cur1.execute("DELETE FROM t WHERE id <= 610;")

    # Flush WAL for checking purposes; still no cycle IDs expected.
    cur1.execute("SELECT neon_xlogflush();")
    cur1.execute(BTREE_NUM_CYCLEID_PAGES)
    pages = cur1.fetchall()
    assert len(pages) == 0, f"No back splits with cycle ID expected, got batches of {pages} instead"

    cur2 = endpoint.connect().cursor()
    cur3 = endpoint.connect().cursor()

    # Session 2 keeps a btree leaf page pinned through an open cursor.
    # Vacuum cannot process a pinned page, so the concurrent vacuum below
    # reliably stays in progress while session 1 splits pages.
    cur2.execute("BEGIN;")
    cur2.execute(
        "DECLARE foo NO SCROLL CURSOR FOR SELECT row_number() over () FROM t ORDER BY id ASC"
    )
    cur2.execute("FETCH FROM foo;")  # pins the leaf page with id 611
    vacuum_started = threading.Event()

    # Session 3's VACUUM will block on the pinned page, so it has to run on
    # its own thread. NOTE(review): this is inherently racy -- we assume the
    # vacuum reaches the pinned page within the sleep below; if the thread
    # (or its PostgreSQL backend) is scheduled >2s late, session 1 might
    # split pages before vacuum gets there and the final check could fail.
    def run_vacuum(cur, started: threading.Event):
        # Signal the main thread that the vacuum is about to begin.
        started.set()
        # Blocks until session 2 fetches past the pinned page.
        cur.execute("VACUUM (FREEZE, INDEX_CLEANUP on, DISABLE_PAGE_SKIPPING on) t;")

    vacuum_thread = threading.Thread(target=run_vacuum, args=(cur3, vacuum_started))
    vacuum_thread.start()
    vacuum_started.wait()
    # Give the vacuum extra time to actually reach the pinned page after
    # the event fired. Best effort only; see the race note above.
    time.sleep(2)

    # Insert 2 pages worth of new data while the vacuum is running. This
    # should reuse the one empty page plus a new page at the end of the
    # index, with split ordering old_blk -> blkno=1 -> old_blk + 1; those
    # splits should be stamped with cycle IDs where applicable.
    cur1.execute("INSERT INTO t (txt) SELECT i::text FROM generate_series(1, 812) i;")
    # Drain the cursor to unpin the page, letting session 3's vacuum finish.
    cur2.execute("FETCH ALL FROM foo;")
    cur2.execute("ROLLBACK;")
    # Flush WAL so the pageserver is up to date before the final check.
    cur1.execute("SELECT neon_xlogflush();")
    cur1.execute(BTREE_NUM_CYCLEID_PAGES)
    pages = cur1.fetchall()
    assert (
        len(pages) == 1 and pages[0][0] == 3
    ), f"3 page splits with cycle ID expected; actual {pages}"

    # Final cleanup.
    vacuum_thread.join()
    for cur in (cur1, cur2, cur3):
        cur.close()
Submodule postgres-v14
updated
2 files
+25 −1 | src/backend/access/nbtree/nbtinsert.c | |
+65 −0 | src/backend/access/nbtree/nbtxlog.c |
Submodule postgres-v15
updated
2 files
+25 −1 | src/backend/access/nbtree/nbtinsert.c | |
+69 −0 | src/backend/access/nbtree/nbtxlog.c |
Submodule postgres-v16
updated
2 files
+25 −1 | src/backend/access/nbtree/nbtinsert.c | |
+69 −0 | src/backend/access/nbtree/nbtxlog.c |
Submodule postgres-v17
updated
2 files
+25 −1 | src/backend/access/nbtree/nbtinsert.c | |
+69 −0 | src/backend/access/nbtree/nbtxlog.c |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,18 +1,18 @@ | ||
{ | ||
"v17": [ | ||
"17.2", | ||
"a10d95be67265e0f10a422ba0457f5a7af01de71" | ||
"471c449ab8f8ff5988b6bfb9eafa0a79772ad562" | ||
], | ||
"v16": [ | ||
"16.6", | ||
"dff6615a8e48a10bb17a03fa3c00635f1ace7a92" | ||
"81428621f7c04aed03671cf80a928e0a36d92505" | ||
], | ||
"v15": [ | ||
"15.10", | ||
"972e325e62b455957adbbdd8580e31275bb5b8c9" | ||
"8736b10c1d93d11b9c0489872dd529c4c0f5338f" | ||
], | ||
"v14": [ | ||
"14.15", | ||
"373f9decad933d2d46f321231032ae8b0da81acd" | ||
"13ff324150fceaac72920e01742addc053db9462" | ||
] | ||
} |
e71d20d
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
7223 tests run: 6898 passed, 0 failed, 325 skipped (full report)
Flaky tests (2)
Postgres 17
test_timeline_archival_chaos
: release-arm64test_check_visibility_map
: debug-x86-64Code coverage* (full report)
functions
:31.4% (8338 of 26537 functions)
lines
:47.7% (65639 of 137574 lines)
* collected from Rust tests only
e71d20d at 2024-12-10T21:57:20.707Z :recycle: