From 2451969d5c533ab4ca085cdce12fdd79d55cca8b Mon Sep 17 00:00:00 2001 From: Rahul Patil Date: Fri, 13 Dec 2024 13:22:15 +0100 Subject: [PATCH 01/56] fix(ci): Allow github-action-script to post reports (#10136) Allow github-action-script to post reports. Failed CI: https://github.com/neondatabase/neon/actions/runs/12304655364/job/34342554049#step:13:514 --- .github/workflows/build_and_test.yml | 15 +++++++++++++++ .github/workflows/periodic_pagebench.yml | 8 +++++--- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index a3943cba9127..b3556debe3fa 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -303,6 +303,11 @@ jobs: benchmarks: if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') needs: [ check-permissions, build-and-test-locally, build-build-tools-image, get-benchmarks-durations ] + permissions: + id-token: write # aws-actions/configure-aws-credentials + statuses: write + contents: write + pull-requests: write runs-on: [ self-hosted, small ] container: image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm @@ -343,6 +348,11 @@ jobs: report-benchmarks-failures: needs: [ benchmarks, create-test-report ] if: github.ref_name == 'main' && failure() && needs.benchmarks.result == 'failure' + permissions: + id-token: write # aws-actions/configure-aws-credentials + statuses: write + contents: write + pull-requests: write runs-on: ubuntu-22.04 steps: @@ -1024,6 +1034,11 @@ jobs: trigger-custom-extensions-build-and-wait: needs: [ check-permissions, tag ] runs-on: ubuntu-22.04 + permissions: + id-token: write # aws-actions/configure-aws-credentials + statuses: write + contents: write + pull-requests: write steps: - name: Set PR's status to pending and request a remote CI test run: | diff --git a/.github/workflows/periodic_pagebench.yml b/.github/workflows/periodic_pagebench.yml index 9f5a16feca6e..049990f17beb 100644 --- a/.github/workflows/periodic_pagebench.yml +++ b/.github/workflows/periodic_pagebench.yml @@ -21,15 +21,17 @@ defaults: run: shell: bash -euo pipefail {0} -permissions: - id-token: write # aws-actions/configure-aws-credentials - concurrency: group: ${{ github.workflow }} cancel-in-progress: false jobs: trigger_bench_on_ec2_machine_in_eu_central_1: + permissions: + id-token: write # aws-actions/configure-aws-credentials + statuses: write + contents: write + pull-requests: write runs-on: [ self-hosted, small ] container: image: neondatabase/build-tools:pinned-bookworm From 7dc382601cb9f2c508f11acdac897a8a0bbbdce3 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Fri, 13 Dec 2024 14:59:04 +0100 Subject: [PATCH 02/56] Fix pg_regress tests on a cloud staging instance (#10134) ## Problem pg_regress tests start failing due to unique ids added to Neon error messages ## Summary of changes Patches updated --- .github/workflows/cloud-regress.yml | 1 + compute/patches/cloud_regress_pg16.patch | 173 ++++++++++++---------- compute/patches/cloud_regress_pg17.patch | 179 ++++++++++++----------- 3 files changed, 190 insertions(+), 163 deletions(-) diff --git a/.github/workflows/cloud-regress.yml b/.github/workflows/cloud-regress.yml index 2fc26baa212a..7b9e434ec3ff 100644 --- a/.github/workflows/cloud-regress.yml +++ b/.github/workflows/cloud-regress.yml @@ -99,6 +99,7 @@ jobs: BENCHMARK_CONNSTR: ${{steps.create-branch.outputs.dsn}} - name: Delete branch + if: 
always() uses: ./.github/actions/neon-branch-delete with: api_key: ${{ secrets.NEON_STAGING_API_KEY }} diff --git a/compute/patches/cloud_regress_pg16.patch b/compute/patches/cloud_regress_pg16.patch index a4b93d0260a3..3f0bb84ae737 100644 --- a/compute/patches/cloud_regress_pg16.patch +++ b/compute/patches/cloud_regress_pg16.patch @@ -981,7 +981,7 @@ index fc42d418bf..e38f517574 100644 CREATE SCHEMA addr_nsp; SET search_path TO 'addr_nsp'; diff --git a/src/test/regress/expected/password.out b/src/test/regress/expected/password.out -index 8475231735..1afae5395f 100644 +index 8475231735..0653946337 100644 --- a/src/test/regress/expected/password.out +++ b/src/test/regress/expected/password.out @@ -12,11 +12,11 @@ SET password_encryption = 'md5'; -- ok @@ -1006,65 +1006,63 @@ index 8475231735..1afae5395f 100644 -----------------+--------------------------------------------------- - regress_passwd1 | md5783277baca28003b33453252be4dbb34 - regress_passwd2 | md54044304ba511dd062133eb5b4b84a2a3 -+ regress_passwd1 | NEON_MD5_PLACEHOLDER_regress_passwd1 -+ regress_passwd2 | NEON_MD5_PLACEHOLDER_regress_passwd2 ++ regress_passwd1 | NEON_MD5_PLACEHOLDER:regress_passwd1 ++ regress_passwd2 | NEON_MD5_PLACEHOLDER:regress_passwd2 regress_passwd3 | SCRAM-SHA-256$4096:$: - regress_passwd4 | + regress_passwd4 | SCRAM-SHA-256$4096:$: (4 rows) -- Rename a role -@@ -54,24 +54,30 @@ ALTER ROLE regress_passwd2_new RENAME TO regress_passwd2; +@@ -54,24 +54,16 @@ ALTER ROLE regress_passwd2_new RENAME TO regress_passwd2; -- passwords. SET password_encryption = 'md5'; -- encrypt with MD5 -ALTER ROLE regress_passwd2 PASSWORD 'foo'; +--- already encrypted, use as they are +-ALTER ROLE regress_passwd1 PASSWORD 'md5cd3578025fe2c3d7ed1b9a9b26238b70'; +-ALTER ROLE regress_passwd3 PASSWORD 'SCRAM-SHA-256$4096:VLK4RMaQLCvNtQ==$6YtlR4t69SguDiwFvbVgVZtuz6gpJQQqUMZ7IQJK5yI=:ps75jrHeYU4lXCcXI4O8oIdJ3eO8o2jirjruw9phBTo='; +ALTER ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER; - -- already encrypted, use as they are - ALTER ROLE regress_passwd1 PASSWORD 'md5cd3578025fe2c3d7ed1b9a9b26238b70'; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} - ALTER ROLE regress_passwd3 PASSWORD 'SCRAM-SHA-256$4096:VLK4RMaQLCvNtQ==$6YtlR4t69SguDiwFvbVgVZtuz6gpJQQqUMZ7IQJK5yI=:ps75jrHeYU4lXCcXI4O8oIdJ3eO8o2jirjruw9phBTo='; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} SET password_encryption = 'scram-sha-256'; -- create SCRAM secret -ALTER ROLE regress_passwd4 PASSWORD 'foo'; +--- already encrypted with MD5, use as it is +-CREATE ROLE regress_passwd5 PASSWORD 'md5e73a4b11df52a6068f8b39f90be36023'; +--- This looks like a valid SCRAM-SHA-256 secret, but it is not +--- so it should be hashed with SCRAM-SHA-256. +-CREATE ROLE regress_passwd6 PASSWORD 'SCRAM-SHA-256$1234'; +--- These may look like valid MD5 secrets, but they are not, so they +--- should be hashed with SCRAM-SHA-256. 
+--- trailing garbage at the end +-CREATE ROLE regress_passwd7 PASSWORD 'md5012345678901234567890123456789zz'; +--- invalid length +-CREATE ROLE regress_passwd8 PASSWORD 'md501234567890123456789012345678901zz'; +ALTER ROLE regress_passwd4 PASSWORD NEON_PASSWORD_PLACEHOLDER; - -- already encrypted with MD5, use as it is - CREATE ROLE regress_passwd5 PASSWORD 'md5e73a4b11df52a6068f8b39f90be36023'; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} - -- This looks like a valid SCRAM-SHA-256 secret, but it is not - -- so it should be hashed with SCRAM-SHA-256. - CREATE ROLE regress_passwd6 PASSWORD 'SCRAM-SHA-256$1234'; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} - -- These may look like valid MD5 secrets, but they are not, so they - -- should be hashed with SCRAM-SHA-256. - -- trailing garbage at the end - CREATE ROLE regress_passwd7 PASSWORD 'md5012345678901234567890123456789zz'; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} - -- invalid length - CREATE ROLE regress_passwd8 PASSWORD 'md501234567890123456789012345678901zz'; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} ++-- Neon does not support encrypted passwords, use unencrypted instead ++CREATE ROLE regress_passwd5 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++-- Neon does not support encrypted passwords, use unencrypted instead ++CREATE ROLE regress_passwd6 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd7 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd8 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- Changing the SCRAM iteration count SET scram_iterations = 1024; CREATE ROLE regress_passwd9 PASSWORD 'alterediterationcount'; -@@ -81,63 +87,67 @@ SELECT rolname, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+ +@@ -81,11 +73,11 @@ SELECT rolname, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+ ORDER BY rolname, rolpassword; rolname | rolpassword_masked -----------------+--------------------------------------------------- - regress_passwd1 | md5cd3578025fe2c3d7ed1b9a9b26238b70 - regress_passwd2 | md5dfa155cadd5f4ad57860162f3fab9cdb -+ regress_passwd1 | NEON_MD5_PLACEHOLDER_regress_passwd1 -+ regress_passwd2 | NEON_MD5_PLACEHOLDER_regress_passwd2 ++ regress_passwd1 | NEON_MD5_PLACEHOLDER:regress_passwd1 ++ regress_passwd2 | NEON_MD5_PLACEHOLDER:regress_passwd2 regress_passwd3 | SCRAM-SHA-256$4096:$: regress_passwd4 | SCRAM-SHA-256$4096:$: - regress_passwd5 | md5e73a4b11df52a6068f8b39f90be36023 -- regress_passwd6 | SCRAM-SHA-256$4096:$: -- regress_passwd7 | SCRAM-SHA-256$4096:$: -- regress_passwd8 | SCRAM-SHA-256$4096:$: - regress_passwd9 | SCRAM-SHA-256$1024:$: --(9 rows) -+(5 rows) - ++ regress_passwd5 | SCRAM-SHA-256$4096:$: + regress_passwd6 | SCRAM-SHA-256$4096:$: + regress_passwd7 | SCRAM-SHA-256$4096:$: + regress_passwd8 | SCRAM-SHA-256$4096:$: +@@ -95,23 +87,20 @@ SELECT rolname, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+ -- An empty password is not allowed, in any form CREATE ROLE regress_passwd_empty PASSWORD ''; NOTICE: empty string is not a valid password, clearing password @@ -1082,56 +1080,37 @@ index 8475231735..1afae5395f 100644 -(1 row) +(0 rows) - -- Test with invalid stored and server keys. - -- - -- The first is valid, to act as a control. 
The others have too long - -- stored/server keys. They will be re-hashed. - CREATE ROLE regress_passwd_sha_len0 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} - CREATE ROLE regress_passwd_sha_len1 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96RqwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} - CREATE ROLE regress_passwd_sha_len2 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA='; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} +--- Test with invalid stored and server keys. +--- +--- The first is valid, to act as a control. The others have too long +--- stored/server keys. They will be re-hashed. +-CREATE ROLE regress_passwd_sha_len0 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; +-CREATE ROLE regress_passwd_sha_len1 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96RqwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; +-CREATE ROLE regress_passwd_sha_len2 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA='; ++-- Neon does not support encrypted passwords, use unencrypted instead ++CREATE ROLE regress_passwd_sha_len0 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd_sha_len1 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd_sha_len2 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- Check that the invalid secrets were re-hashed. A re-hashed secret -- should not contain the original salt. 
SELECT rolname, rolpassword not like '%A6xHKoH/494E941doaPOYg==%' as is_rolpassword_rehashed - FROM pg_authid - WHERE rolname LIKE 'regress_passwd_sha_len%' +@@ -120,7 +109,7 @@ SELECT rolname, rolpassword not like '%A6xHKoH/494E941doaPOYg==%' as is_rolpassw ORDER BY rolname; -- rolname | is_rolpassword_rehashed ---------------------------+------------------------- + rolname | is_rolpassword_rehashed + -------------------------+------------------------- - regress_passwd_sha_len0 | f -- regress_passwd_sha_len1 | t -- regress_passwd_sha_len2 | t --(3 rows) -+ rolname | is_rolpassword_rehashed -+---------+------------------------- -+(0 rows) - - DROP ROLE regress_passwd1; - DROP ROLE regress_passwd2; - DROP ROLE regress_passwd3; - DROP ROLE regress_passwd4; - DROP ROLE regress_passwd5; -+ERROR: role "regress_passwd5" does not exist - DROP ROLE regress_passwd6; -+ERROR: role "regress_passwd6" does not exist - DROP ROLE regress_passwd7; -+ERROR: role "regress_passwd7" does not exist ++ regress_passwd_sha_len0 | t + regress_passwd_sha_len1 | t + regress_passwd_sha_len2 | t + (3 rows) +@@ -135,6 +124,7 @@ DROP ROLE regress_passwd7; DROP ROLE regress_passwd8; -+ERROR: role "regress_passwd8" does not exist DROP ROLE regress_passwd9; DROP ROLE regress_passwd_empty; +ERROR: role "regress_passwd_empty" does not exist DROP ROLE regress_passwd_sha_len0; -+ERROR: role "regress_passwd_sha_len0" does not exist DROP ROLE regress_passwd_sha_len1; -+ERROR: role "regress_passwd_sha_len1" does not exist DROP ROLE regress_passwd_sha_len2; -+ERROR: role "regress_passwd_sha_len2" does not exist - -- all entries should have been removed - SELECT rolname, rolpassword - FROM pg_authid diff --git a/src/test/regress/expected/privileges.out b/src/test/regress/expected/privileges.out index 5b9dba7b32..cc408dad42 100644 --- a/src/test/regress/expected/privileges.out @@ -3194,7 +3173,7 @@ index 1a6c61f49d..1c31ac6a53 100644 -- Test generic object addressing/identification functions CREATE SCHEMA addr_nsp; diff --git a/src/test/regress/sql/password.sql b/src/test/regress/sql/password.sql -index 53e86b0b6c..f07cf1ec54 100644 +index 53e86b0b6c..0303fdfe96 100644 --- a/src/test/regress/sql/password.sql +++ b/src/test/regress/sql/password.sql @@ -10,11 +10,11 @@ SET password_encryption = 'scram-sha-256'; -- ok @@ -3213,23 +3192,59 @@ index 53e86b0b6c..f07cf1ec54 100644 -- check list of created entries -- -@@ -42,14 +42,14 @@ ALTER ROLE regress_passwd2_new RENAME TO regress_passwd2; +@@ -42,26 +42,18 @@ ALTER ROLE regress_passwd2_new RENAME TO regress_passwd2; SET password_encryption = 'md5'; -- encrypt with MD5 -ALTER ROLE regress_passwd2 PASSWORD 'foo'; +--- already encrypted, use as they are +-ALTER ROLE regress_passwd1 PASSWORD 'md5cd3578025fe2c3d7ed1b9a9b26238b70'; +-ALTER ROLE regress_passwd3 PASSWORD 'SCRAM-SHA-256$4096:VLK4RMaQLCvNtQ==$6YtlR4t69SguDiwFvbVgVZtuz6gpJQQqUMZ7IQJK5yI=:ps75jrHeYU4lXCcXI4O8oIdJ3eO8o2jirjruw9phBTo='; +ALTER ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER; - -- already encrypted, use as they are - ALTER ROLE regress_passwd1 PASSWORD 'md5cd3578025fe2c3d7ed1b9a9b26238b70'; - ALTER ROLE regress_passwd3 PASSWORD 'SCRAM-SHA-256$4096:VLK4RMaQLCvNtQ==$6YtlR4t69SguDiwFvbVgVZtuz6gpJQQqUMZ7IQJK5yI=:ps75jrHeYU4lXCcXI4O8oIdJ3eO8o2jirjruw9phBTo='; SET password_encryption = 'scram-sha-256'; -- create SCRAM secret -ALTER ROLE regress_passwd4 PASSWORD 'foo'; +--- already encrypted with MD5, use as it is +-CREATE ROLE regress_passwd5 PASSWORD 'md5e73a4b11df52a6068f8b39f90be36023'; +ALTER ROLE 
regress_passwd4 PASSWORD NEON_PASSWORD_PLACEHOLDER; - -- already encrypted with MD5, use as it is - CREATE ROLE regress_passwd5 PASSWORD 'md5e73a4b11df52a6068f8b39f90be36023'; ++-- Neon does not support encrypted passwords, use unencrypted instead ++CREATE ROLE regress_passwd5 PASSWORD NEON_PASSWORD_PLACEHOLDER; + +--- This looks like a valid SCRAM-SHA-256 secret, but it is not +--- so it should be hashed with SCRAM-SHA-256. +-CREATE ROLE regress_passwd6 PASSWORD 'SCRAM-SHA-256$1234'; +--- These may look like valid MD5 secrets, but they are not, so they +--- should be hashed with SCRAM-SHA-256. +--- trailing garbage at the end +-CREATE ROLE regress_passwd7 PASSWORD 'md5012345678901234567890123456789zz'; +--- invalid length +-CREATE ROLE regress_passwd8 PASSWORD 'md501234567890123456789012345678901zz'; ++-- Neon does not support encrypted passwords, use unencrypted instead ++CREATE ROLE regress_passwd6 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd7 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd8 PASSWORD NEON_PASSWORD_PLACEHOLDER; + + -- Changing the SCRAM iteration count + SET scram_iterations = 1024; +@@ -78,13 +70,10 @@ ALTER ROLE regress_passwd_empty PASSWORD 'md585939a5ce845f1a1b620742e3c659e0a'; + ALTER ROLE regress_passwd_empty PASSWORD 'SCRAM-SHA-256$4096:hpFyHTUsSWcR7O9P$LgZFIt6Oqdo27ZFKbZ2nV+vtnYM995pDh9ca6WSi120=:qVV5NeluNfUPkwm7Vqat25RjSPLkGeoZBQs6wVv+um4='; + SELECT rolpassword FROM pg_authid WHERE rolname='regress_passwd_empty'; +--- Test with invalid stored and server keys. +--- +--- The first is valid, to act as a control. The others have too long +--- stored/server keys. They will be re-hashed. +-CREATE ROLE regress_passwd_sha_len0 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; +-CREATE ROLE regress_passwd_sha_len1 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96RqwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; +-CREATE ROLE regress_passwd_sha_len2 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA='; ++-- Neon does not support encrypted passwords, use unencrypted instead ++CREATE ROLE regress_passwd_sha_len0 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd_sha_len1 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd_sha_len2 PASSWORD NEON_PASSWORD_PLACEHOLDER; + + -- Check that the invalid secrets were re-hashed. A re-hashed secret + -- should not contain the original salt. 
diff --git a/src/test/regress/sql/privileges.sql b/src/test/regress/sql/privileges.sql index 249df17a58..b258e7f26a 100644 --- a/src/test/regress/sql/privileges.sql diff --git a/compute/patches/cloud_regress_pg17.patch b/compute/patches/cloud_regress_pg17.patch index cbe84ef54be7..e57447a2c6ee 100644 --- a/compute/patches/cloud_regress_pg17.patch +++ b/compute/patches/cloud_regress_pg17.patch @@ -1014,10 +1014,10 @@ index fc42d418bf..e38f517574 100644 CREATE SCHEMA addr_nsp; SET search_path TO 'addr_nsp'; diff --git a/src/test/regress/expected/password.out b/src/test/regress/expected/password.out -index 924d6e001d..5966531db6 100644 +index 924d6e001d..7fdda73439 100644 --- a/src/test/regress/expected/password.out +++ b/src/test/regress/expected/password.out -@@ -12,13 +12,13 @@ SET password_encryption = 'md5'; -- ok +@@ -12,13 +12,11 @@ SET password_encryption = 'md5'; -- ok SET password_encryption = 'scram-sha-256'; -- ok -- consistency of password entries SET password_encryption = 'md5'; @@ -1026,9 +1026,7 @@ index 924d6e001d..5966531db6 100644 -CREATE ROLE regress_passwd2; -ALTER ROLE regress_passwd2 PASSWORD 'role_pwd2'; +CREATE ROLE regress_passwd1 PASSWORD NEON_PASSWORD_PLACEHOLDER; -+ALTER ROLE regress_passwd1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER; -+ALTER ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER; SET password_encryption = 'scram-sha-256'; -CREATE ROLE regress_passwd3 PASSWORD 'role_pwd3'; -CREATE ROLE regress_passwd4 PASSWORD NULL; @@ -1037,71 +1035,69 @@ index 924d6e001d..5966531db6 100644 -- check list of created entries -- -- The scram secret will look something like: -@@ -32,10 +32,10 @@ SELECT rolname, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+ +@@ -32,10 +30,10 @@ SELECT rolname, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+ ORDER BY rolname, rolpassword; rolname | rolpassword_masked -----------------+--------------------------------------------------- - regress_passwd1 | md5783277baca28003b33453252be4dbb34 - regress_passwd2 | md54044304ba511dd062133eb5b4b84a2a3 -+ regress_passwd1 | NEON_MD5_PLACEHOLDER_regress_passwd1 -+ regress_passwd2 | NEON_MD5_PLACEHOLDER_regress_passwd2 ++ regress_passwd1 | NEON_MD5_PLACEHOLDER:regress_passwd1 ++ regress_passwd2 | NEON_MD5_PLACEHOLDER:regress_passwd2 regress_passwd3 | SCRAM-SHA-256$4096:$: - regress_passwd4 | + regress_passwd4 | SCRAM-SHA-256$4096:$: (4 rows) -- Rename a role -@@ -56,24 +56,30 @@ ALTER ROLE regress_passwd2_new RENAME TO regress_passwd2; +@@ -56,24 +54,17 @@ ALTER ROLE regress_passwd2_new RENAME TO regress_passwd2; -- passwords. 
SET password_encryption = 'md5'; -- encrypt with MD5 -ALTER ROLE regress_passwd2 PASSWORD 'foo'; +--- already encrypted, use as they are +-ALTER ROLE regress_passwd1 PASSWORD 'md5cd3578025fe2c3d7ed1b9a9b26238b70'; +-ALTER ROLE regress_passwd3 PASSWORD 'SCRAM-SHA-256$4096:VLK4RMaQLCvNtQ==$6YtlR4t69SguDiwFvbVgVZtuz6gpJQQqUMZ7IQJK5yI=:ps75jrHeYU4lXCcXI4O8oIdJ3eO8o2jirjruw9phBTo='; +ALTER ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER; - -- already encrypted, use as they are - ALTER ROLE regress_passwd1 PASSWORD 'md5cd3578025fe2c3d7ed1b9a9b26238b70'; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} - ALTER ROLE regress_passwd3 PASSWORD 'SCRAM-SHA-256$4096:VLK4RMaQLCvNtQ==$6YtlR4t69SguDiwFvbVgVZtuz6gpJQQqUMZ7IQJK5yI=:ps75jrHeYU4lXCcXI4O8oIdJ3eO8o2jirjruw9phBTo='; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} SET password_encryption = 'scram-sha-256'; -- create SCRAM secret -ALTER ROLE regress_passwd4 PASSWORD 'foo'; +ALTER ROLE regress_passwd4 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- already encrypted with MD5, use as it is - CREATE ROLE regress_passwd5 PASSWORD 'md5e73a4b11df52a6068f8b39f90be36023'; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} - -- This looks like a valid SCRAM-SHA-256 secret, but it is not - -- so it should be hashed with SCRAM-SHA-256. - CREATE ROLE regress_passwd6 PASSWORD 'SCRAM-SHA-256$1234'; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} - -- These may look like valid MD5 secrets, but they are not, so they - -- should be hashed with SCRAM-SHA-256. - -- trailing garbage at the end - CREATE ROLE regress_passwd7 PASSWORD 'md5012345678901234567890123456789zz'; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} - -- invalid length - CREATE ROLE regress_passwd8 PASSWORD 'md501234567890123456789012345678901zz'; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} +-CREATE ROLE regress_passwd5 PASSWORD 'md5e73a4b11df52a6068f8b39f90be36023'; +--- This looks like a valid SCRAM-SHA-256 secret, but it is not +--- so it should be hashed with SCRAM-SHA-256. +-CREATE ROLE regress_passwd6 PASSWORD 'SCRAM-SHA-256$1234'; +--- These may look like valid MD5 secrets, but they are not, so they +--- should be hashed with SCRAM-SHA-256. 
+--- trailing garbage at the end +-CREATE ROLE regress_passwd7 PASSWORD 'md5012345678901234567890123456789zz'; +--- invalid length +-CREATE ROLE regress_passwd8 PASSWORD 'md501234567890123456789012345678901zz'; ++-- Neon does not support encrypted passwords, use unencrypted instead ++CREATE ROLE regress_passwd5 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++-- Neon does not support encrypted passwords, use unencrypted instead ++CREATE ROLE regress_passwd6 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd7 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd8 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- Changing the SCRAM iteration count SET scram_iterations = 1024; CREATE ROLE regress_passwd9 PASSWORD 'alterediterationcount'; -@@ -83,63 +89,67 @@ SELECT rolname, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+ +@@ -83,11 +74,11 @@ SELECT rolname, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+ ORDER BY rolname, rolpassword; rolname | rolpassword_masked -----------------+--------------------------------------------------- - regress_passwd1 | md5cd3578025fe2c3d7ed1b9a9b26238b70 - regress_passwd2 | md5dfa155cadd5f4ad57860162f3fab9cdb -+ regress_passwd1 | NEON_MD5_PLACEHOLDER_regress_passwd1 -+ regress_passwd2 | NEON_MD5_PLACEHOLDER_regress_passwd2 ++ regress_passwd1 | NEON_MD5_PLACEHOLDER:regress_passwd1 ++ regress_passwd2 | NEON_MD5_PLACEHOLDER:regress_passwd2 regress_passwd3 | SCRAM-SHA-256$4096:$: regress_passwd4 | SCRAM-SHA-256$4096:$: - regress_passwd5 | md5e73a4b11df52a6068f8b39f90be36023 -- regress_passwd6 | SCRAM-SHA-256$4096:$: -- regress_passwd7 | SCRAM-SHA-256$4096:$: -- regress_passwd8 | SCRAM-SHA-256$4096:$: - regress_passwd9 | SCRAM-SHA-256$1024:$: --(9 rows) -+(5 rows) - ++ regress_passwd5 | SCRAM-SHA-256$4096:$: + regress_passwd6 | SCRAM-SHA-256$4096:$: + regress_passwd7 | SCRAM-SHA-256$4096:$: + regress_passwd8 | SCRAM-SHA-256$4096:$: +@@ -97,23 +88,20 @@ SELECT rolname, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+ -- An empty password is not allowed, in any form CREATE ROLE regress_passwd_empty PASSWORD ''; NOTICE: empty string is not a valid password, clearing password @@ -1119,56 +1115,37 @@ index 924d6e001d..5966531db6 100644 -(1 row) +(0 rows) - -- Test with invalid stored and server keys. - -- - -- The first is valid, to act as a control. The others have too long - -- stored/server keys. They will be re-hashed. - CREATE ROLE regress_passwd_sha_len0 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} - CREATE ROLE regress_passwd_sha_len1 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96RqwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} - CREATE ROLE regress_passwd_sha_len2 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA='; -+ERROR: Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"} +--- Test with invalid stored and server keys. 
+--- +--- The first is valid, to act as a control. The others have too long +--- stored/server keys. They will be re-hashed. +-CREATE ROLE regress_passwd_sha_len0 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; +-CREATE ROLE regress_passwd_sha_len1 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96RqwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; +-CREATE ROLE regress_passwd_sha_len2 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA='; ++-- Neon does not support encrypted passwords, use unencrypted instead ++CREATE ROLE regress_passwd_sha_len0 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd_sha_len1 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd_sha_len2 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- Check that the invalid secrets were re-hashed. A re-hashed secret -- should not contain the original salt. SELECT rolname, rolpassword not like '%A6xHKoH/494E941doaPOYg==%' as is_rolpassword_rehashed - FROM pg_authid - WHERE rolname LIKE 'regress_passwd_sha_len%' +@@ -122,7 +110,7 @@ SELECT rolname, rolpassword not like '%A6xHKoH/494E941doaPOYg==%' as is_rolpassw ORDER BY rolname; -- rolname | is_rolpassword_rehashed ---------------------------+------------------------- + rolname | is_rolpassword_rehashed + -------------------------+------------------------- - regress_passwd_sha_len0 | f -- regress_passwd_sha_len1 | t -- regress_passwd_sha_len2 | t --(3 rows) -+ rolname | is_rolpassword_rehashed -+---------+------------------------- -+(0 rows) - - DROP ROLE regress_passwd1; - DROP ROLE regress_passwd2; - DROP ROLE regress_passwd3; - DROP ROLE regress_passwd4; - DROP ROLE regress_passwd5; -+ERROR: role "regress_passwd5" does not exist - DROP ROLE regress_passwd6; -+ERROR: role "regress_passwd6" does not exist - DROP ROLE regress_passwd7; -+ERROR: role "regress_passwd7" does not exist ++ regress_passwd_sha_len0 | t + regress_passwd_sha_len1 | t + regress_passwd_sha_len2 | t + (3 rows) +@@ -137,6 +125,7 @@ DROP ROLE regress_passwd7; DROP ROLE regress_passwd8; -+ERROR: role "regress_passwd8" does not exist DROP ROLE regress_passwd9; DROP ROLE regress_passwd_empty; +ERROR: role "regress_passwd_empty" does not exist DROP ROLE regress_passwd_sha_len0; -+ERROR: role "regress_passwd_sha_len0" does not exist DROP ROLE regress_passwd_sha_len1; -+ERROR: role "regress_passwd_sha_len1" does not exist DROP ROLE regress_passwd_sha_len2; -+ERROR: role "regress_passwd_sha_len2" does not exist - -- all entries should have been removed - SELECT rolname, rolpassword - FROM pg_authid diff --git a/src/test/regress/expected/privileges.out b/src/test/regress/expected/privileges.out index 1296da0d57..f43fffa44c 100644 --- a/src/test/regress/expected/privileges.out @@ -3249,10 +3226,10 @@ index 1a6c61f49d..1c31ac6a53 100644 -- Test generic object addressing/identification functions CREATE SCHEMA addr_nsp; diff --git a/src/test/regress/sql/password.sql b/src/test/regress/sql/password.sql -index bb82aa4aa2..7424c91b10 100644 +index bb82aa4aa2..dd8a05e24d 100644 --- a/src/test/regress/sql/password.sql +++ b/src/test/regress/sql/password.sql -@@ -10,13 +10,13 @@ SET password_encryption = 'scram-sha-256'; -- ok +@@ -10,13 +10,11 @@ 
SET password_encryption = 'scram-sha-256'; -- ok -- consistency of password entries SET password_encryption = 'md5'; @@ -3261,9 +3238,7 @@ index bb82aa4aa2..7424c91b10 100644 -CREATE ROLE regress_passwd2; -ALTER ROLE regress_passwd2 PASSWORD 'role_pwd2'; +CREATE ROLE regress_passwd1 PASSWORD NEON_PASSWORD_PLACEHOLDER; -+ALTER ROLE regress_passwd1 PASSWORD NEON_PASSWORD_PLACEHOLDER; +CREATE ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER; -+ALTER ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER; SET password_encryption = 'scram-sha-256'; -CREATE ROLE regress_passwd3 PASSWORD 'role_pwd3'; -CREATE ROLE regress_passwd4 PASSWORD NULL; @@ -3272,23 +3247,59 @@ index bb82aa4aa2..7424c91b10 100644 -- check list of created entries -- -@@ -44,14 +44,14 @@ ALTER ROLE regress_passwd2_new RENAME TO regress_passwd2; +@@ -44,26 +42,19 @@ ALTER ROLE regress_passwd2_new RENAME TO regress_passwd2; SET password_encryption = 'md5'; -- encrypt with MD5 -ALTER ROLE regress_passwd2 PASSWORD 'foo'; +--- already encrypted, use as they are +-ALTER ROLE regress_passwd1 PASSWORD 'md5cd3578025fe2c3d7ed1b9a9b26238b70'; +-ALTER ROLE regress_passwd3 PASSWORD 'SCRAM-SHA-256$4096:VLK4RMaQLCvNtQ==$6YtlR4t69SguDiwFvbVgVZtuz6gpJQQqUMZ7IQJK5yI=:ps75jrHeYU4lXCcXI4O8oIdJ3eO8o2jirjruw9phBTo='; +ALTER ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER; - -- already encrypted, use as they are - ALTER ROLE regress_passwd1 PASSWORD 'md5cd3578025fe2c3d7ed1b9a9b26238b70'; - ALTER ROLE regress_passwd3 PASSWORD 'SCRAM-SHA-256$4096:VLK4RMaQLCvNtQ==$6YtlR4t69SguDiwFvbVgVZtuz6gpJQQqUMZ7IQJK5yI=:ps75jrHeYU4lXCcXI4O8oIdJ3eO8o2jirjruw9phBTo='; SET password_encryption = 'scram-sha-256'; -- create SCRAM secret -ALTER ROLE regress_passwd4 PASSWORD 'foo'; +ALTER ROLE regress_passwd4 PASSWORD NEON_PASSWORD_PLACEHOLDER; -- already encrypted with MD5, use as it is - CREATE ROLE regress_passwd5 PASSWORD 'md5e73a4b11df52a6068f8b39f90be36023'; +-CREATE ROLE regress_passwd5 PASSWORD 'md5e73a4b11df52a6068f8b39f90be36023'; ++-- Neon does not support encrypted passwords, use unencrypted instead ++CREATE ROLE regress_passwd5 PASSWORD NEON_PASSWORD_PLACEHOLDER; + +--- This looks like a valid SCRAM-SHA-256 secret, but it is not +--- so it should be hashed with SCRAM-SHA-256. +-CREATE ROLE regress_passwd6 PASSWORD 'SCRAM-SHA-256$1234'; +--- These may look like valid MD5 secrets, but they are not, so they +--- should be hashed with SCRAM-SHA-256. +--- trailing garbage at the end +-CREATE ROLE regress_passwd7 PASSWORD 'md5012345678901234567890123456789zz'; +--- invalid length +-CREATE ROLE regress_passwd8 PASSWORD 'md501234567890123456789012345678901zz'; ++-- Neon does not support encrypted passwords, use unencrypted instead ++CREATE ROLE regress_passwd6 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd7 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd8 PASSWORD NEON_PASSWORD_PLACEHOLDER; + + -- Changing the SCRAM iteration count + SET scram_iterations = 1024; +@@ -80,13 +71,10 @@ ALTER ROLE regress_passwd_empty PASSWORD 'md585939a5ce845f1a1b620742e3c659e0a'; + ALTER ROLE regress_passwd_empty PASSWORD 'SCRAM-SHA-256$4096:hpFyHTUsSWcR7O9P$LgZFIt6Oqdo27ZFKbZ2nV+vtnYM995pDh9ca6WSi120=:qVV5NeluNfUPkwm7Vqat25RjSPLkGeoZBQs6wVv+um4='; + SELECT rolpassword FROM pg_authid WHERE rolname='regress_passwd_empty'; +--- Test with invalid stored and server keys. +--- +--- The first is valid, to act as a control. The others have too long +--- stored/server keys. They will be re-hashed. 
+-CREATE ROLE regress_passwd_sha_len0 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; +-CREATE ROLE regress_passwd_sha_len1 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96RqwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI='; +-CREATE ROLE regress_passwd_sha_len2 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA='; ++-- Neon does not support encrypted passwords, use unencrypted instead ++CREATE ROLE regress_passwd_sha_len0 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd_sha_len1 PASSWORD NEON_PASSWORD_PLACEHOLDER; ++CREATE ROLE regress_passwd_sha_len2 PASSWORD NEON_PASSWORD_PLACEHOLDER; + + -- Check that the invalid secrets were re-hashed. A re-hashed secret + -- should not contain the original salt. diff --git a/src/test/regress/sql/privileges.sql b/src/test/regress/sql/privileges.sql index 5880bc018d..27aa952b18 100644 --- a/src/test/regress/sql/privileges.sql From ce8eb089f3d002e5057f860454adeb0993431a19 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 13 Dec 2024 17:06:27 +0300 Subject: [PATCH 03/56] Extract public sk types to safekeeper_api (#10137) ## Problem We want to extract safekeeper http client to separate crate for use in storage controller and neon_local. However, many types used in the API are internal to safekeeper. ## Summary of changes Move them to safekeeper_api crate. No functional changes. ref https://github.com/neondatabase/neon/issues/9011 --- Cargo.lock | 3 + libs/safekeeper_api/Cargo.toml | 5 +- libs/safekeeper_api/src/lib.rs | 17 ++ libs/safekeeper_api/src/models.rs | 170 +++++++++++++++++- safekeeper/src/control_file_upgrade.rs | 3 +- safekeeper/src/debug_dump.rs | 2 +- safekeeper/src/handler.rs | 4 +- safekeeper/src/http/client.rs | 3 +- safekeeper/src/http/routes.rs | 72 ++------ safekeeper/src/json_ctrl.rs | 5 +- safekeeper/src/pull_timeline.rs | 7 +- safekeeper/src/receive_wal.rs | 21 +-- safekeeper/src/recovery.rs | 8 +- safekeeper/src/safekeeper.rs | 19 +- safekeeper/src/send_wal.rs | 89 +-------- safekeeper/src/state.rs | 7 +- safekeeper/src/timeline.rs | 55 ++---- safekeeper/src/timeline_manager.rs | 4 +- safekeeper/src/timelines_global_map.rs | 2 +- safekeeper/src/wal_backup.rs | 3 +- safekeeper/src/wal_backup_partial.rs | 2 +- safekeeper/src/wal_reader_stream.rs | 2 +- safekeeper/src/wal_service.rs | 3 +- .../tests/walproposer_sim/safekeeper.rs | 3 +- 24 files changed, 264 insertions(+), 245 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e2d5e03613b1..c4f80f63c9be 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5565,7 +5565,10 @@ name = "safekeeper_api" version = "0.1.0" dependencies = [ "const_format", + "postgres_ffi", + "pq_proto", "serde", + "tokio", "utils", ] diff --git a/libs/safekeeper_api/Cargo.toml b/libs/safekeeper_api/Cargo.toml index 14811232d33b..4234ec6779a2 100644 --- a/libs/safekeeper_api/Cargo.toml +++ b/libs/safekeeper_api/Cargo.toml @@ -5,6 +5,9 @@ edition.workspace = true license.workspace = true [dependencies] -serde.workspace = true const_format.workspace = true +serde.workspace = true +postgres_ffi.workspace = true +pq_proto.workspace = true +tokio.workspace = true utils.workspace = true diff --git a/libs/safekeeper_api/src/lib.rs 
b/libs/safekeeper_api/src/lib.rs index 63c2c51188b8..be6923aca902 100644 --- a/libs/safekeeper_api/src/lib.rs +++ b/libs/safekeeper_api/src/lib.rs @@ -1,10 +1,27 @@ #![deny(unsafe_code)] #![deny(clippy::undocumented_unsafe_blocks)] use const_format::formatcp; +use pq_proto::SystemId; +use serde::{Deserialize, Serialize}; /// Public API types pub mod models; +/// Consensus logical timestamp. Note: it is a part of sk control file. +pub type Term = u64; +pub const INVALID_TERM: Term = 0; + +/// Information about Postgres. Safekeeper gets it once and then verifies all +/// further connections from computes match. Note: it is a part of sk control +/// file. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct ServerInfo { + /// Postgres server version + pub pg_version: u32, + pub system_id: SystemId, + pub wal_seg_size: u32, +} + pub const DEFAULT_PG_LISTEN_PORT: u16 = 5454; pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index 28666d197afd..3e424a792c7f 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -1,10 +1,23 @@ +//! Types used in safekeeper http API. Many of them are also reused internally. + +use postgres_ffi::TimestampTz; use serde::{Deserialize, Serialize}; +use std::net::SocketAddr; +use tokio::time::Instant; use utils::{ - id::{NodeId, TenantId, TimelineId}, + id::{NodeId, TenantId, TenantTimelineId, TimelineId}, lsn::Lsn, + pageserver_feedback::PageserverFeedback, }; +use crate::{ServerInfo, Term}; + +#[derive(Debug, Serialize)] +pub struct SafekeeperStatus { + pub id: NodeId, +} + #[derive(Serialize, Deserialize)] pub struct TimelineCreateRequest { pub tenant_id: TenantId, @@ -18,6 +31,161 @@ pub struct TimelineCreateRequest { pub local_start_lsn: Option, } +/// Same as TermLsn, but serializes LSN using display serializer +/// in Postgres format, i.e. 0/FFFFFFFF. Used only for the API response. +#[derive(Debug, Clone, Copy, Serialize, Deserialize)] +pub struct TermSwitchApiEntry { + pub term: Term, + pub lsn: Lsn, +} + +/// Augment AcceptorState with last_log_term for convenience +#[derive(Debug, Serialize, Deserialize)] +pub struct AcceptorStateStatus { + pub term: Term, + pub epoch: Term, // aka last_log_term, old `epoch` name is left for compatibility + pub term_history: Vec, +} + +/// Things safekeeper should know about timeline state on peers. +/// Used as both model and internally. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct PeerInfo { + pub sk_id: NodeId, + pub term: Term, + /// Term of the last entry. + pub last_log_term: Term, + /// LSN of the last record. + pub flush_lsn: Lsn, + pub commit_lsn: Lsn, + /// Since which LSN safekeeper has WAL. + pub local_start_lsn: Lsn, + /// When info was received. Serde annotations are not very useful but make + /// the code compile -- we don't rely on this field externally. 
+ #[serde(skip)] + #[serde(default = "Instant::now")] + pub ts: Instant, + pub pg_connstr: String, + pub http_connstr: String, +} + +pub type FullTransactionId = u64; + +/// Hot standby feedback received from replica +#[derive(Debug, Clone, Copy, Serialize, Deserialize)] +pub struct HotStandbyFeedback { + pub ts: TimestampTz, + pub xmin: FullTransactionId, + pub catalog_xmin: FullTransactionId, +} + +pub const INVALID_FULL_TRANSACTION_ID: FullTransactionId = 0; + +impl HotStandbyFeedback { + pub fn empty() -> HotStandbyFeedback { + HotStandbyFeedback { + ts: 0, + xmin: 0, + catalog_xmin: 0, + } + } +} + +/// Standby status update +#[derive(Debug, Clone, Copy, Serialize, Deserialize)] +pub struct StandbyReply { + pub write_lsn: Lsn, // The location of the last WAL byte + 1 received and written to disk in the standby. + pub flush_lsn: Lsn, // The location of the last WAL byte + 1 flushed to disk in the standby. + pub apply_lsn: Lsn, // The location of the last WAL byte + 1 applied in the standby. + pub reply_ts: TimestampTz, // The client's system clock at the time of transmission, as microseconds since midnight on 2000-01-01. + pub reply_requested: bool, +} + +impl StandbyReply { + pub fn empty() -> Self { + StandbyReply { + write_lsn: Lsn::INVALID, + flush_lsn: Lsn::INVALID, + apply_lsn: Lsn::INVALID, + reply_ts: 0, + reply_requested: false, + } + } +} + +#[derive(Debug, Clone, Copy, Serialize, Deserialize)] +pub struct StandbyFeedback { + pub reply: StandbyReply, + pub hs_feedback: HotStandbyFeedback, +} + +impl StandbyFeedback { + pub fn empty() -> Self { + StandbyFeedback { + reply: StandbyReply::empty(), + hs_feedback: HotStandbyFeedback::empty(), + } + } +} + +/// Receiver is either pageserver or regular standby, which have different +/// feedbacks. +/// Used as both model and internally. +#[derive(Debug, Clone, Copy, Serialize, Deserialize)] +pub enum ReplicationFeedback { + Pageserver(PageserverFeedback), + Standby(StandbyFeedback), +} + +/// Uniquely identifies a WAL service connection. Logged in spans for +/// observability. +pub type ConnectionId = u32; + +/// Serialize is used only for json'ing in API response. Also used internally. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct WalSenderState { + pub ttid: TenantTimelineId, + pub addr: SocketAddr, + pub conn_id: ConnectionId, + // postgres application_name + pub appname: Option, + pub feedback: ReplicationFeedback, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct WalReceiverState { + /// None means it is recovery initiated by us (this safekeeper). + pub conn_id: Option, + pub status: WalReceiverStatus, +} + +/// Walreceiver status. Currently only whether it passed voting stage and +/// started receiving the stream, but it is easy to add more if needed. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub enum WalReceiverStatus { + Voting, + Streaming, +} + +/// Info about timeline on safekeeper ready for reporting. 
+#[derive(Debug, Serialize, Deserialize)] +pub struct TimelineStatus { + pub tenant_id: TenantId, + pub timeline_id: TimelineId, + pub acceptor_state: AcceptorStateStatus, + pub pg_info: ServerInfo, + pub flush_lsn: Lsn, + pub timeline_start_lsn: Lsn, + pub local_start_lsn: Lsn, + pub commit_lsn: Lsn, + pub backup_lsn: Lsn, + pub peer_horizon_lsn: Lsn, + pub remote_consistent_lsn: Lsn, + pub peers: Vec, + pub walsenders: Vec, + pub walreceivers: Vec, +} + fn lsn_invalid() -> Lsn { Lsn::INVALID } diff --git a/safekeeper/src/control_file_upgrade.rs b/safekeeper/src/control_file_upgrade.rs index a4b4670e423b..dd152fd4cce8 100644 --- a/safekeeper/src/control_file_upgrade.rs +++ b/safekeeper/src/control_file_upgrade.rs @@ -1,11 +1,12 @@ //! Code to deal with safekeeper control file upgrades use crate::{ - safekeeper::{AcceptorState, PgUuid, ServerInfo, Term, TermHistory, TermLsn}, + safekeeper::{AcceptorState, PgUuid, TermHistory, TermLsn}, state::{EvictionState, PersistedPeers, TimelinePersistentState}, wal_backup_partial, }; use anyhow::{bail, Result}; use pq_proto::SystemId; +use safekeeper_api::{ServerInfo, Term}; use serde::{Deserialize, Serialize}; use tracing::*; use utils::{ diff --git a/safekeeper/src/debug_dump.rs b/safekeeper/src/debug_dump.rs index 93011eddec07..19362a0992d4 100644 --- a/safekeeper/src/debug_dump.rs +++ b/safekeeper/src/debug_dump.rs @@ -14,6 +14,7 @@ use camino::Utf8PathBuf; use chrono::{DateTime, Utc}; use postgres_ffi::XLogSegNo; use postgres_ffi::MAX_SEND_SIZE; +use safekeeper_api::models::WalSenderState; use serde::Deserialize; use serde::Serialize; @@ -25,7 +26,6 @@ use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; use crate::safekeeper::TermHistory; -use crate::send_wal::WalSenderState; use crate::state::TimelineMemState; use crate::state::TimelinePersistentState; use crate::timeline::get_timeline_dir; diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index 2ca6333ba835..bb639bfb3221 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -4,6 +4,8 @@ use anyhow::Context; use pageserver_api::models::ShardParameters; use pageserver_api::shard::{ShardIdentity, ShardStripeSize}; +use safekeeper_api::models::ConnectionId; +use safekeeper_api::Term; use std::future::Future; use std::str::{self, FromStr}; use std::sync::Arc; @@ -16,9 +18,7 @@ use crate::auth::check_permission; use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage}; use crate::metrics::{TrafficMetrics, PG_QUERIES_GAUGE}; -use crate::safekeeper::Term; use crate::timeline::TimelineError; -use crate::wal_service::ConnectionId; use crate::{GlobalTimelines, SafeKeeperConf}; use postgres_backend::PostgresBackend; use postgres_backend::QueryError; diff --git a/safekeeper/src/http/client.rs b/safekeeper/src/http/client.rs index a166fc1ab9b0..669a9c0ce94b 100644 --- a/safekeeper/src/http/client.rs +++ b/safekeeper/src/http/client.rs @@ -8,6 +8,7 @@ //! etc. 
use reqwest::{IntoUrl, Method, StatusCode}; +use safekeeper_api::models::TimelineStatus; use std::error::Error as _; use utils::{ http::error::HttpErrorBody, @@ -15,8 +16,6 @@ use utils::{ logging::SecretString, }; -use super::routes::TimelineStatus; - #[derive(Debug, Clone)] pub struct Client { mgmt_api_endpoint: String, diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 71c36f1d4631..9bc1bf340919 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -1,5 +1,9 @@ use hyper::{Body, Request, Response, StatusCode}; -use serde::{Deserialize, Serialize}; +use safekeeper_api::models::AcceptorStateStatus; +use safekeeper_api::models::SafekeeperStatus; +use safekeeper_api::models::TermSwitchApiEntry; +use safekeeper_api::models::TimelineStatus; +use safekeeper_api::ServerInfo; use std::collections::HashMap; use std::fmt; use std::io::Write as _; @@ -31,26 +35,17 @@ use utils::{ request::{ensure_no_body, parse_request_param}, RequestExt, RouterBuilder, }, - id::{NodeId, TenantId, TenantTimelineId, TimelineId}, + id::{TenantId, TenantTimelineId, TimelineId}, lsn::Lsn, }; use crate::debug_dump::TimelineDigestRequest; -use crate::receive_wal::WalReceiverState; -use crate::safekeeper::Term; -use crate::safekeeper::{ServerInfo, TermLsn}; -use crate::send_wal::WalSenderState; -use crate::timeline::PeerInfo; +use crate::safekeeper::TermLsn; use crate::timelines_global_map::TimelineDeleteForceResult; use crate::GlobalTimelines; use crate::SafeKeeperConf; use crate::{copy_timeline, debug_dump, patch_control_file, pull_timeline}; -#[derive(Debug, Serialize)] -struct SafekeeperStatus { - id: NodeId, -} - /// Healthcheck handler. async fn status_handler(request: Request) -> Result, ApiError> { check_permission(&request, None)?; @@ -73,50 +68,6 @@ fn get_global_timelines(request: &Request) -> Arc { .clone() } -/// Same as TermLsn, but serializes LSN using display serializer -/// in Postgres format, i.e. 0/FFFFFFFF. Used only for the API response. -#[derive(Debug, Clone, Copy, Serialize, Deserialize)] -pub struct TermSwitchApiEntry { - pub term: Term, - pub lsn: Lsn, -} - -impl From for TermLsn { - fn from(api_val: TermSwitchApiEntry) -> Self { - TermLsn { - term: api_val.term, - lsn: api_val.lsn, - } - } -} - -/// Augment AcceptorState with last_log_term for convenience -#[derive(Debug, Serialize, Deserialize)] -pub struct AcceptorStateStatus { - pub term: Term, - pub epoch: Term, // aka last_log_term - pub term_history: Vec, -} - -/// Info about timeline on safekeeper ready for reporting. -#[derive(Debug, Serialize, Deserialize)] -pub struct TimelineStatus { - pub tenant_id: TenantId, - pub timeline_id: TimelineId, - pub acceptor_state: AcceptorStateStatus, - pub pg_info: ServerInfo, - pub flush_lsn: Lsn, - pub timeline_start_lsn: Lsn, - pub local_start_lsn: Lsn, - pub commit_lsn: Lsn, - pub backup_lsn: Lsn, - pub peer_horizon_lsn: Lsn, - pub remote_consistent_lsn: Lsn, - pub peers: Vec, - pub walsenders: Vec, - pub walreceivers: Vec, -} - fn check_permission(request: &Request, tenant_id: Option) -> Result<(), ApiError> { check_permission_with(request, |claims| { crate::auth::check_permission(claims, tenant_id) @@ -187,6 +138,15 @@ async fn timeline_list_handler(request: Request) -> Result, json_response(StatusCode::OK, res) } +impl From for TermLsn { + fn from(api_val: TermSwitchApiEntry) -> Self { + TermLsn { + term: api_val.term, + lsn: api_val.lsn, + } + } +} + /// Report info about timeline. 
async fn timeline_status_handler(request: Request) -> Result, ApiError> { let ttid = TenantTimelineId::new( diff --git a/safekeeper/src/json_ctrl.rs b/safekeeper/src/json_ctrl.rs index dc4ad3706e6c..256e350ceba5 100644 --- a/safekeeper/src/json_ctrl.rs +++ b/safekeeper/src/json_ctrl.rs @@ -8,16 +8,17 @@ use anyhow::Context; use postgres_backend::QueryError; +use safekeeper_api::{ServerInfo, Term}; use serde::{Deserialize, Serialize}; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::*; use crate::handler::SafekeeperPostgresHandler; -use crate::safekeeper::{AcceptorProposerMessage, AppendResponse, ServerInfo}; +use crate::safekeeper::{AcceptorProposerMessage, AppendResponse}; use crate::safekeeper::{ AppendRequest, AppendRequestHeader, ProposerAcceptorMessage, ProposerElected, }; -use crate::safekeeper::{Term, TermHistory, TermLsn}; +use crate::safekeeper::{TermHistory, TermLsn}; use crate::state::TimelinePersistentState; use crate::timeline::WalResidentTimeline; use postgres_backend::PostgresBackend; diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index f58a9dca1dbc..00777273cbf9 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -4,6 +4,7 @@ use camino::Utf8PathBuf; use chrono::{DateTime, Utc}; use futures::{SinkExt, StreamExt, TryStreamExt}; use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI}; +use safekeeper_api::{models::TimelineStatus, Term}; use serde::{Deserialize, Serialize}; use std::{ cmp::min, @@ -21,11 +22,7 @@ use tracing::{error, info, instrument}; use crate::{ control_file::CONTROL_FILE_NAME, debug_dump, - http::{ - client::{self, Client}, - routes::TimelineStatus, - }, - safekeeper::Term, + http::client::{self, Client}, state::{EvictionState, TimelinePersistentState}, timeline::{Timeline, WalResidentTimeline}, timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline}, diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 2a49890d618f..08371177cd24 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -9,9 +9,7 @@ use crate::metrics::{ }; use crate::safekeeper::AcceptorProposerMessage; use crate::safekeeper::ProposerAcceptorMessage; -use crate::safekeeper::ServerInfo; use crate::timeline::WalResidentTimeline; -use crate::wal_service::ConnectionId; use crate::GlobalTimelines; use anyhow::{anyhow, Context}; use bytes::BytesMut; @@ -23,8 +21,8 @@ use postgres_backend::PostgresBackend; use postgres_backend::PostgresBackendReader; use postgres_backend::QueryError; use pq_proto::BeMessage; -use serde::Deserialize; -use serde::Serialize; +use safekeeper_api::models::{ConnectionId, WalReceiverState, WalReceiverStatus}; +use safekeeper_api::ServerInfo; use std::future; use std::net::SocketAddr; use std::sync::Arc; @@ -171,21 +169,6 @@ impl WalReceiversShared { } } -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct WalReceiverState { - /// None means it is recovery initiated by us (this safekeeper). - pub conn_id: Option, - pub status: WalReceiverStatus, -} - -/// Walreceiver status. Currently only whether it passed voting stage and -/// started receiving the stream, but it is easy to add more if needed. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub enum WalReceiverStatus { - Voting, - Streaming, -} - /// Scope guard to access slot in WalReceivers registry and unregister from /// it in Drop. 
pub struct WalReceiverGuard { diff --git a/safekeeper/src/recovery.rs b/safekeeper/src/recovery.rs index 7b87166aa052..61647c16b00a 100644 --- a/safekeeper/src/recovery.rs +++ b/safekeeper/src/recovery.rs @@ -7,6 +7,8 @@ use std::{fmt, pin::pin}; use anyhow::{bail, Context}; use futures::StreamExt; use postgres_protocol::message::backend::ReplicationMessage; +use safekeeper_api::models::{PeerInfo, TimelineStatus}; +use safekeeper_api::Term; use tokio::sync::mpsc::{channel, Receiver, Sender}; use tokio::time::timeout; use tokio::{ @@ -24,13 +26,11 @@ use crate::receive_wal::{WalAcceptor, REPLY_QUEUE_SIZE}; use crate::safekeeper::{AppendRequest, AppendRequestHeader}; use crate::timeline::WalResidentTimeline; use crate::{ - http::routes::TimelineStatus, receive_wal::MSG_QUEUE_SIZE, safekeeper::{ - AcceptorProposerMessage, ProposerAcceptorMessage, ProposerElected, Term, TermHistory, - TermLsn, VoteRequest, + AcceptorProposerMessage, ProposerAcceptorMessage, ProposerElected, TermHistory, TermLsn, + VoteRequest, }, - timeline::PeerInfo, SafeKeeperConf, }; diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 6eb69f0b7ce2..ccd7940c7212 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -5,6 +5,9 @@ use byteorder::{LittleEndian, ReadBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; use postgres_ffi::{TimeLineID, MAX_SEND_SIZE}; +use safekeeper_api::models::HotStandbyFeedback; +use safekeeper_api::Term; +use safekeeper_api::INVALID_TERM; use serde::{Deserialize, Serialize}; use std::cmp::max; use std::cmp::min; @@ -16,7 +19,6 @@ use tracing::*; use crate::control_file; use crate::metrics::MISC_OPERATION_SECONDS; -use crate::send_wal::HotStandbyFeedback; use crate::state::TimelineState; use crate::wal_storage; @@ -31,10 +33,6 @@ use utils::{ const SK_PROTOCOL_VERSION: u32 = 2; pub const UNKNOWN_SERVER_VERSION: u32 = 0; -/// Consensus logical timestamp. -pub type Term = u64; -pub const INVALID_TERM: Term = 0; - #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)] pub struct TermLsn { pub term: Term, @@ -198,16 +196,6 @@ impl AcceptorState { } } -/// Information about Postgres. Safekeeper gets it once and then verifies -/// all further connections from computes match. -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct ServerInfo { - /// Postgres server version - pub pg_version: u32, - pub system_id: SystemId, - pub wal_seg_size: u32, -} - #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] pub struct PersistedPeerInfo { /// LSN up to which safekeeper offloaded WAL to s3. 
@@ -1041,6 +1029,7 @@ where mod tests { use futures::future::BoxFuture; use postgres_ffi::{XLogSegNo, WAL_SEGMENT_SIZE}; + use safekeeper_api::ServerInfo; use super::*; use crate::state::{EvictionState, PersistedPeers, TimelinePersistentState}; diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 0887cf726418..84632219984a 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -4,11 +4,10 @@ use crate::handler::SafekeeperPostgresHandler; use crate::metrics::RECEIVED_PS_FEEDBACKS; use crate::receive_wal::WalReceivers; -use crate::safekeeper::{Term, TermLsn}; +use crate::safekeeper::TermLsn; use crate::send_interpreted_wal::InterpretedWalSender; use crate::timeline::WalResidentTimeline; use crate::wal_reader_stream::WalReaderStreamBuilder; -use crate::wal_service::ConnectionId; use crate::wal_storage::WalReader; use anyhow::{bail, Context as AnyhowContext}; use bytes::Bytes; @@ -19,7 +18,11 @@ use postgres_backend::{CopyStreamHandlerEnd, PostgresBackendReader, QueryError}; use postgres_ffi::get_current_timestamp; use postgres_ffi::{TimestampTz, MAX_SEND_SIZE}; use pq_proto::{BeMessage, WalSndKeepAlive, XLogDataBody}; -use serde::{Deserialize, Serialize}; +use safekeeper_api::models::{ + ConnectionId, HotStandbyFeedback, ReplicationFeedback, StandbyFeedback, StandbyReply, + WalSenderState, INVALID_FULL_TRANSACTION_ID, +}; +use safekeeper_api::Term; use tokio::io::{AsyncRead, AsyncWrite}; use utils::failpoint_support; use utils::id::TenantTimelineId; @@ -28,7 +31,6 @@ use utils::postgres_client::PostgresClientProtocol; use std::cmp::{max, min}; use std::net::SocketAddr; -use std::str; use std::sync::Arc; use std::time::Duration; use tokio::sync::watch::Receiver; @@ -42,65 +44,6 @@ const STANDBY_STATUS_UPDATE_TAG_BYTE: u8 = b'r'; // neon extension of replication protocol const NEON_STATUS_UPDATE_TAG_BYTE: u8 = b'z'; -type FullTransactionId = u64; - -/// Hot standby feedback received from replica -#[derive(Debug, Clone, Copy, Serialize, Deserialize)] -pub struct HotStandbyFeedback { - pub ts: TimestampTz, - pub xmin: FullTransactionId, - pub catalog_xmin: FullTransactionId, -} - -const INVALID_FULL_TRANSACTION_ID: FullTransactionId = 0; - -impl HotStandbyFeedback { - pub fn empty() -> HotStandbyFeedback { - HotStandbyFeedback { - ts: 0, - xmin: 0, - catalog_xmin: 0, - } - } -} - -/// Standby status update -#[derive(Debug, Clone, Copy, Serialize, Deserialize)] -pub struct StandbyReply { - pub write_lsn: Lsn, // The location of the last WAL byte + 1 received and written to disk in the standby. - pub flush_lsn: Lsn, // The location of the last WAL byte + 1 flushed to disk in the standby. - pub apply_lsn: Lsn, // The location of the last WAL byte + 1 applied in the standby. - pub reply_ts: TimestampTz, // The client's system clock at the time of transmission, as microseconds since midnight on 2000-01-01. - pub reply_requested: bool, -} - -impl StandbyReply { - fn empty() -> Self { - StandbyReply { - write_lsn: Lsn::INVALID, - flush_lsn: Lsn::INVALID, - apply_lsn: Lsn::INVALID, - reply_ts: 0, - reply_requested: false, - } - } -} - -#[derive(Debug, Clone, Copy, Serialize, Deserialize)] -pub struct StandbyFeedback { - pub reply: StandbyReply, - pub hs_feedback: HotStandbyFeedback, -} - -impl StandbyFeedback { - pub fn empty() -> Self { - StandbyFeedback { - reply: StandbyReply::empty(), - hs_feedback: HotStandbyFeedback::empty(), - } - } -} - /// WalSenders registry. Timeline holds it (wrapped in Arc). 
pub struct WalSenders { mutex: Mutex, @@ -341,25 +284,6 @@ impl WalSendersShared { } } -// Serialized is used only for pretty printing in json. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct WalSenderState { - ttid: TenantTimelineId, - addr: SocketAddr, - conn_id: ConnectionId, - // postgres application_name - appname: Option, - feedback: ReplicationFeedback, -} - -// Receiver is either pageserver or regular standby, which have different -// feedbacks. -#[derive(Debug, Clone, Copy, Serialize, Deserialize)] -enum ReplicationFeedback { - Pageserver(PageserverFeedback), - Standby(StandbyFeedback), -} - // id of the occupied slot in WalSenders to access it (and save in the // WalSenderGuard). We could give Arc directly to the slot, but there is not // much sense in that as values aggregation which is performed on each feedback @@ -888,6 +812,7 @@ impl ReplyReader { #[cfg(test)] mod tests { + use safekeeper_api::models::FullTransactionId; use utils::id::{TenantId, TimelineId}; use super::*; diff --git a/safekeeper/src/state.rs b/safekeeper/src/state.rs index 941b7e67d0a9..c6ae6c1d2b0e 100644 --- a/safekeeper/src/state.rs +++ b/safekeeper/src/state.rs @@ -5,7 +5,7 @@ use std::{cmp::max, ops::Deref}; use anyhow::{bail, Result}; use postgres_ffi::WAL_SEGMENT_SIZE; -use safekeeper_api::models::TimelineTermBumpResponse; +use safekeeper_api::{models::TimelineTermBumpResponse, ServerInfo, Term}; use serde::{Deserialize, Serialize}; use utils::{ id::{NodeId, TenantId, TenantTimelineId, TimelineId}, @@ -14,10 +14,7 @@ use utils::{ use crate::{ control_file, - safekeeper::{ - AcceptorState, PersistedPeerInfo, PgUuid, ServerInfo, Term, TermHistory, - UNKNOWN_SERVER_VERSION, - }, + safekeeper::{AcceptorState, PersistedPeerInfo, PgUuid, TermHistory, UNKNOWN_SERVER_VERSION}, timeline::TimelineError, wal_backup_partial::{self}, }; diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 94d6ef106160..36860a0da2b4 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -4,8 +4,8 @@ use anyhow::{anyhow, bail, Result}; use camino::{Utf8Path, Utf8PathBuf}; use remote_storage::RemotePath; -use safekeeper_api::models::TimelineTermBumpResponse; -use serde::{Deserialize, Serialize}; +use safekeeper_api::models::{PeerInfo, TimelineTermBumpResponse}; +use safekeeper_api::Term; use tokio::fs::{self}; use tokio_util::sync::CancellationToken; use utils::id::TenantId; @@ -31,9 +31,7 @@ use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; use crate::control_file; use crate::rate_limit::RateLimiter; use crate::receive_wal::WalReceivers; -use crate::safekeeper::{ - AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, Term, TermLsn, -}; +use crate::safekeeper::{AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, TermLsn}; use crate::send_wal::WalSenders; use crate::state::{EvictionState, TimelineMemState, TimelinePersistentState, TimelineState}; use crate::timeline_guard::ResidenceGuard; @@ -47,40 +45,17 @@ use crate::wal_storage::{Storage as wal_storage_iface, WalReader}; use crate::SafeKeeperConf; use crate::{debug_dump, timeline_manager, wal_storage}; -/// Things safekeeper should know about timeline state on peers. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct PeerInfo { - pub sk_id: NodeId, - pub term: Term, - /// Term of the last entry. - pub last_log_term: Term, - /// LSN of the last record. - pub flush_lsn: Lsn, - pub commit_lsn: Lsn, - /// Since which LSN safekeeper has WAL. 
- pub local_start_lsn: Lsn, - /// When info was received. Serde annotations are not very useful but make - /// the code compile -- we don't rely on this field externally. - #[serde(skip)] - #[serde(default = "Instant::now")] - ts: Instant, - pub pg_connstr: String, - pub http_connstr: String, -} - -impl PeerInfo { - fn from_sk_info(sk_info: &SafekeeperTimelineInfo, ts: Instant) -> PeerInfo { - PeerInfo { - sk_id: NodeId(sk_info.safekeeper_id), - term: sk_info.term, - last_log_term: sk_info.last_log_term, - flush_lsn: Lsn(sk_info.flush_lsn), - commit_lsn: Lsn(sk_info.commit_lsn), - local_start_lsn: Lsn(sk_info.local_start_lsn), - pg_connstr: sk_info.safekeeper_connstr.clone(), - http_connstr: sk_info.http_connstr.clone(), - ts, - } +fn peer_info_from_sk_info(sk_info: &SafekeeperTimelineInfo, ts: Instant) -> PeerInfo { + PeerInfo { + sk_id: NodeId(sk_info.safekeeper_id), + term: sk_info.term, + last_log_term: sk_info.last_log_term, + flush_lsn: Lsn(sk_info.flush_lsn), + commit_lsn: Lsn(sk_info.commit_lsn), + local_start_lsn: Lsn(sk_info.local_start_lsn), + pg_connstr: sk_info.safekeeper_connstr.clone(), + http_connstr: sk_info.http_connstr.clone(), + ts, } } @@ -697,7 +672,7 @@ impl Timeline { { let mut shared_state = self.write_shared_state().await; shared_state.sk.record_safekeeper_info(&sk_info).await?; - let peer_info = PeerInfo::from_sk_info(&sk_info, Instant::now()); + let peer_info = peer_info_from_sk_info(&sk_info, Instant::now()); shared_state.peers_info.upsert(&peer_info); } Ok(()) diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index c02fb904cf63..a33994dcabaa 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -14,6 +14,7 @@ use std::{ use futures::channel::oneshot; use postgres_ffi::XLogSegNo; +use safekeeper_api::{models::PeerInfo, Term}; use serde::{Deserialize, Serialize}; use tokio::{ task::{JoinError, JoinHandle}, @@ -32,10 +33,9 @@ use crate::{ rate_limit::{rand_duration, RateLimiter}, recovery::recovery_main, remove_wal::calc_horizon_lsn, - safekeeper::Term, send_wal::WalSenders, state::TimelineState, - timeline::{ManagerTimeline, PeerInfo, ReadGuardSharedState, StateSK, WalResidentTimeline}, + timeline::{ManagerTimeline, ReadGuardSharedState, StateSK, WalResidentTimeline}, timeline_guard::{AccessService, GuardId, ResidenceGuard}, timelines_set::{TimelineSetGuard, TimelinesSet}, wal_backup::{self, WalBackupTaskHandle}, diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index e1241ceb9b84..ad29c9f66c2c 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -4,7 +4,6 @@ use crate::defaults::DEFAULT_EVICTION_CONCURRENCY; use crate::rate_limit::RateLimiter; -use crate::safekeeper::ServerInfo; use crate::state::TimelinePersistentState; use crate::timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError}; use crate::timelines_set::TimelinesSet; @@ -13,6 +12,7 @@ use crate::{control_file, wal_storage, SafeKeeperConf}; use anyhow::{bail, Context, Result}; use camino::Utf8PathBuf; use camino_tempfile::Utf8TempDir; +use safekeeper_api::ServerInfo; use serde::Serialize; use std::collections::HashMap; use std::str::FromStr; diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 34b5dbeaa1cf..8517fa03443c 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -3,6 +3,7 @@ use anyhow::{Context, Result}; use camino::{Utf8Path, Utf8PathBuf}; use 
futures::stream::FuturesOrdered; use futures::StreamExt; +use safekeeper_api::models::PeerInfo; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use utils::backoff; @@ -30,7 +31,7 @@ use tracing::*; use utils::{id::TenantTimelineId, lsn::Lsn}; use crate::metrics::{BACKED_UP_SEGMENTS, BACKUP_ERRORS, WAL_BACKUP_TASKS}; -use crate::timeline::{PeerInfo, WalResidentTimeline}; +use crate::timeline::WalResidentTimeline; use crate::timeline_manager::{Manager, StateSnapshot}; use crate::{SafeKeeperConf, WAL_BACKUP_RUNTIME}; diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs index bddfca50e4fb..4e5b34a9bf65 100644 --- a/safekeeper/src/wal_backup_partial.rs +++ b/safekeeper/src/wal_backup_partial.rs @@ -22,6 +22,7 @@ use camino::Utf8PathBuf; use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI}; use remote_storage::RemotePath; +use safekeeper_api::Term; use serde::{Deserialize, Serialize}; use tokio_util::sync::CancellationToken; @@ -31,7 +32,6 @@ use utils::{id::NodeId, lsn::Lsn}; use crate::{ metrics::{MISC_OPERATION_SECONDS, PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS}, rate_limit::{rand_duration, RateLimiter}, - safekeeper::Term, timeline::WalResidentTimeline, timeline_manager::StateSnapshot, wal_backup::{self}, diff --git a/safekeeper/src/wal_reader_stream.rs b/safekeeper/src/wal_reader_stream.rs index f8c0c502cdbc..aea628c20808 100644 --- a/safekeeper/src/wal_reader_stream.rs +++ b/safekeeper/src/wal_reader_stream.rs @@ -4,12 +4,12 @@ use async_stream::try_stream; use bytes::Bytes; use futures::Stream; use postgres_backend::CopyStreamHandlerEnd; +use safekeeper_api::Term; use std::time::Duration; use tokio::time::timeout; use utils::lsn::Lsn; use crate::{ - safekeeper::Term, send_wal::{EndWatch, WalSenderGuard}, timeline::WalResidentTimeline, }; diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs index 1ff83918a76c..1ebcb060e776 100644 --- a/safekeeper/src/wal_service.rs +++ b/safekeeper/src/wal_service.rs @@ -4,6 +4,7 @@ //! use anyhow::{Context, Result}; use postgres_backend::QueryError; +use safekeeper_api::models::ConnectionId; use std::sync::Arc; use std::time::Duration; use tokio::net::TcpStream; @@ -114,8 +115,6 @@ async fn handle_socket( .await } -/// Unique WAL service connection ids are logged in spans for observability. 
-pub type ConnectionId = u32; pub type ConnectionCount = u32; pub fn issue_connection_id(count: &mut ConnectionCount) -> ConnectionId { diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index 12aa02577185..efcdd89e7da7 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -15,12 +15,13 @@ use desim::{ }; use http::Uri; use safekeeper::{ - safekeeper::{ProposerAcceptorMessage, SafeKeeper, ServerInfo, UNKNOWN_SERVER_VERSION}, + safekeeper::{ProposerAcceptorMessage, SafeKeeper, UNKNOWN_SERVER_VERSION}, state::{TimelinePersistentState, TimelineState}, timeline::TimelineError, wal_storage::Storage, SafeKeeperConf, }; +use safekeeper_api::ServerInfo; use tracing::{debug, info_span, warn}; use utils::{ id::{NodeId, TenantId, TenantTimelineId, TimelineId}, From 2c910628288a29435f102471c6927d3cd24266ba Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 13 Dec 2024 14:52:54 +0000 Subject: [PATCH 04/56] test_prefetch: reduce timeout to default 5m from 10m (#10105) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem `test_prefetch` is flaky (https://github.com/neondatabase/neon/issues/9961), but if it passes, the run time is less than 30 seconds — we don't need an extended timeout for it. ## Summary of changes - Remove extended test timeout for `test_prefetch` --- test_runner/regress/test_prefetch_buffer_resize.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test_runner/regress/test_prefetch_buffer_resize.py b/test_runner/regress/test_prefetch_buffer_resize.py index 7676b78b0e90..99fe80e6218c 100644 --- a/test_runner/regress/test_prefetch_buffer_resize.py +++ b/test_runner/regress/test_prefetch_buffer_resize.py @@ -7,7 +7,6 @@ @pytest.mark.parametrize("shard_count", [None, 4]) -@pytest.mark.timeout(600) def test_prefetch(neon_env_builder: NeonEnvBuilder, shard_count: int | None): if shard_count is not None: neon_env_builder.num_pageservers = shard_count From fcff7528517b47f79a55334c22dc6dc89c113be1 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 13 Dec 2024 17:28:21 +0100 Subject: [PATCH 05/56] fix(test_timeline_archival_chaos): flakiness caused by orphan layers (#10083) The test was failing with the scary but generic message `Remote storage metadata corrupted`. The underlying scrubber error is `Orphan layer detected: ...`. The test kills pageserver at random points, hence it's expected that we leak layers if we're killed in the window after layer upload but before it's referenced from index part. Refer to generation numbers RFC for details. 
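For intuition, here is a minimal sketch of the ordering that creates the orphan window (toy Rust with invented names such as `RemoteStorage` and `flush_and_upload`; it is not the pageserver or scrubber code): the layer object is uploaded before the index that references it, so a kill in between leaves a valid but unreferenced object, which is exactly what the scrubber then flags.

```rust
// Toy model of the upload ordering that creates the orphan window
// (hypothetical names; not the real pageserver or scrubber code).
use std::collections::{HashMap, HashSet};

struct RemoteStorage {
    objects: HashMap<String, Vec<u8>>, // stands in for the S3 bucket contents
    index: HashSet<String>,            // stands in for index_part.json's layer list
}

impl RemoteStorage {
    fn flush_and_upload(&mut self, layer: &str, data: &[u8], killed_before_index_upload: bool) {
        // Step 1: upload the layer object itself.
        self.objects.insert(layer.to_string(), data.to_vec());
        if killed_before_index_upload {
            return; // pageserver killed here: the object exists but nothing references it
        }
        // Step 2: reference it from the index.
        self.index.insert(layer.to_string());
    }

    fn orphans(&self) -> Vec<&String> {
        self.objects
            .keys()
            .filter(|name| !self.index.contains(*name))
            .collect()
    }
}

fn main() {
    let mut remote = RemoteStorage {
        objects: HashMap::new(),
        index: HashSet::new(),
    };
    remote.flush_and_upload("layer-000001", b"layer bytes", true);
    // The leaked object is what the scrubber reports as "Orphan layer detected".
    println!("{:?}", remote.orphans()); // ["layer-000001"]
}
```

That is why the test change below simply whitelists the scrubber's orphan-layer error for this tenant instead of treating it as metadata corruption.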
Refs: - fixes https://github.com/neondatabase/neon/issues/9988 - root-cause analysis https://github.com/neondatabase/neon/issues/9988#issuecomment-2520673167 --- test_runner/regress/test_timeline_archive.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index e808dd13966c..addf702893ff 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -435,6 +435,14 @@ def test_timeline_archival_chaos(neon_env_builder: NeonEnvBuilder): ] ) + env.storage_scrubber.allowed_errors.extend( + [ + # Unclcean shutdowns of pageserver can legitimately result in orphan layers + # (https://github.com/neondatabase/neon/issues/9988#issuecomment-2520558211) + f".*Orphan layer detected: tenants/{tenant_id}/.*" + ] + ) + class TimelineState: def __init__(self): self.timeline_id = TimelineId.generate() From eeabecd89f89a24fa8ee642c916efd97b4e8fa06 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 13 Dec 2024 19:40:26 +0200 Subject: [PATCH 06/56] Correctly update LFC used_pages in case of LFC resize (#10128) ## Problem LFC used_pages statistic is not updated in case of LFC resize (shrinking `neon.file_cache_size_limit`) ## Summary of changes Update `lfc_ctl->used_pages` in `lfc_change_limit_hook` Co-authored-by: Konstantin Knizhnik --- pgxn/neon/file_cache.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 70b250d3945d..f49415be6869 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -365,6 +365,10 @@ lfc_change_limit_hook(int newval, void *extra) neon_log(LOG, "Failed to punch hole in file: %m"); #endif /* We remove the old entry, and re-enter a hole to the hash table */ + for (int i = 0; i < BLOCKS_PER_CHUNK; i++) + { + lfc_ctl->used_pages -= (victim->bitmap[i >> 5] >> (i & 31)) & 1; + } hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL); memset(&holetag, 0, sizeof(holetag)); From 07d1db54b3f0b9c113aa28fcdb85bacf15979cd9 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Fri, 13 Dec 2024 12:10:42 -0600 Subject: [PATCH 07/56] Improve comments and log messages in the logical replication monitor (#9974) Improved comments will help others when they read the code, and the log messages will help others understand why the logical replication monitor works the way it does. Signed-off-by: Tristan Partin --- pgxn/neon/logical_replication_monitor.c | 53 ++++++++++++++----------- 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/pgxn/neon/logical_replication_monitor.c b/pgxn/neon/logical_replication_monitor.c index 5eee5a167911..b94faafdfae9 100644 --- a/pgxn/neon/logical_replication_monitor.c +++ b/pgxn/neon/logical_replication_monitor.c @@ -131,8 +131,8 @@ get_snapshots_cutoff_lsn(void) { cutoff = snapshot_descriptors[logical_replication_max_snap_files - 1].lsn; elog(LOG, - "ls_monitor: dropping logical slots with restart_lsn lower %X/%X, found %zu snapshot files, limit is %d", - LSN_FORMAT_ARGS(cutoff), snapshot_index, logical_replication_max_snap_files); + "ls_monitor: number of snapshot files, %zu, is larger than limit of %d", + snapshot_index, logical_replication_max_snap_files); } /* Is the size of the logical snapshots directory larger than specified? 
@@ -162,8 +162,8 @@ get_snapshots_cutoff_lsn(void) } if (cutoff != original) - elog(LOG, "ls_monitor: dropping logical slots with restart_lsn lower than %X/%X, " SNAPDIR " is larger than %d KB", - LSN_FORMAT_ARGS(cutoff), logical_replication_max_logicalsnapdir_size); + elog(LOG, "ls_monitor: " SNAPDIR " is larger than %d KB", + logical_replication_max_logicalsnapdir_size); } pfree(snapshot_descriptors); @@ -214,9 +214,13 @@ InitLogicalReplicationMonitor(void) } /* - * Unused logical replication slots pins WAL and prevents deletion of snapshots. + * Unused logical replication slots pins WAL and prevent deletion of snapshots. * WAL bloat is guarded by max_slot_wal_keep_size; this bgw removes slots which - * need too many .snap files. + * need too many .snap files. These files are stored as AUX files, which are a + * pageserver mechanism for storing non-relation data. AUX files are shipped in + * in the basebackup which is requested by compute_ctl before Postgres starts. + * The larger the time to retrieve the basebackup, the more likely it is the + * compute will be killed by the control plane due to a timeout. */ void LogicalSlotsMonitorMain(Datum main_arg) @@ -239,10 +243,7 @@ LogicalSlotsMonitorMain(Datum main_arg) ProcessConfigFile(PGC_SIGHUP); } - /* - * If there are too many .snap files, just drop all logical slots to - * prevent aux files bloat. - */ + /* Get the cutoff LSN */ cutoff_lsn = get_snapshots_cutoff_lsn(); if (cutoff_lsn > 0) { @@ -252,31 +253,37 @@ LogicalSlotsMonitorMain(Datum main_arg) ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i]; XLogRecPtr restart_lsn; - /* find the name */ LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); - /* Consider only logical repliction slots */ + + /* Consider only active logical repliction slots */ if (!s->in_use || !SlotIsLogical(s)) { LWLockRelease(ReplicationSlotControlLock); continue; } - /* do we need to drop it? */ + /* + * Retrieve the restart LSN to determine if we need to drop the + * slot + */ SpinLockAcquire(&s->mutex); restart_lsn = s->data.restart_lsn; SpinLockRelease(&s->mutex); + + strlcpy(slot_name, s->data.name.data, sizeof(slot_name)); + LWLockRelease(ReplicationSlotControlLock); + if (restart_lsn >= cutoff_lsn) { - LWLockRelease(ReplicationSlotControlLock); + elog(LOG, "ls_monitor: not dropping replication slot %s because restart LSN %X/%X is greater than cutoff LSN %X/%X", + slot_name, LSN_FORMAT_ARGS(restart_lsn), LSN_FORMAT_ARGS(cutoff_lsn)); continue; } - strlcpy(slot_name, s->data.name.data, NAMEDATALEN); - elog(LOG, "ls_monitor: dropping slot %s with restart_lsn %X/%X below horizon %X/%X", + elog(LOG, "ls_monitor: dropping replication slot %s because restart LSN %X/%X lower than cutoff LSN %X/%X", slot_name, LSN_FORMAT_ARGS(restart_lsn), LSN_FORMAT_ARGS(cutoff_lsn)); - LWLockRelease(ReplicationSlotControlLock); - /* now try to drop it, killing owner before if any */ + /* now try to drop it, killing owner before, if any */ for (;;) { pid_t active_pid; @@ -288,9 +295,9 @@ LogicalSlotsMonitorMain(Datum main_arg) if (active_pid == 0) { /* - * Slot is releasted, try to drop it. Though of course + * Slot is released, try to drop it. Though of course, * it could have been reacquired, so drop can ERROR - * out. Similarly it could have been dropped in the + * out. Similarly, it could have been dropped in the * meanwhile. 
* * In principle we could remove pg_try/pg_catch, that @@ -300,14 +307,14 @@ LogicalSlotsMonitorMain(Datum main_arg) PG_TRY(); { ReplicationSlotDrop(slot_name, true); - elog(LOG, "ls_monitor: slot %s dropped", slot_name); + elog(LOG, "ls_monitor: replication slot %s dropped", slot_name); } PG_CATCH(); { /* log ERROR and reset elog stack */ EmitErrorReport(); FlushErrorState(); - elog(LOG, "ls_monitor: failed to drop slot %s", slot_name); + elog(LOG, "ls_monitor: failed to drop replication slot %s", slot_name); } PG_END_TRY(); break; @@ -315,7 +322,7 @@ LogicalSlotsMonitorMain(Datum main_arg) else { /* kill the owner and wait for release */ - elog(LOG, "ls_monitor: killing slot %s owner %d", slot_name, active_pid); + elog(LOG, "ls_monitor: killing replication slot %s owner %d", slot_name, active_pid); (void) kill(active_pid, SIGTERM); /* We shouldn't get stuck, but to be safe add timeout. */ ConditionVariableTimedSleep(&s->active_cv, 1000, WAIT_EVENT_REPLICATION_SLOT_DROP); From 7ee5dca752c9e9b7e65752c2561798bdd91ea3f6 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Fri, 13 Dec 2024 13:22:25 -0500 Subject: [PATCH 08/56] fix(pageserver): race between gc-compaction and repartition (#10127) ## Problem close https://github.com/neondatabase/neon/issues/10124 gc-compaction split_gc_jobs is holding the repartition lock for too long time. ## Summary of changes * Ensure split_gc_compaction_jobs drops the repartition lock once it finishes cloning the structures. * Update comments. --------- Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline.rs | 5 ++++- pageserver/src/tenant/timeline/compaction.rs | 8 +++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index b5c707922668..75e268a1b9bc 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4064,8 +4064,11 @@ impl Timeline { // NB: there are two callers, one is the compaction task, of which there is only one per struct Tenant and hence Timeline. // The other is the initdb optimization in flush_frozen_layer, used by `boostrap_timeline`, which runs before `.activate()` // and hence before the compaction task starts. + // Note that there are a third "caller" that will take the `partitioning` lock. It is `gc_compaction_split_jobs` for + // gc-compaction where it uses the repartition data to determine the split jobs. In the future, it might use its own + // heuristics, but for now, we should allow concurrent access to it and let the caller retry compaction. return Err(CompactionError::Other(anyhow!( - "repartition() called concurrently, this should not happen" + "repartition() called concurrently, this is rare and a retry should be fine" ))); }; let ((dense_partition, sparse_partition), partition_lsn) = &*partitioning_guard; diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 701247194ba4..5e6290729c0c 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -1821,10 +1821,12 @@ impl Timeline { let mut compact_jobs = Vec::new(); // For now, we simply use the key partitioning information; we should do a more fine-grained partitioning // by estimating the amount of files read for a compaction job. We should also partition on LSN. 
- let Ok(partition) = self.partitioning.try_lock() else { - bail!("failed to acquire partition lock"); + let ((dense_ks, sparse_ks), _) = { + let Ok(partition) = self.partitioning.try_lock() else { + bail!("failed to acquire partition lock"); + }; + partition.clone() }; - let ((dense_ks, sparse_ks), _) = &*partition; // Truncate the key range to be within user specified compaction range. fn truncate_to( source_start: &Key, From d56fea680ec2f2a741a3383c63a55eadf86ad602 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 13 Dec 2024 19:56:32 +0000 Subject: [PATCH 09/56] CI: always require aws-oicd-role-arn input to be set (#10145) ## Problem `benchmarking` job fails because `aws-oicd-role-arn` input is not set ## Summary of changes: - Set `aws-oicd-role-arn` for `benchmarking job - Always require `aws-oicd-role-arn` to be set - Rename `aws_oicd_role_arn` to `aws-oicd-role-arn` for consistency --- .../actions/allure-report-generate/action.yml | 14 +++--- .../actions/allure-report-store/action.yml | 14 +++--- .github/actions/download/action.yml | 12 ++--- .../actions/run-python-test-set/action.yml | 25 +++++----- .github/actions/save-coverage-data/action.yml | 4 +- .github/actions/upload/action.yml | 4 +- .../workflows/_benchmarking_preparation.yml | 2 +- .github/workflows/_build-and-test-locally.yml | 4 +- .github/workflows/benchmarking.yml | 48 +++++++++---------- .github/workflows/build_and_test.yml | 13 +++-- .github/workflows/cloud-regress.yml | 5 +- .github/workflows/ingest_benchmark.yml | 16 +++---- .github/workflows/periodic_pagebench.yml | 2 +- .github/workflows/pg-clients.yml | 10 ++-- 14 files changed, 88 insertions(+), 85 deletions(-) diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml index d6219c31b4a5..d07e3e32e8f2 100644 --- a/.github/actions/allure-report-generate/action.yml +++ b/.github/actions/allure-report-generate/action.yml @@ -7,10 +7,9 @@ inputs: type: boolean required: false default: false - aws_oicd_role_arn: - description: 'the OIDC role arn to (re-)acquire for allure report upload - if not set call must acquire OIDC role' - required: false - default: '' + aws-oicd-role-arn: + description: 'OIDC role arn to interract with S3' + required: true outputs: base-url: @@ -84,12 +83,11 @@ runs: ALLURE_VERSION: 2.27.0 ALLURE_ZIP_SHA256: b071858fb2fa542c65d8f152c5c40d26267b2dfb74df1f1608a589ecca38e777 - - name: (Re-)configure AWS credentials # necessary to upload reports to S3 after a long-running test - if: ${{ !cancelled() && (inputs.aws_oicd_role_arn != '') }} - uses: aws-actions/configure-aws-credentials@v4 + - uses: aws-actions/configure-aws-credentials@v4 + if: ${{ !cancelled() }} with: aws-region: eu-central-1 - role-to-assume: ${{ inputs.aws_oicd_role_arn }} + role-to-assume: ${{ inputs.aws-oicd-role-arn }} role-duration-seconds: 3600 # 1 hour should be more than enough to upload report # Potentially we could have several running build for the same key (for example, for the main branch), so we use improvised lock for this diff --git a/.github/actions/allure-report-store/action.yml b/.github/actions/allure-report-store/action.yml index 3c83656c8940..8548a886cf34 100644 --- a/.github/actions/allure-report-store/action.yml +++ b/.github/actions/allure-report-store/action.yml @@ -8,10 +8,9 @@ inputs: unique-key: description: 'string to distinguish different results in the same run' required: true - aws_oicd_role_arn: - description: 'the OIDC role arn to (re-)acquire for allure report upload - if not 
set call must acquire OIDC role' - required: false - default: '' + aws-oicd-role-arn: + description: 'OIDC role arn to interract with S3' + required: true runs: using: "composite" @@ -36,12 +35,11 @@ runs: env: REPORT_DIR: ${{ inputs.report-dir }} - - name: (Re-)configure AWS credentials # necessary to upload reports to S3 after a long-running test - if: ${{ !cancelled() && (inputs.aws_oicd_role_arn != '') }} - uses: aws-actions/configure-aws-credentials@v4 + - uses: aws-actions/configure-aws-credentials@v4 + if: ${{ !cancelled() }} with: aws-region: eu-central-1 - role-to-assume: ${{ inputs.aws_oicd_role_arn }} + role-to-assume: ${{ inputs.aws-oicd-role-arn }} role-duration-seconds: 3600 # 1 hour should be more than enough to upload report - name: Upload test results diff --git a/.github/actions/download/action.yml b/.github/actions/download/action.yml index d6b1fac9f70c..14b2ef8eace6 100644 --- a/.github/actions/download/action.yml +++ b/.github/actions/download/action.yml @@ -15,19 +15,17 @@ inputs: prefix: description: "S3 prefix. Default is '${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'" required: false - aws_oicd_role_arn: - description: "the OIDC role arn for aws auth" - required: false - default: "" + aws-oicd-role-arn: + description: 'OIDC role arn to interract with S3' + required: true runs: using: "composite" steps: - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v4 + - uses: aws-actions/configure-aws-credentials@v4 with: aws-region: eu-central-1 - role-to-assume: ${{ inputs.aws_oicd_role_arn }} + role-to-assume: ${{ inputs.aws-oicd-role-arn }} role-duration-seconds: 3600 - name: Download artifact diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index dd5c890f5bbd..9a0261d43045 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -48,10 +48,9 @@ inputs: description: 'benchmark durations JSON' required: false default: '{}' - aws_oicd_role_arn: - description: 'the OIDC role arn to (re-)acquire for allure report upload - if not set call must acquire OIDC role' - required: false - default: '' + aws-oicd-role-arn: + description: 'OIDC role arn to interract with S3' + required: true runs: using: "composite" @@ -62,7 +61,7 @@ runs: with: name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}-artifact path: /tmp/neon - aws_oicd_role_arn: ${{ inputs.aws_oicd_role_arn }} + aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }} - name: Download Neon binaries for the previous release if: inputs.build_type != 'remote' @@ -71,7 +70,7 @@ runs: name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}-artifact path: /tmp/neon-previous prefix: latest - aws_oicd_role_arn: ${{ inputs.aws_oicd_role_arn }} + aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }} - name: Download compatibility snapshot if: inputs.build_type != 'remote' @@ -83,7 +82,7 @@ runs: # The lack of compatibility snapshot (for example, for the new Postgres version) # shouldn't fail the whole job. Only relevant test should fail. 
skip-if-does-not-exist: true - aws_oicd_role_arn: ${{ inputs.aws_oicd_role_arn }} + aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }} - name: Checkout if: inputs.needs_postgres_source == 'true' @@ -221,19 +220,19 @@ runs: # The lack of compatibility snapshot shouldn't fail the job # (for example if we didn't run the test for non build-and-test workflow) skip-if-does-not-exist: true - aws_oicd_role_arn: ${{ inputs.aws_oicd_role_arn }} + aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }} - - name: (Re-)configure AWS credentials # necessary to upload reports to S3 after a long-running test - if: ${{ !cancelled() && (inputs.aws_oicd_role_arn != '') }} - uses: aws-actions/configure-aws-credentials@v4 + - uses: aws-actions/configure-aws-credentials@v4 + if: ${{ !cancelled() }} with: aws-region: eu-central-1 - role-to-assume: ${{ inputs.aws_oicd_role_arn }} + role-to-assume: ${{ inputs.aws-oicd-role-arn }} role-duration-seconds: 3600 # 1 hour should be more than enough to upload report + - name: Upload test results if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-store with: report-dir: /tmp/test_output/allure/results unique-key: ${{ inputs.build_type }}-${{ inputs.pg_version }} - aws_oicd_role_arn: ${{ inputs.aws_oicd_role_arn }} + aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }} diff --git a/.github/actions/save-coverage-data/action.yml b/.github/actions/save-coverage-data/action.yml index 9e3a7cba24f2..1bbea5400fe2 100644 --- a/.github/actions/save-coverage-data/action.yml +++ b/.github/actions/save-coverage-data/action.yml @@ -14,11 +14,11 @@ runs: name: coverage-data-artifact path: /tmp/coverage skip-if-does-not-exist: true # skip if there's no previous coverage to download - aws_oicd_role_arn: ${{ inputs.aws_oicd_role_arn }} + aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }} - name: Upload coverage data uses: ./.github/actions/upload with: name: coverage-data-artifact path: /tmp/coverage - aws_oicd_role_arn: ${{ inputs.aws_oicd_role_arn }} + aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }} diff --git a/.github/actions/upload/action.yml b/.github/actions/upload/action.yml index 6616d0889933..ac5579ccea61 100644 --- a/.github/actions/upload/action.yml +++ b/.github/actions/upload/action.yml @@ -14,7 +14,7 @@ inputs: prefix: description: "S3 prefix. 
Default is '${GITHUB_SHA}/${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'" required: false - aws_oicd_role_arn: + aws-oicd-role-arn: description: "the OIDC role arn for aws auth" required: false default: "" @@ -61,7 +61,7 @@ runs: uses: aws-actions/configure-aws-credentials@v4 with: aws-region: eu-central-1 - role-to-assume: ${{ inputs.aws_oicd_role_arn }} + role-to-assume: ${{ inputs.aws-oicd-role-arn }} role-duration-seconds: 3600 - name: Upload artifact diff --git a/.github/workflows/_benchmarking_preparation.yml b/.github/workflows/_benchmarking_preparation.yml index 371d815fc8a6..fd328586b3c0 100644 --- a/.github/workflows/_benchmarking_preparation.yml +++ b/.github/workflows/_benchmarking_preparation.yml @@ -70,7 +70,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} # we create a table that has one row for each database that we want to restore with the status whether the restore is done - name: Create benchmark_restore_status table if it does not exist diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 456399f3c360..4263bacce8d1 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -264,7 +264,7 @@ jobs: with: name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-artifact path: /tmp/neon - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} # XXX: keep this after the binaries.list is formed, so the coverage can properly work later - name: Merge and upload coverage data @@ -308,7 +308,7 @@ jobs: real_s3_region: eu-central-1 rerun_failed: true pg_version: ${{ matrix.pg_version }} - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} CHECK_ONDISK_DATA_COMPATIBILITY: nonempty diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 2d37be883723..bbdcf5ef493f 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -105,7 +105,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Neon Project id: create-neon-project @@ -123,7 +123,7 @@ jobs: run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} # Set --sparse-ordering option of pytest-order plugin # to ensure tests are running in order of appears in the file. 
# It's important for test_perf_pgbench.py::test_pgbench_remote_* tests @@ -153,7 +153,7 @@ jobs: if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} @@ -205,7 +205,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Run Logical Replication benchmarks uses: ./.github/actions/run-python-test-set @@ -216,7 +216,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 5400 pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -233,7 +233,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 5400 pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -245,7 +245,7 @@ jobs: uses: ./.github/actions/allure-report-generate with: store-test-results-into-db: true - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} @@ -407,7 +407,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Neon Project if: contains(fromJson('["neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform) @@ -455,7 +455,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_init pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -470,7 +470,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_simple_update pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -485,7 +485,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_select_only pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -503,7 +503,7 @@ jobs: if: ${{ 
!cancelled() }} uses: ./.github/actions/allure-report-generate with: - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} @@ -614,7 +614,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_pgvector_indexing pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -629,7 +629,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -640,7 +640,7 @@ jobs: if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} @@ -711,7 +711,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Set up Connection String id: set-up-connstr @@ -743,7 +743,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 43200 -k test_clickbench pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -757,7 +757,7 @@ jobs: if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} @@ -822,7 +822,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Get Connstring Secret Name run: | @@ -861,7 +861,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_tpch pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -873,7 +873,7 @@ jobs: if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} @@ -931,7 +931,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + 
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Set up Connection String id: set-up-connstr @@ -963,7 +963,7 @@ jobs: save_perf_report: ${{ env.SAVE_PERF_REPORT }} extra_params: -m remote_cluster --timeout 21600 -k test_user_examples pg_version: ${{ env.DEFAULT_PG_VERSION }} - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -974,7 +974,7 @@ jobs: if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index b3556debe3fa..55c4bf08b90d 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -336,6 +336,7 @@ jobs: extra_params: --splits 5 --group ${{ matrix.pytest_split_group }} benchmark_durations: ${{ needs.get-benchmarks-durations.outputs.json }} pg_version: v16 + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" @@ -393,7 +394,7 @@ jobs: uses: ./.github/actions/allure-report-generate with: store-test-results-into-db: true - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} @@ -455,14 +456,14 @@ jobs: with: name: neon-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-artifact path: /tmp/neon - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Get coverage artifact uses: ./.github/actions/download with: name: coverage-data-artifact path: /tmp/coverage - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Merge coverage data run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge @@ -1279,6 +1280,12 @@ jobs: echo "run-id=${run_id}" | tee -a ${GITHUB_OUTPUT} echo "commit-sha=${last_commit_sha}" | tee -a ${GITHUB_OUTPUT} + - uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 3600 + - name: Promote compatibility snapshot and Neon artifact env: BUCKET: neon-github-public-dev diff --git a/.github/workflows/cloud-regress.yml b/.github/workflows/cloud-regress.yml index 7b9e434ec3ff..55f42ea5332b 100644 --- a/.github/workflows/cloud-regress.yml +++ b/.github/workflows/cloud-regress.yml @@ -79,7 +79,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create a new branch id: create-branch @@ -95,6 +95,7 @@ jobs: test_selection: cloud_regress pg_version: ${{matrix.pg-version}} extra_params: -m remote_cluster + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{steps.create-branch.outputs.dsn}} @@ -111,7 +112,7 @@ jobs: if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: - aws_oicd_role_arn: ${{ 
vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} diff --git a/.github/workflows/ingest_benchmark.yml b/.github/workflows/ingest_benchmark.yml index 677303226395..fc33c0a980a6 100644 --- a/.github/workflows/ingest_benchmark.yml +++ b/.github/workflows/ingest_benchmark.yml @@ -13,7 +13,7 @@ on: # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) - cron: '0 9 * * *' # run once a day, timezone is utc workflow_dispatch: # adds ability to run this manually - + defaults: run: shell: bash -euxo pipefail {0} @@ -28,7 +28,7 @@ jobs: strategy: fail-fast: false # allow other variants to continue even if one fails matrix: - target_project: [new_empty_project, large_existing_project] + target_project: [new_empty_project, large_existing_project] permissions: contents: write statuses: write @@ -56,7 +56,7 @@ jobs: with: aws-region: eu-central-1 role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - role-duration-seconds: 18000 # 5 hours is currently max associated with IAM role + role-duration-seconds: 18000 # 5 hours is currently max associated with IAM role - name: Download Neon artifact uses: ./.github/actions/download @@ -64,7 +64,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Neon Project if: ${{ matrix.target_project == 'new_empty_project' }} @@ -95,7 +95,7 @@ jobs: project_id: ${{ vars.BENCHMARK_INGEST_TARGET_PROJECTID }} api_key: ${{ secrets.NEON_STAGING_API_KEY }} - - name: Initialize Neon project + - name: Initialize Neon project if: ${{ matrix.target_project == 'large_existing_project' }} env: BENCHMARK_INGEST_TARGET_CONNSTR: ${{ steps.create-neon-branch-ingest-target.outputs.dsn }} @@ -123,7 +123,7 @@ jobs: ${PSQL} "${BENCHMARK_INGEST_TARGET_CONNSTR}" -c "CREATE EXTENSION IF NOT EXISTS neon; CREATE EXTENSION IF NOT EXISTS neon_utils;" echo "BENCHMARK_INGEST_TARGET_CONNSTR=${BENCHMARK_INGEST_TARGET_CONNSTR}" >> $GITHUB_ENV - - name: Invoke pgcopydb + - name: Invoke pgcopydb uses: ./.github/actions/run-python-test-set with: build_type: remote @@ -132,7 +132,7 @@ jobs: extra_params: -s -m remote_cluster --timeout 86400 -k test_ingest_performance_using_pgcopydb pg_version: v16 save_perf_report: true - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_INGEST_SOURCE_CONNSTR: ${{ secrets.BENCHMARK_INGEST_SOURCE_CONNSTR }} TARGET_PROJECT_TYPE: ${{ matrix.target_project }} @@ -144,7 +144,7 @@ jobs: run: | export LD_LIBRARY_PATH=${PG_16_LIB_PATH} ${PSQL} "${BENCHMARK_INGEST_TARGET_CONNSTR}" -c "\dt+" - + - name: Delete Neon Project if: ${{ always() && matrix.target_project == 'new_empty_project' }} uses: ./.github/actions/neon-project-delete diff --git a/.github/workflows/periodic_pagebench.yml b/.github/workflows/periodic_pagebench.yml index 049990f17beb..af877029e49a 100644 --- a/.github/workflows/periodic_pagebench.yml +++ b/.github/workflows/periodic_pagebench.yml @@ -137,7 +137,7 @@ jobs: if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate with: - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Post to a Slack channel if: ${{ github.event.schedule && failure() }} diff --git a/.github/workflows/pg-clients.yml b/.github/workflows/pg-clients.yml 
index 5c999d3810c6..4947907eb068 100644 --- a/.github/workflows/pg-clients.yml +++ b/.github/workflows/pg-clients.yml @@ -96,7 +96,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Neon Project id: create-neon-project @@ -113,6 +113,7 @@ jobs: run_in_parallel: false extra_params: -m remote_cluster pg_version: ${{ env.DEFAULT_PG_VERSION }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} @@ -129,7 +130,7 @@ jobs: uses: ./.github/actions/allure-report-generate with: store-test-results-into-db: true - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} @@ -163,7 +164,7 @@ jobs: name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Neon Project id: create-neon-project @@ -180,6 +181,7 @@ jobs: run_in_parallel: false extra_params: -m remote_cluster pg_version: ${{ env.DEFAULT_PG_VERSION }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} @@ -196,7 +198,7 @@ jobs: uses: ./.github/actions/allure-report-generate with: store-test-results-into-db: true - aws_oicd_role_arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} From 2521eba6741f000d71c1f6e0d2d0b279c8465f47 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 13 Dec 2024 22:46:41 +0200 Subject: [PATCH 10/56] Check for invalid down link while prefetching B-Tree leave pages for index-only scan (#9867) ## Problem See #9866 Index-only scan prefetch implementation doesn't take in account that down link may be invalid ## Summary of changes Check that downlink is valid block number Correspondent Postgres PRs: https://github.com/neondatabase/postgres/pull/534 https://github.com/neondatabase/postgres/pull/535 https://github.com/neondatabase/postgres/pull/536 https://github.com/neondatabase/postgres/pull/537 --------- Co-authored-by: Konstantin Knizhnik --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/postgres-v17 | 2 +- vendor/revisions.json | 8 ++++---- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 13ff324150fc..c2f65b320159 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 13ff324150fceaac72920e01742addc053db9462 +Subproject commit c2f65b3201591e02ce45b66731392f98d3388e73 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 8736b10c1d93..f262d631ad47 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 8736b10c1d93d11b9c0489872dd529c4c0f5338f +Subproject commit f262d631ad477a1819e84a183e5a7ef561830085 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 81428621f7c0..97f9fde349c6 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 81428621f7c04aed03671cf80a928e0a36d92505 +Subproject commit 97f9fde349c6de6d573f5ce96db07eca60ce6185 diff --git 
a/vendor/postgres-v17 b/vendor/postgres-v17 index 01fa3c48664c..010c0ea2eb06 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit 01fa3c48664ca030cfb69bb4a350aa9df4691d88 +Subproject commit 010c0ea2eb06afe76485a33c43954cbcf3d99f86 diff --git a/vendor/revisions.json b/vendor/revisions.json index 7329aa437f05..afcb922a5efc 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ "17.2", - "01fa3c48664ca030cfb69bb4a350aa9df4691d88" + "010c0ea2eb06afe76485a33c43954cbcf3d99f86" ], "v16": [ "16.6", - "81428621f7c04aed03671cf80a928e0a36d92505" + "97f9fde349c6de6d573f5ce96db07eca60ce6185" ], "v15": [ "15.10", - "8736b10c1d93d11b9c0489872dd529c4c0f5338f" + "f262d631ad477a1819e84a183e5a7ef561830085" ], "v14": [ "14.15", - "13ff324150fceaac72920e01742addc053db9462" + "c2f65b3201591e02ce45b66731392f98d3388e73" ] } From cf161e1556229dae17b5b73e03c3b5781f7bf952 Mon Sep 17 00:00:00 2001 From: Mikhail Kot Date: Sat, 14 Dec 2024 17:37:13 +0000 Subject: [PATCH 11/56] fix(adapter): password not set in role drop (#10130) ## Problem When entry was dropped and password wasn't set, new entry had uninitialized memory in controlplane adapter Resolves: https://github.com/neondatabase/cloud/issues/14914 ## Summary of changes Initialize password in all cases, add tests. Minor formatting for less indentation --- pgxn/neon/control_plane_connector.c | 41 ++++++++++------------ test_runner/regress/test_ddl_forwarding.py | 29 +++++++++++---- 2 files changed, 40 insertions(+), 30 deletions(-) diff --git a/pgxn/neon/control_plane_connector.c b/pgxn/neon/control_plane_connector.c index b47b22cd20dc..59096a1bc8c0 100644 --- a/pgxn/neon/control_plane_connector.c +++ b/pgxn/neon/control_plane_connector.c @@ -428,6 +428,8 @@ MergeTable() hash_seq_init(&status, old_table->role_table); while ((entry = hash_seq_search(&status)) != NULL) { + RoleEntry * old; + bool found_old = false; RoleEntry *to_write = hash_search( CurrentDdlTable->role_table, entry->name, @@ -435,30 +437,23 @@ MergeTable() NULL); to_write->type = entry->type; - if (entry->password) - to_write->password = entry->password; + to_write->password = entry->password; strlcpy(to_write->old_name, entry->old_name, NAMEDATALEN); - if (entry->old_name[0] != '\0') - { - bool found_old = false; - RoleEntry *old = hash_search( - CurrentDdlTable->role_table, - entry->old_name, - HASH_FIND, - &found_old); - - if (found_old) - { - if (old->old_name[0] != '\0') - strlcpy(to_write->old_name, old->old_name, NAMEDATALEN); - else - strlcpy(to_write->old_name, entry->old_name, NAMEDATALEN); - hash_search(CurrentDdlTable->role_table, - entry->old_name, - HASH_REMOVE, - NULL); - } - } + if (entry->old_name[0] == '\0') + continue; + + old = hash_search( + CurrentDdlTable->role_table, + entry->old_name, + HASH_FIND, + &found_old); + if (!found_old) + continue; + strlcpy(to_write->old_name, old->old_name, NAMEDATALEN); + hash_search(CurrentDdlTable->role_table, + entry->old_name, + HASH_REMOVE, + NULL); } hash_destroy(old_table->role_table); } diff --git a/test_runner/regress/test_ddl_forwarding.py b/test_runner/regress/test_ddl_forwarding.py index de44bbcbc895..b10e38885e42 100644 --- a/test_runner/regress/test_ddl_forwarding.py +++ b/test_runner/regress/test_ddl_forwarding.py @@ -60,14 +60,12 @@ def ddl_forward_handler( if request.json is None: log.info("Received invalid JSON") return Response(status=400) - json = request.json + json: dict[str, list[str]] = request.json # Handle roles first - if "roles" in json: - for 
operation in json["roles"]: - handle_role(dbs, roles, operation) - if "dbs" in json: - for operation in json["dbs"]: - handle_db(dbs, roles, operation) + for operation in json.get("roles", []): + handle_role(dbs, roles, operation) + for operation in json.get("dbs", []): + handle_db(dbs, roles, operation) return Response(status=200) @@ -207,6 +205,23 @@ def test_ddl_forwarding(ddl: DdlForwardingContext): ddl.wait() assert ddl.roles == {} + cur.execute("CREATE ROLE bork WITH PASSWORD 'newyork'") + cur.execute("BEGIN") + cur.execute("SAVEPOINT point") + cur.execute("DROP ROLE bork") + cur.execute("COMMIT") + ddl.wait() + assert ddl.roles == {} + + cur.execute("CREATE ROLE bork WITH PASSWORD 'oldyork'") + cur.execute("BEGIN") + cur.execute("SAVEPOINT point") + cur.execute("ALTER ROLE bork PASSWORD NULL") + cur.execute("COMMIT") + cur.execute("DROP ROLE bork") + ddl.wait() + assert ddl.roles == {} + cur.execute("CREATE ROLE bork WITH PASSWORD 'dork'") cur.execute("CREATE DATABASE stork WITH OWNER=bork") cur.execute("ALTER ROLE bork RENAME TO cork") From f3ecd5d76ad8b858b2bfaaabba5018046aca46ac Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Sun, 15 Dec 2024 10:45:12 +0100 Subject: [PATCH 12/56] pageserver: revert flush backpressure (#8550) (#10135) ## Problem In #8550, we made the flush loop wait for uploads after every layer. This was to avoid unbounded buildup of uploads, and to reduce compaction debt. However, the approach has several problems: * It prevents upload parallelism. * It prevents flush and upload pipelining. * It slows down ingestion even when there is no need to backpressure. * It does not directly backpressure WAL ingestion (only via `disk_consistent_lsn`), and will build up in-memory layers. * It does not directly backpressure based on compaction debt and read amplification. An alternative solution to these problems is proposed in #8390. In the meanwhile, we revert the change to reduce the impact on ingest throughput. This does reintroduce some risk of unbounded upload/compaction buildup. Until https://github.com/neondatabase/neon/issues/8390, this can be addressed in other ways: * Use `max_replication_apply_lag` (aka `remote_consistent_lsn`), which will more directly limit upload debt. * Shard the tenant, which will spread the flush/upload work across more Pageservers and move the bottleneck to Safekeeper. Touches #10095. ## Summary of changes Remove waiting on the upload queue in the flush loop. 
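For illustration, a minimal sketch of the behavioural difference (toy Rust assuming the `tokio` crate with full features; names like `flush_with_per_layer_wait` are invented and this is not the actual flush loop or upload queue): waiting for uploads after every layer serializes flushing and uploading, while merely scheduling the upload lets the two pipeline.

```rust
// Toy contrast between the old per-layer wait and the pipelined behaviour
// (hypothetical names; not the real pageserver flush loop or upload queue).
use std::time::Duration;
use tokio::task::JoinSet;

async fn upload_layer(layer: u32) {
    tokio::time::sleep(Duration::from_millis(50)).await; // stands in for the remote upload
    println!("uploaded layer {layer}");
}

// Old behaviour: flush stalls on every upload, so there is no upload parallelism
// and ingest slows down even when no backpressure is needed.
async fn flush_with_per_layer_wait(layers: &[u32]) {
    for &layer in layers {
        println!("flushed layer {layer}");
        upload_layer(layer).await; // the removed wait-for-uploads step
    }
}

// New behaviour: the flush loop only schedules uploads; they drain concurrently
// while flushing continues, and backpressure has to be applied elsewhere.
async fn flush_pipelined(layers: &[u32]) {
    let mut uploads = JoinSet::new();
    for &layer in layers {
        println!("flushed layer {layer}");
        uploads.spawn(upload_layer(layer));
    }
    while uploads.join_next().await.is_some() {} // only for the toy's clean shutdown
}

#[tokio::main]
async fn main() {
    let layers: Vec<u32> = (0..4).collect();
    flush_with_per_layer_wait(&layers).await;
    flush_pipelined(&layers).await;
}
```

The trade-off, as noted above, is that upload debt is no longer bounded per layer and must be limited by other means, e.g. `max_replication_apply_lag` or sharding.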
--- pageserver/src/metrics.rs | 25 +---------- pageserver/src/tenant/timeline.rs | 38 ++++------------- test_runner/fixtures/metrics.py | 1 - test_runner/regress/test_branching.py | 13 ++---- test_runner/regress/test_remote_storage.py | 48 ---------------------- 5 files changed, 13 insertions(+), 112 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index b4e20cb8b90e..bdbabf3f7511 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -3,7 +3,7 @@ use metrics::{ register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec, register_int_counter, register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge, register_uint_gauge_vec, - Counter, CounterVec, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair, + Counter, CounterVec, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair, IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, }; use once_cell::sync::Lazy; @@ -445,15 +445,6 @@ pub(crate) static WAIT_LSN_TIME: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -static FLUSH_WAIT_UPLOAD_TIME: Lazy = Lazy::new(|| { - register_gauge_vec!( - "pageserver_flush_wait_upload_seconds", - "Time spent waiting for preceding uploads during layer flush", - &["tenant_id", "shard_id", "timeline_id"] - ) - .expect("failed to define a metric") -}); - static LAST_RECORD_LSN: Lazy = Lazy::new(|| { register_int_gauge_vec!( "pageserver_last_record_lsn", @@ -2586,7 +2577,6 @@ pub(crate) struct TimelineMetrics { shard_id: String, timeline_id: String, pub flush_time_histo: StorageTimeMetrics, - pub flush_wait_upload_time_gauge: Gauge, pub compact_time_histo: StorageTimeMetrics, pub create_images_time_histo: StorageTimeMetrics, pub logical_size_histo: StorageTimeMetrics, @@ -2632,9 +2622,6 @@ impl TimelineMetrics { &shard_id, &timeline_id, ); - let flush_wait_upload_time_gauge = FLUSH_WAIT_UPLOAD_TIME - .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) - .unwrap(); let compact_time_histo = StorageTimeMetrics::new( StorageTimeOperation::Compact, &tenant_id, @@ -2780,7 +2767,6 @@ impl TimelineMetrics { shard_id, timeline_id, flush_time_histo, - flush_wait_upload_time_gauge, compact_time_histo, create_images_time_histo, logical_size_histo, @@ -2830,14 +2816,6 @@ impl TimelineMetrics { self.resident_physical_size_gauge.get() } - pub(crate) fn flush_wait_upload_time_gauge_add(&self, duration: f64) { - self.flush_wait_upload_time_gauge.add(duration); - crate::metrics::FLUSH_WAIT_UPLOAD_TIME - .get_metric_with_label_values(&[&self.tenant_id, &self.shard_id, &self.timeline_id]) - .unwrap() - .add(duration); - } - pub(crate) fn shutdown(&self) { let was_shutdown = self .shutdown @@ -2855,7 +2833,6 @@ impl TimelineMetrics { let shard_id = &self.shard_id; let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = DISK_CONSISTENT_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]); - let _ = FLUSH_WAIT_UPLOAD_TIME.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = STANDBY_HORIZON.remove_label_values(&[tenant_id, shard_id, timeline_id]); { RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get()); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 75e268a1b9bc..0416953c1f3a 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -144,19 +144,15 @@ use 
self::layer_manager::LayerManager; use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; +use super::config::TenantConf; +use super::remote_timeline_client::index::IndexPart; +use super::remote_timeline_client::RemoteTimelineClient; +use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline}; +use super::storage_layer::{LayerFringe, LayerVisibilityHint, ReadableLayer}; +use super::upload_queue::NotInitialized; +use super::GcError; use super::{ - config::TenantConf, storage_layer::LayerVisibilityHint, upload_queue::NotInitialized, - MaybeOffloaded, -}; -use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf}; -use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe}; -use super::{ - remote_timeline_client::RemoteTimelineClient, remote_timeline_client::WaitCompletionError, - storage_layer::ReadableLayer, -}; -use super::{ - secondary::heatmap::{HeatMapLayer, HeatMapTimeline}, - GcError, + debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf, MaybeOffloaded, }; #[cfg(test)] @@ -3897,24 +3893,6 @@ impl Timeline { // release lock on 'layers' }; - // Backpressure mechanism: wait with continuation of the flush loop until we have uploaded all layer files. - // This makes us refuse ingest until the new layers have been persisted to the remote - let start = Instant::now(); - self.remote_client - .wait_completion() - .await - .map_err(|e| match e { - WaitCompletionError::UploadQueueShutDownOrStopped - | WaitCompletionError::NotInitialized( - NotInitialized::ShuttingDown | NotInitialized::Stopped, - ) => FlushLayerError::Cancelled, - WaitCompletionError::NotInitialized(NotInitialized::Uninitialized) => { - FlushLayerError::Other(anyhow!(e).into()) - } - })?; - let duration = start.elapsed().as_secs_f64(); - self.metrics.flush_wait_upload_time_gauge_add(duration); - // FIXME: between create_delta_layer and the scheduling of the upload in `update_metadata_file`, // a compaction can delete the file and then it won't be available for uploads any more. 
// We still schedule the upload, resulting in an error, but ideally we'd somehow avoid this diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index c5295360c339..eb3d06b94959 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -170,7 +170,6 @@ def counter(name: str) -> str: "pageserver_evictions_with_low_residence_duration_total", "pageserver_aux_file_estimated_size", "pageserver_valid_lsn_lease_count", - "pageserver_flush_wait_upload_seconds", counter("pageserver_tenant_throttling_count_accounted_start"), counter("pageserver_tenant_throttling_count_accounted_finish"), counter("pageserver_tenant_throttling_wait_usecs_sum"), diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py index 34e4e994cb3c..a4056404f08b 100644 --- a/test_runner/regress/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -19,6 +19,7 @@ from fixtures.utils import query_scalar from performance.test_perf_pgbench import get_scales_matrix from requests import RequestException +from requests.exceptions import RetryError # Test branch creation @@ -176,11 +177,8 @@ def start_creating_timeline(): env.neon_cli.mappings_map_branch(initial_branch, env.initial_tenant, env.initial_timeline) - with pytest.raises(RuntimeError, match="ERROR: Not found: Timeline"): - env.endpoints.create_start( - initial_branch, tenant_id=env.initial_tenant, basebackup_request_tries=2 - ) - ps_http.configure_failpoints(("before-upload-index-pausable", "off")) + with pytest.raises(RuntimeError, match="is not active, state: Loading"): + env.endpoints.create_start(initial_branch, tenant_id=env.initial_tenant) finally: env.pageserver.stop(immediate=True) @@ -221,10 +219,7 @@ def start_creating_timeline(): branch_id = TimelineId.generate() - with pytest.raises( - PageserverApiException, - match="Cannot branch off the timeline that's not present in pageserver", - ): + with pytest.raises(RetryError, match="too many 503 error responses"): ps_http.timeline_create( env.pg_version, env.initial_tenant, diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 76a42ef4a2a2..52b6b254aa33 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -784,54 +784,6 @@ def create_in_background(): create_thread.join() -def test_paused_upload_stalls_checkpoint( - neon_env_builder: NeonEnvBuilder, -): - """ - This test checks that checkpoints block on uploads to remote storage. - """ - neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) - - env = neon_env_builder.init_start( - initial_tenant_conf={ - # Set a small compaction threshold - "compaction_threshold": "3", - # Disable GC - "gc_period": "0s", - # disable PITR - "pitr_interval": "0s", - } - ) - - env.pageserver.allowed_errors.append( - f".*PUT.* path=/v1/tenant/{env.initial_tenant}/timeline.* request was dropped before completing" - ) - - tenant_id = env.initial_tenant - timeline_id = env.initial_timeline - - client = env.pageserver.http_client() - layers_at_creation = client.layer_map_info(tenant_id, timeline_id) - deltas_at_creation = len(layers_at_creation.delta_layers()) - assert ( - deltas_at_creation == 1 - ), "are you fixing #5863? make sure we end up with 2 deltas at the end of endpoint lifecycle" - - # Make new layer uploads get stuck. - # Note that timeline creation waits for the initial layers to reach remote storage. 
- # So at this point, the `layers_at_creation` are in remote storage. - client.configure_failpoints(("before-upload-layer-pausable", "pause")) - - with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: - # Build two tables with some data inside - endpoint.safe_psql("CREATE TABLE foo AS SELECT x FROM generate_series(1, 10000) g(x)") - wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) - - with pytest.raises(ReadTimeout): - client.timeline_checkpoint(tenant_id, timeline_id, timeout=5) - client.configure_failpoints(("before-upload-layer-pausable", "off")) - - def wait_upload_queue_empty( client: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId ): From 117c1b5ddec110677f06dbc769712a722b55f2d3 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 16 Dec 2024 08:03:53 +0200 Subject: [PATCH 13/56] Do not perform prefetch for temp relations (#10146) ## Problem See https://neondb.slack.com/archives/C04DGM6SMTM/p1734002916827019 With recent prefetch fixes for pg17 and `effective_io_concurrency=100` pg_regress test stats.sql is failed when set temp_buffers to 100. Stream API will try to lock all this 100 buffers for prefetch. ## Summary of changes Disable such behaviour for temp relations. Postgres PR: https://github.com/neondatabase/postgres/pull/548 Co-authored-by: Konstantin Knizhnik --- vendor/postgres-v17 | 2 +- vendor/revisions.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index 010c0ea2eb06..65c4e46baf56 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit 010c0ea2eb06afe76485a33c43954cbcf3d99f86 +Subproject commit 65c4e46baf56ec05412c7dd63d62faff0b33dcfb diff --git a/vendor/revisions.json b/vendor/revisions.json index afcb922a5efc..c8db81c73f7e 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,7 +1,7 @@ { "v17": [ "17.2", - "010c0ea2eb06afe76485a33c43954cbcf3d99f86" + "65c4e46baf56ec05412c7dd63d62faff0b33dcfb" ], "v16": [ "16.6", From ebcbc1a4822c566a865ca2dd9138e5af2838c794 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 16 Dec 2024 10:06:08 +0000 Subject: [PATCH 14/56] pageserver: tighten up code around SLRU dir key handling (#10082) ## Problem Changes in #9786 were functionally complete but missed some edges that made testing less robust than it should have been: - `is_key_disposable` didn't consider SLRU dir keys disposable - Timeline `init_empty` was always creating SLRU dir keys on all shards The result was that when we had a bug (https://github.com/neondatabase/neon/pull/10080), it wasn't apparent in tests, because one would only encounter the issue if running on a long-lived timeline with enough compaction to drop the initially created empty SLRU dir keys, _and_ some CLog truncation going on. Closes: https://github.com/neondatabase/cloud/issues/21516 ## Summary of changes - Update is_key_global and init_empty to handle SLRU dir keys properly -- the only functional impact is that we avoid writing some spurious keys in shards >0, but this makes testing much more robust. - Make `test_clog_truncate` explicitly use a sharded tenant The net result is that if one reverts #10080, then tests fail (i.e. 
this PR is a reproducer for the issue) --- libs/pageserver_api/src/key.rs | 4 ++++ libs/pageserver_api/src/shard.rs | 6 ++++- pageserver/src/pgdatadir_mapping.rs | 29 +++++++++++++---------- test_runner/regress/test_clog_truncate.py | 24 ++++++++++++------- 4 files changed, 42 insertions(+), 21 deletions(-) diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 373329c9b464..f0cd713c38b6 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -565,6 +565,10 @@ impl Key { && self.field5 == 0 && self.field6 == u32::MAX } + + pub fn is_slru_dir_key(&self) -> bool { + slru_dir_kind(self).is_some() + } } #[inline(always)] diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index cf0cd3a46b88..4cc0a739e871 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -173,7 +173,11 @@ impl ShardIdentity { /// Return true if the key should be stored on all shards, not just one. pub fn is_key_global(&self, key: &Key) -> bool { - if key.is_slru_block_key() || key.is_slru_segment_size_key() || key.is_aux_file_key() { + if key.is_slru_block_key() + || key.is_slru_segment_size_key() + || key.is_aux_file_key() + || key.is_slru_dir_key() + { // Special keys that are only stored on shard 0 false } else if key.is_rel_block_key() { diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 255bd01e259e..3eaecd3a08f5 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1319,18 +1319,23 @@ impl<'a> DatadirModification<'a> { let buf: Bytes = SlruSegmentDirectory::ser(&SlruSegmentDirectory::default())?.into(); let empty_dir = Value::Image(buf); - self.put(slru_dir_to_key(SlruKind::Clog), empty_dir.clone()); - self.pending_directory_entries - .push((DirectoryKind::SlruSegment(SlruKind::Clog), 0)); - self.put( - slru_dir_to_key(SlruKind::MultiXactMembers), - empty_dir.clone(), - ); - self.pending_directory_entries - .push((DirectoryKind::SlruSegment(SlruKind::Clog), 0)); - self.put(slru_dir_to_key(SlruKind::MultiXactOffsets), empty_dir); - self.pending_directory_entries - .push((DirectoryKind::SlruSegment(SlruKind::MultiXactOffsets), 0)); + + // Initialize SLRUs on shard 0 only: creating these on other shards would be + // harmless but they'd just be dropped on later compaction. 
+ if self.tline.tenant_shard_id.is_shard_zero() { + self.put(slru_dir_to_key(SlruKind::Clog), empty_dir.clone()); + self.pending_directory_entries + .push((DirectoryKind::SlruSegment(SlruKind::Clog), 0)); + self.put( + slru_dir_to_key(SlruKind::MultiXactMembers), + empty_dir.clone(), + ); + self.pending_directory_entries + .push((DirectoryKind::SlruSegment(SlruKind::Clog), 0)); + self.put(slru_dir_to_key(SlruKind::MultiXactOffsets), empty_dir); + self.pending_directory_entries + .push((DirectoryKind::SlruSegment(SlruKind::MultiXactOffsets), 0)); + } Ok(()) } diff --git a/test_runner/regress/test_clog_truncate.py b/test_runner/regress/test_clog_truncate.py index 10027ce6891b..2ae38e6d8887 100644 --- a/test_runner/regress/test_clog_truncate.py +++ b/test_runner/regress/test_clog_truncate.py @@ -1,18 +1,19 @@ from __future__ import annotations import os -import time from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv -from fixtures.utils import query_scalar +from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.utils import query_scalar, wait_until # # Test compute node start after clog truncation # -def test_clog_truncate(neon_simple_env: NeonEnv): - env = neon_simple_env +def test_clog_truncate(neon_env_builder: NeonEnvBuilder): + # Use a multi-sharded tenant because WAL ingest logic is shard-dependent, and + # this test is one of the very few that exercises a CLogTruncate WAL record. + env = neon_env_builder.init_start(initial_tenant_shard_count=2) # set aggressive autovacuum to make sure that truncation will happen config = [ @@ -31,6 +32,7 @@ def test_clog_truncate(neon_simple_env: NeonEnv): endpoint.safe_psql("CREATE EXTENSION neon_test_utils") # Consume many xids to advance clog + log.info("Consuming xids...") with endpoint.cursor() as cur: cur.execute("select test_consume_xids(1000*1000*10);") log.info("xids consumed") @@ -47,11 +49,17 @@ def test_clog_truncate(neon_simple_env: NeonEnv): pg_xact_0000_path = os.path.join(endpoint.pg_xact_dir_path(), "0000") log.info(f"pg_xact_0000_path = {pg_xact_0000_path}") - while os.path.isfile(pg_xact_0000_path): - log.info(f"file exists. wait for truncation: {pg_xact_0000_path=}") - time.sleep(5) + def assert_file_removed(): + exists = os.path.isfile(pg_xact_0000_path) + if exists: + log.info(f"file exists. wait for truncation: {pg_xact_0000_path=}") + assert not exists + + log.info("Waiting for truncation...") + wait_until(assert_file_removed) # checkpoint to advance latest lsn + log.info("Checkpointing...") with endpoint.cursor() as cur: cur.execute("CHECKPOINT;") lsn_after_truncation = query_scalar(cur, "select pg_current_wal_insert_lsn()") From 24d658791410d81d9b854ba1c8036068d6467bc2 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 16 Dec 2024 11:15:25 +0000 Subject: [PATCH 15/56] chore(proxy): refactor self-signed config (#10154) ## Problem While reviewing #10152 I found it tricky to actually determine whether the connection used `allow_self_signed_compute` or not. I've tried to remove this setting in the past: * https://github.com/neondatabase/neon/pull/7884 * https://github.com/neondatabase/neon/pull/7437 * https://github.com/neondatabase/cloud/pull/13702 But each time it seems it is used by e2e tests ## Summary of changes The `node_info.allow_self_signed_computes` is always initialised to false, and then sometimes inherits the proxy config value. There's no need this needs to be in the node_info, so removing it and propagating it via `TcpMechansim` is simpler. 
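In miniature, the shape of the change looks like the sketch below — hypothetical, simplified types, not the proxy's real `NodeInfo`/`TcpMechanism` definitions: the testing-only flag moves off the cached per-node data and onto the mechanism that actually performs the connect, where it is set once from config.

```rust
// Simplified stand-ins for illustration only; the real structs carry much more state.
#[allow(dead_code)]
struct NodeInfoBefore {
    host: String,
    allow_self_signed_compute: bool, // had to be initialised at every construction site
}

// After the refactor: per-node data no longer carries the flag...
struct NodeInfo {
    host: String,
}

// ...it lives on the connect mechanism instead, set once from proxy config.
struct TcpMechanism {
    allow_self_signed_compute: bool,
}

impl TcpMechanism {
    fn connect(&self, node: &NodeInfo) {
        println!(
            "connecting to {} (self-signed certs allowed: {})",
            node.host, self.allow_self_signed_compute
        );
    }
}

fn main() {
    let node = NodeInfo { host: "compute-1.local".into() };
    let mechanism = TcpMechanism { allow_self_signed_compute: false };
    mechanism.connect(&node);
}
```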
--- proxy/src/auth/backend/console_redirect.rs | 1 - proxy/src/auth/backend/local.rs | 1 - proxy/src/console_redirect_proxy.rs | 2 +- proxy/src/control_plane/client/cplane_proxy_v1.rs | 1 - proxy/src/control_plane/client/mock.rs | 1 - proxy/src/control_plane/mod.rs | 13 +++---------- proxy/src/proxy/connect_compute.rs | 11 ++++++++--- proxy/src/proxy/mod.rs | 2 +- proxy/src/proxy/tests/mod.rs | 14 ++++++-------- proxy/src/redis/notifications.rs | 2 +- proxy/src/serverless/backend.rs | 2 -- 11 files changed, 20 insertions(+), 30 deletions(-) diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs index 575d60be8559..c3de77b35278 100644 --- a/proxy/src/auth/backend/console_redirect.rs +++ b/proxy/src/auth/backend/console_redirect.rs @@ -187,7 +187,6 @@ async fn authenticate( NodeInfo { config, aux: db_info.aux, - allow_self_signed_compute: false, // caller may override }, db_info.allowed_ips, )) diff --git a/proxy/src/auth/backend/local.rs b/proxy/src/auth/backend/local.rs index d4273fb52167..d10f0e82b283 100644 --- a/proxy/src/auth/backend/local.rs +++ b/proxy/src/auth/backend/local.rs @@ -37,7 +37,6 @@ impl LocalBackend { branch_id: BranchIdTag::get_interner().get_or_intern("local"), cold_start_info: ColdStartInfo::WarmCached, }, - allow_self_signed_compute: false, }, } } diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index 65702e0e4c7a..02398fb7778c 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -213,9 +213,9 @@ pub(crate) async fn handle_client( params_compat: true, params: ¶ms, locks: &config.connect_compute_locks, + allow_self_signed_compute: config.allow_self_signed_compute, }, &user_info, - config.allow_self_signed_compute, config.wake_compute_retry_config, config.connect_to_compute_retry_config, ) diff --git a/proxy/src/control_plane/client/cplane_proxy_v1.rs b/proxy/src/control_plane/client/cplane_proxy_v1.rs index e33a37f64366..00038a6ac6a1 100644 --- a/proxy/src/control_plane/client/cplane_proxy_v1.rs +++ b/proxy/src/control_plane/client/cplane_proxy_v1.rs @@ -250,7 +250,6 @@ impl NeonControlPlaneClient { let node = NodeInfo { config, aux: body.aux, - allow_self_signed_compute: false, }; Ok(node) diff --git a/proxy/src/control_plane/client/mock.rs b/proxy/src/control_plane/client/mock.rs index eaf692ab279b..93edd6547693 100644 --- a/proxy/src/control_plane/client/mock.rs +++ b/proxy/src/control_plane/client/mock.rs @@ -174,7 +174,6 @@ impl MockControlPlane { branch_id: (&BranchId::from("branch")).into(), cold_start_info: crate::control_plane::messages::ColdStartInfo::Warm, }, - allow_self_signed_compute: false, }; Ok(node) diff --git a/proxy/src/control_plane/mod.rs b/proxy/src/control_plane/mod.rs index 41972e4e44d0..c0718920b493 100644 --- a/proxy/src/control_plane/mod.rs +++ b/proxy/src/control_plane/mod.rs @@ -67,28 +67,21 @@ pub(crate) struct NodeInfo { /// Labels for proxy's metrics. 
pub(crate) aux: MetricsAuxInfo, - - /// Whether we should accept self-signed certificates (for testing) - pub(crate) allow_self_signed_compute: bool, } impl NodeInfo { pub(crate) async fn connect( &self, ctx: &RequestContext, + allow_self_signed_compute: bool, timeout: Duration, ) -> Result { self.config - .connect( - ctx, - self.allow_self_signed_compute, - self.aux.clone(), - timeout, - ) + .connect(ctx, allow_self_signed_compute, self.aux.clone(), timeout) .await } + pub(crate) fn reuse_settings(&mut self, other: Self) { - self.allow_self_signed_compute = other.allow_self_signed_compute; self.config.reuse_password(other.config); } diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index a3027abd7cae..6da4c90a535b 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -73,6 +73,9 @@ pub(crate) struct TcpMechanism<'a> { /// connect_to_compute concurrency lock pub(crate) locks: &'static ApiLocks, + + /// Whether we should accept self-signed certificates (for testing) + pub(crate) allow_self_signed_compute: bool, } #[async_trait] @@ -90,7 +93,11 @@ impl ConnectMechanism for TcpMechanism<'_> { ) -> Result { let host = node_info.config.get_host(); let permit = self.locks.get_permit(&host).await?; - permit.release_result(node_info.connect(ctx, timeout).await) + permit.release_result( + node_info + .connect(ctx, self.allow_self_signed_compute, timeout) + .await, + ) } fn update_connect_config(&self, config: &mut compute::ConnCfg) { @@ -104,7 +111,6 @@ pub(crate) async fn connect_to_compute Result @@ -117,7 +123,6 @@ where wake_compute(&mut num_retries, ctx, user_info, wake_compute_retry_config).await?; node_info.set_keys(user_info.get_keys()); - node_info.allow_self_signed_compute = allow_self_signed_compute; mechanism.update_connect_config(&mut node_info.config); // try once diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index cc04bc5e5ce9..de0ec0f79998 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -355,9 +355,9 @@ pub(crate) async fn handle_client( params_compat, params: ¶ms, locks: &config.connect_compute_locks, + allow_self_signed_compute: mode.allow_self_signed_compute(config), }, &user_info, - mode.allow_self_signed_compute(config), config.wake_compute_retry_config, config.connect_to_compute_retry_config, ) diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 911b349416f2..3899ba4267ca 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -553,7 +553,6 @@ fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeIn branch_id: (&BranchId::from("branch")).into(), cold_start_info: crate::control_plane::messages::ColdStartInfo::Warm, }, - allow_self_signed_compute: false, }; let (_, node2) = cache.insert_unit("key".into(), Ok(node.clone())); node2.map(|()| node) @@ -588,7 +587,7 @@ async fn connect_to_compute_success() { max_retries: 5, backoff_factor: 2.0, }; - connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) + connect_to_compute(&ctx, &mechanism, &user_info, config, config) .await .unwrap(); mechanism.verify(); @@ -606,7 +605,7 @@ async fn connect_to_compute_retry() { max_retries: 5, backoff_factor: 2.0, }; - connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) + connect_to_compute(&ctx, &mechanism, &user_info, config, config) .await .unwrap(); mechanism.verify(); @@ -625,7 +624,7 @@ async fn connect_to_compute_non_retry_1() { max_retries: 5, backoff_factor: 2.0, }; - 
connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) + connect_to_compute(&ctx, &mechanism, &user_info, config, config) .await .unwrap_err(); mechanism.verify(); @@ -644,7 +643,7 @@ async fn connect_to_compute_non_retry_2() { max_retries: 5, backoff_factor: 2.0, }; - connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) + connect_to_compute(&ctx, &mechanism, &user_info, config, config) .await .unwrap(); mechanism.verify(); @@ -674,7 +673,6 @@ async fn connect_to_compute_non_retry_3() { &ctx, &mechanism, &user_info, - false, wake_compute_retry_config, connect_to_compute_retry_config, ) @@ -696,7 +694,7 @@ async fn wake_retry() { max_retries: 5, backoff_factor: 2.0, }; - connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) + connect_to_compute(&ctx, &mechanism, &user_info, config, config) .await .unwrap(); mechanism.verify(); @@ -715,7 +713,7 @@ async fn wake_non_retry() { max_retries: 5, backoff_factor: 2.0, }; - connect_to_compute(&ctx, &mechanism, &user_info, false, config, config) + connect_to_compute(&ctx, &mechanism, &user_info, config, config) .await .unwrap_err(); mechanism.verify(); diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index f3aa97c03284..d18dfd246556 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -6,6 +6,7 @@ use pq_proto::CancelKeyData; use redis::aio::PubSub; use serde::{Deserialize, Serialize}; use tokio_util::sync::CancellationToken; +use tracing::Instrument; use uuid::Uuid; use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; @@ -13,7 +14,6 @@ use crate::cache::project_info::ProjectInfoCache; use crate::cancellation::{CancelMap, CancellationHandler}; use crate::intern::{ProjectIdInt, RoleNameInt}; use crate::metrics::{Metrics, RedisErrors, RedisEventsCount}; -use tracing::Instrument; const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates"; pub(crate) const PROXY_CHANNEL_NAME: &str = "neondb-proxy-to-proxy-updates"; diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 251aa470843d..15d883bdb008 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -195,7 +195,6 @@ impl PoolingBackend { locks: &self.config.connect_compute_locks, }, &backend, - false, // do not allow self signed compute for http flow self.config.wake_compute_retry_config, self.config.connect_to_compute_retry_config, ) @@ -237,7 +236,6 @@ impl PoolingBackend { locks: &self.config.connect_compute_locks, }, &backend, - false, // do not allow self signed compute for http flow self.config.wake_compute_retry_config, self.config.connect_to_compute_retry_config, ) From 1ed0e52bc837fd8eb356847e452444b5df1f4b0b Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 16 Dec 2024 15:07:24 +0300 Subject: [PATCH 16/56] Extract safekeeper http client to separate crate. (#10140) ## Problem We want to use safekeeper http client in storage controller and neon_local. ## Summary of changes Extract it to separate crate. No functional changes. 
--- Cargo.lock | 13 +++++++++++++ Cargo.toml | 2 ++ safekeeper/Cargo.toml | 1 + safekeeper/client/Cargo.toml | 13 +++++++++++++ safekeeper/client/src/lib.rs | 1 + .../{src/http/client.rs => client/src/mgmt_api.rs} | 4 ---- safekeeper/src/http/mod.rs | 1 - safekeeper/src/pull_timeline.rs | 5 +++-- 8 files changed, 33 insertions(+), 7 deletions(-) create mode 100644 safekeeper/client/Cargo.toml create mode 100644 safekeeper/client/src/lib.rs rename safekeeper/{src/http/client.rs => client/src/mgmt_api.rs} (96%) diff --git a/Cargo.lock b/Cargo.lock index c4f80f63c9be..d1f77469699e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5535,6 +5535,7 @@ dependencies = [ "remote_storage", "reqwest", "safekeeper_api", + "safekeeper_client", "scopeguard", "sd-notify", "serde", @@ -5572,6 +5573,18 @@ dependencies = [ "utils", ] +[[package]] +name = "safekeeper_client" +version = "0.1.0" +dependencies = [ + "reqwest", + "safekeeper_api", + "serde", + "thiserror", + "utils", + "workspace_hack", +] + [[package]] name = "same-file" version = "1.0.6" diff --git a/Cargo.toml b/Cargo.toml index 0654c25a3d67..056cd5798f84 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,6 +11,7 @@ members = [ "pageserver/pagebench", "proxy", "safekeeper", + "safekeeper/client", "storage_broker", "storage_controller", "storage_controller/client", @@ -233,6 +234,7 @@ postgres_initdb = { path = "./libs/postgres_initdb" } pq_proto = { version = "0.1", path = "./libs/pq_proto/" } remote_storage = { version = "0.1", path = "./libs/remote_storage/" } safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" } +safekeeper_client = { path = "./safekeeper/client" } desim = { version = "0.1", path = "./libs/desim" } storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main broker code is inside the binary crate, so linking with the library shouldn't be heavy. storage_controller_client = { path = "./storage_controller/client" } diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 0422c46ab10c..086407603f80 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -55,6 +55,7 @@ postgres_ffi.workspace = true pq_proto.workspace = true remote_storage.workspace = true safekeeper_api.workspace = true +safekeeper_client.workspace = true sha2.workspace = true sd-notify.workspace = true storage_broker.workspace = true diff --git a/safekeeper/client/Cargo.toml b/safekeeper/client/Cargo.toml new file mode 100644 index 000000000000..6c5a52de3acf --- /dev/null +++ b/safekeeper/client/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "safekeeper_client" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[dependencies] +safekeeper_api.workspace = true +thiserror.workspace = true +reqwest = { workspace = true, features = [ "stream" ] } +serde.workspace = true +utils.workspace = true +workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/safekeeper/client/src/lib.rs b/safekeeper/client/src/lib.rs new file mode 100644 index 000000000000..3963fd466cc8 --- /dev/null +++ b/safekeeper/client/src/lib.rs @@ -0,0 +1 @@ +pub mod mgmt_api; diff --git a/safekeeper/src/http/client.rs b/safekeeper/client/src/mgmt_api.rs similarity index 96% rename from safekeeper/src/http/client.rs rename to safekeeper/client/src/mgmt_api.rs index 669a9c0ce94b..f78745043a35 100644 --- a/safekeeper/src/http/client.rs +++ b/safekeeper/client/src/mgmt_api.rs @@ -2,10 +2,6 @@ //! //! Partially copied from pageserver client; some parts might be better to be //! united. -//! -//! 
It would be also good to move it out to separate crate, but this needs -//! duplication of internal-but-reported structs like WalSenderState, ServerInfo -//! etc. use reqwest::{IntoUrl, Method, StatusCode}; use safekeeper_api::models::TimelineStatus; diff --git a/safekeeper/src/http/mod.rs b/safekeeper/src/http/mod.rs index 7229ccb7390b..d82a713f8a93 100644 --- a/safekeeper/src/http/mod.rs +++ b/safekeeper/src/http/mod.rs @@ -1,4 +1,3 @@ -pub mod client; pub mod routes; pub use routes::make_router; diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index 00777273cbf9..f2d8e4c85fd7 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -5,6 +5,8 @@ use chrono::{DateTime, Utc}; use futures::{SinkExt, StreamExt, TryStreamExt}; use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI}; use safekeeper_api::{models::TimelineStatus, Term}; +use safekeeper_client::mgmt_api; +use safekeeper_client::mgmt_api::Client; use serde::{Deserialize, Serialize}; use std::{ cmp::min, @@ -22,7 +24,6 @@ use tracing::{error, info, instrument}; use crate::{ control_file::CONTROL_FILE_NAME, debug_dump, - http::client::{self, Client}, state::{EvictionState, TimelinePersistentState}, timeline::{Timeline, WalResidentTimeline}, timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline}, @@ -419,7 +420,7 @@ pub async fn handle_request( let http_hosts = request.http_hosts.clone(); // Figure out statuses of potential donors. - let responses: Vec> = + let responses: Vec> = futures::future::join_all(http_hosts.iter().map(|url| async { let cclient = Client::new(url.clone(), sk_auth_token.clone()); let info = cclient From c5e3314c6e3a3d693310626a4c9c8deec34931c3 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 16 Dec 2024 17:53:04 +0300 Subject: [PATCH 17/56] Add test restarting compute at WAL page boundary (#10111) ## Problem We've had similar test in test_logical_replication, but then removed it because it wasn't needed to trigger LR related bug. Restarting at WAL page boundary is still a useful test, so add it separately back. ## Summary of changes Add the test. --- test_runner/regress/test_wal_acceptor.py | 56 ++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 23d4f23cdb84..0a8900b351e4 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -1090,6 +1090,62 @@ def test_restart_endpoint_after_switch_wal(neon_env_builder: NeonEnvBuilder): endpoint.safe_psql("SELECT 'works'") +# Test restarting compute at WAL page boundary. +def test_restart_endpoint_wal_page_boundary(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + + ep = env.endpoints.create_start("main") + ep.safe_psql("create table t (i int)") + + with ep.cursor() as cur: + # measure how much space logical message takes. Sometimes first attempt + # creates huge message and then it stabilizes, have no idea why. + for _ in range(3): + lsn_before = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) + log.info(f"current_lsn={lsn_before}") + # Non-transactional logical message doesn't write WAL, only XLogInsert's + # it, so use transactional. Which is a bit problematic as transactional + # necessitates commit record. 
Alternatively we can do smth like + # select neon_xlogflush(pg_current_wal_insert_lsn()); + # but isn't much better + that particular call complains on 'xlog flush + # request 0/282C018 is not satisfied' as pg_current_wal_insert_lsn skips + # page headers. + payload = "blahblah" + cur.execute(f"select pg_logical_emit_message(true, 'pref', '{payload}')") + lsn_after_by_curr_wal_lsn = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) + lsn_diff = lsn_after_by_curr_wal_lsn - lsn_before + logical_message_base = lsn_after_by_curr_wal_lsn - lsn_before - len(payload) + log.info( + f"before {lsn_before}, after {lsn_after_by_curr_wal_lsn}, lsn diff is {lsn_diff}, base {logical_message_base}" + ) + + # and write logical message spanning exactly as we want + lsn_before = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) + log.info(f"current_lsn={lsn_before}") + curr_lsn = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) + offs = int(curr_lsn) % 8192 + till_page = 8192 - offs + target_lsn = curr_lsn + till_page + payload_len = ( + till_page - logical_message_base - 8 + ) # not sure why 8 is here, it is deduced from experiments + log.info( + f"current_lsn={curr_lsn}, offs {offs}, till_page {till_page}, target_lsn {target_lsn}" + ) + + cur.execute(f"select pg_logical_emit_message(true, 'pref', 'f{'a' * payload_len}')") + supposedly_contrecord_end = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) + log.info(f"supposedly_page_boundary={supposedly_contrecord_end}") + # The calculations to hit the page boundary are very fuzzy, so just + # ignore test if we fail to reach it. + if not (int(supposedly_contrecord_end) % 8192 == 0): + pytest.skip(f"missed page boundary, bad luck: lsn is {supposedly_contrecord_end}") + + ep.stop(mode="immediate") + ep = env.endpoints.create_start("main") + ep.safe_psql("insert into t values (42)") # should be ok + + # Context manager which logs passed time on exit. 
class DurationLogger: def __init__(self, desc): From 6565fd4056e0c040f26cc593a03561fe775595ff Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 16 Dec 2024 15:33:21 +0000 Subject: [PATCH 18/56] chore: fix clippy lints 2024-12-06 (#10138) --- libs/desim/src/time.rs | 2 +- .../wal_craft/src/xlog_utils_test.rs | 2 +- .../proxy/tokio-postgres2/src/to_statement.rs | 2 +- libs/remote_storage/src/azure_blob.rs | 4 +-- libs/remote_storage/src/lib.rs | 4 +-- libs/remote_storage/src/local_fs.rs | 4 +-- libs/remote_storage/src/s3_bucket.rs | 4 +-- libs/remote_storage/src/simulate_failures.rs | 4 +-- pageserver/compaction/src/compact_tiered.rs | 2 +- pageserver/compaction/src/identify_levels.rs | 5 ++-- pageserver/compaction/src/interface.rs | 2 +- pageserver/compaction/src/simulator.rs | 2 +- pageserver/src/basebackup.rs | 2 +- pageserver/src/pgdatadir_mapping.rs | 8 +++--- pageserver/src/tenant/blob_io.rs | 2 +- pageserver/src/tenant/block_io.rs | 2 +- pageserver/src/tenant/disk_btree.rs | 2 +- pageserver/src/tenant/ephemeral_file.rs | 6 ++--- pageserver/src/tenant/layer_map.rs | 4 +-- .../tenant/remote_timeline_client/download.rs | 4 +-- .../tenant/remote_timeline_client/upload.rs | 4 +-- pageserver/src/tenant/storage_layer.rs | 5 +--- .../src/tenant/storage_layer/delta_layer.rs | 4 +-- .../src/tenant/storage_layer/image_layer.rs | 2 +- .../inmemory_layer/vectored_dio_read.rs | 26 +++++++++---------- pageserver/src/tenant/timeline.rs | 2 +- pageserver/src/tenant/timeline/compaction.rs | 4 +-- safekeeper/src/receive_wal.rs | 2 +- safekeeper/src/safekeeper.rs | 5 +--- storage_controller/src/service.rs | 5 +--- 30 files changed, 58 insertions(+), 68 deletions(-) diff --git a/libs/desim/src/time.rs b/libs/desim/src/time.rs index 7bb71db95cf4..7ce605bda850 100644 --- a/libs/desim/src/time.rs +++ b/libs/desim/src/time.rs @@ -91,7 +91,7 @@ impl Timing { /// Return true if there is a ready event. fn is_event_ready(&self, queue: &mut BinaryHeap) -> bool { - queue.peek().map_or(false, |x| x.time <= self.now()) + queue.peek().is_some_and(|x| x.time <= self.now()) } /// Clear all pending events. 
diff --git a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs index 9eb3f0e95abf..4a33dbe25b57 100644 --- a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs +++ b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs @@ -81,7 +81,7 @@ fn test_end_of_wal(test_name: &str) { continue; } let mut f = File::options().write(true).open(file.path()).unwrap(); - const ZEROS: [u8; WAL_SEGMENT_SIZE] = [0u8; WAL_SEGMENT_SIZE]; + static ZEROS: [u8; WAL_SEGMENT_SIZE] = [0u8; WAL_SEGMENT_SIZE]; f.write_all( &ZEROS[0..min( WAL_SEGMENT_SIZE, diff --git a/libs/proxy/tokio-postgres2/src/to_statement.rs b/libs/proxy/tokio-postgres2/src/to_statement.rs index 427f77dd79b2..7e12992728dd 100644 --- a/libs/proxy/tokio-postgres2/src/to_statement.rs +++ b/libs/proxy/tokio-postgres2/src/to_statement.rs @@ -11,7 +11,7 @@ mod private { Query(&'a str), } - impl<'a> ToStatementType<'a> { + impl ToStatementType<'_> { pub async fn into_statement(self, client: &Client) -> Result { match self { ToStatementType::Statement(s) => Ok(s.clone()), diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index 32c51bc2add5..19c8251ccd37 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -544,9 +544,9 @@ impl RemoteStorage for AzureBlobStorage { .await } - async fn delete_objects<'a>( + async fn delete_objects( &self, - paths: &'a [RemotePath], + paths: &[RemotePath], cancel: &CancellationToken, ) -> anyhow::Result<()> { let kind = RequestKind::Delete; diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 2a3468f98685..7a864151ecef 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -341,9 +341,9 @@ pub trait RemoteStorage: Send + Sync + 'static { /// If the operation fails because of timeout or cancellation, the root cause of the error will be /// set to `TimeoutOrCancel`. In such situation it is unknown which deletions, if any, went /// through. 
- async fn delete_objects<'a>( + async fn delete_objects( &self, - paths: &'a [RemotePath], + paths: &[RemotePath], cancel: &CancellationToken, ) -> anyhow::Result<()>; diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 1a2d421c6618..a8b00173ba51 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -562,9 +562,9 @@ impl RemoteStorage for LocalFs { } } - async fn delete_objects<'a>( + async fn delete_objects( &self, - paths: &'a [RemotePath], + paths: &[RemotePath], cancel: &CancellationToken, ) -> anyhow::Result<()> { for path in paths { diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 2891f92d0796..d3f19f0b119a 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -813,9 +813,9 @@ impl RemoteStorage for S3Bucket { .await } - async fn delete_objects<'a>( + async fn delete_objects( &self, - paths: &'a [RemotePath], + paths: &[RemotePath], cancel: &CancellationToken, ) -> anyhow::Result<()> { let kind = RequestKind::Delete; diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs index 51833c1fe658..63c24beb516d 100644 --- a/libs/remote_storage/src/simulate_failures.rs +++ b/libs/remote_storage/src/simulate_failures.rs @@ -181,9 +181,9 @@ impl RemoteStorage for UnreliableWrapper { self.delete_inner(path, true, cancel).await } - async fn delete_objects<'a>( + async fn delete_objects( &self, - paths: &'a [RemotePath], + paths: &[RemotePath], cancel: &CancellationToken, ) -> anyhow::Result<()> { self.attempt(RemoteOp::DeleteObjects(paths.to_vec()))?; diff --git a/pageserver/compaction/src/compact_tiered.rs b/pageserver/compaction/src/compact_tiered.rs index 20f88868f91d..7779ffaf8b8d 100644 --- a/pageserver/compaction/src/compact_tiered.rs +++ b/pageserver/compaction/src/compact_tiered.rs @@ -272,7 +272,7 @@ struct CompactionJob { completed: bool, } -impl<'a, E> LevelCompactionState<'a, E> +impl LevelCompactionState<'_, E> where E: CompactionJobExecutor, { diff --git a/pageserver/compaction/src/identify_levels.rs b/pageserver/compaction/src/identify_levels.rs index 1853afffdd9d..e04bd153960f 100644 --- a/pageserver/compaction/src/identify_levels.rs +++ b/pageserver/compaction/src/identify_levels.rs @@ -224,9 +224,8 @@ impl Level { } // recalculate depth if this was the last event at this point - let more_events_at_this_key = events_iter - .peek() - .map_or(false, |next_e| next_e.key == e.key); + let more_events_at_this_key = + events_iter.peek().is_some_and(|next_e| next_e.key == e.key); if !more_events_at_this_key { let mut active_depth = 0; for (_end_lsn, is_image, _idx) in active_set.iter().rev() { diff --git a/pageserver/compaction/src/interface.rs b/pageserver/compaction/src/interface.rs index 5bc9b5ca1de9..8ed393a64586 100644 --- a/pageserver/compaction/src/interface.rs +++ b/pageserver/compaction/src/interface.rs @@ -148,7 +148,7 @@ pub trait CompactionDeltaLayer: CompactionLay Self: 'a; /// Return all keys in this delta layer. 
- fn load_keys<'a>( + fn load_keys( &self, ctx: &E::RequestContext, ) -> impl Future>>> + Send; diff --git a/pageserver/compaction/src/simulator.rs b/pageserver/compaction/src/simulator.rs index 776c537d0308..673b80c313d9 100644 --- a/pageserver/compaction/src/simulator.rs +++ b/pageserver/compaction/src/simulator.rs @@ -143,7 +143,7 @@ impl interface::CompactionLayer for Arc { impl interface::CompactionDeltaLayer for Arc { type DeltaEntry<'a> = MockRecord; - async fn load_keys<'a>(&self, _ctx: &MockRequestContext) -> anyhow::Result> { + async fn load_keys(&self, _ctx: &MockRequestContext) -> anyhow::Result> { Ok(self.records.clone()) } } diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index cae0ffb9805b..e1b5676f464b 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -248,7 +248,7 @@ where } } -impl<'a, W> Basebackup<'a, W> +impl Basebackup<'_, W> where W: AsyncWrite + Send + Sync + Unpin, { diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 3eaecd3a08f5..14c7e0d2f86d 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1242,7 +1242,7 @@ pub struct DatadirModification<'a> { pending_metadata_bytes: usize, } -impl<'a> DatadirModification<'a> { +impl DatadirModification<'_> { // When a DatadirModification is committed, we do a monolithic serialization of all its contents. WAL records can // contain multiple pages, so the pageserver's record-based batch size isn't sufficient to bound this allocation: we // additionally specify a limit on how much payload a DatadirModification may contain before it should be committed. @@ -1263,7 +1263,7 @@ impl<'a> DatadirModification<'a> { pub(crate) fn has_dirty_data(&self) -> bool { self.pending_data_batch .as_ref() - .map_or(false, |b| b.has_data()) + .is_some_and(|b| b.has_data()) } /// Set the current lsn @@ -2230,7 +2230,7 @@ impl<'a> DatadirModification<'a> { assert!(!self .pending_data_batch .as_ref() - .map_or(false, |b| b.updates_key(&key))); + .is_some_and(|b| b.updates_key(&key))); } } @@ -2299,7 +2299,7 @@ pub enum Version<'a> { Modified(&'a DatadirModification<'a>), } -impl<'a> Version<'a> { +impl Version<'_> { async fn get( &self, timeline: &Timeline, diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index dd70f6bbff8c..7b55df52a54a 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -35,7 +35,7 @@ pub struct CompressionInfo { pub compressed_size: Option, } -impl<'a> BlockCursor<'a> { +impl BlockCursor<'_> { /// Read a blob into a new buffer. 
pub async fn read_blob( &self, diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs index 2bd7f2d619aa..990211f80a92 100644 --- a/pageserver/src/tenant/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -89,7 +89,7 @@ pub(crate) enum BlockReaderRef<'a> { VirtualFile(&'a VirtualFile), } -impl<'a> BlockReaderRef<'a> { +impl BlockReaderRef<'_> { #[inline(always)] async fn read_blk( &self, diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs index b302cbc97559..c77342b144d5 100644 --- a/pageserver/src/tenant/disk_btree.rs +++ b/pageserver/src/tenant/disk_btree.rs @@ -532,7 +532,7 @@ pub struct DiskBtreeIterator<'a> { >, } -impl<'a> DiskBtreeIterator<'a> { +impl DiskBtreeIterator<'_> { pub async fn next(&mut self) -> Option, u64), DiskBtreeError>> { self.stream.next().await } diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index aaec8a4c313a..ba79672bc79d 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -174,11 +174,11 @@ impl EphemeralFile { } impl super::storage_layer::inmemory_layer::vectored_dio_read::File for EphemeralFile { - async fn read_exact_at_eof_ok<'a, 'b, B: IoBufAlignedMut + Send>( - &'b self, + async fn read_exact_at_eof_ok( + &self, start: u64, dst: tokio_epoll_uring::Slice, - ctx: &'a RequestContext, + ctx: &RequestContext, ) -> std::io::Result<(tokio_epoll_uring::Slice, usize)> { let submitted_offset = self.buffered_writer.bytes_submitted(); diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 7f15baed10f4..1b6924425c25 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -392,8 +392,8 @@ impl LayerMap { image_layer: Option>, end_lsn: Lsn, ) -> Option { - assert!(delta_layer.as_ref().map_or(true, |l| l.is_delta())); - assert!(image_layer.as_ref().map_or(true, |l| !l.is_delta())); + assert!(delta_layer.as_ref().is_none_or(|l| l.is_delta())); + assert!(image_layer.as_ref().is_none_or(|l| !l.is_delta())); match (delta_layer, image_layer) { (None, None) => None, diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index d15f161fb6da..b4d45dca7523 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -145,8 +145,8 @@ pub async fn download_layer_file<'a>( /// /// If Err() is returned, there was some error. The file at `dst_path` has been unlinked. /// The unlinking has _not_ been made durable. -async fn download_object<'a>( - storage: &'a GenericRemoteStorage, +async fn download_object( + storage: &GenericRemoteStorage, src_path: &RemotePath, dst_path: &Utf8PathBuf, #[cfg_attr(target_os = "macos", allow(unused_variables))] gate: &utils::sync::gate::Gate, diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index 0cd5d05aa276..e434d24e5f9c 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -25,8 +25,8 @@ use utils::id::{TenantId, TimelineId}; use tracing::info; /// Serializes and uploads the given index part data to the remote storage. 
-pub(crate) async fn upload_index_part<'a>( - storage: &'a GenericRemoteStorage, +pub(crate) async fn upload_index_part( + storage: &GenericRemoteStorage, tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, generation: Generation, diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 9e3a25cbbc53..b8206fca5a1c 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -345,10 +345,7 @@ impl LayerFringe { } pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayer, KeySpace, Range)> { - let read_desc = match self.planned_visits_by_lsn.pop() { - Some(desc) => desc, - None => return None, - }; + let read_desc = self.planned_visits_by_lsn.pop()?; let removed = self.visit_reads.remove_entry(&read_desc.layer_to_visit_id); diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index fec8a0a16c50..ade1b794c65d 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -1486,7 +1486,7 @@ pub struct ValueRef<'a> { layer: &'a DeltaLayerInner, } -impl<'a> ValueRef<'a> { +impl ValueRef<'_> { /// Loads the value from disk pub async fn load(&self, ctx: &RequestContext) -> Result { let buf = self.load_raw(ctx).await?; @@ -1543,7 +1543,7 @@ pub struct DeltaLayerIterator<'a> { is_end: bool, } -impl<'a> DeltaLayerIterator<'a> { +impl DeltaLayerIterator<'_> { pub(crate) fn layer_dbg_info(&self) -> String { self.delta_layer.layer_dbg_info() } diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 834d1931d00f..0d3c9d5a44ca 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -1052,7 +1052,7 @@ pub struct ImageLayerIterator<'a> { is_end: bool, } -impl<'a> ImageLayerIterator<'a> { +impl ImageLayerIterator<'_> { pub(crate) fn layer_dbg_info(&self) -> String { self.image_layer.layer_dbg_info() } diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs b/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs index a4bb3a6bfc5d..1d86015fab1b 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs @@ -25,11 +25,11 @@ pub trait File: Send { /// [`std::io::ErrorKind::UnexpectedEof`] error if the file is shorter than `start+dst.len()`. /// /// No guarantees are made about the remaining bytes in `dst` in case of a short read. 
- async fn read_exact_at_eof_ok<'a, 'b, B: IoBufAlignedMut + Send>( - &'b self, + async fn read_exact_at_eof_ok( + &self, start: u64, dst: Slice, - ctx: &'a RequestContext, + ctx: &RequestContext, ) -> std::io::Result<(Slice, usize)>; } @@ -479,11 +479,11 @@ mod tests { } impl File for InMemoryFile { - async fn read_exact_at_eof_ok<'a, 'b, B: IoBufMut + Send>( - &'b self, + async fn read_exact_at_eof_ok( + &self, start: u64, mut dst: Slice, - _ctx: &'a RequestContext, + _ctx: &RequestContext, ) -> std::io::Result<(Slice, usize)> { let dst_slice: &mut [u8] = dst.as_mut_rust_slice_full_zeroed(); let nread = { @@ -609,12 +609,12 @@ mod tests { } } - impl<'x> File for RecorderFile<'x> { - async fn read_exact_at_eof_ok<'a, 'b, B: IoBufAlignedMut + Send>( - &'b self, + impl File for RecorderFile<'_> { + async fn read_exact_at_eof_ok( + &self, start: u64, dst: Slice, - ctx: &'a RequestContext, + ctx: &RequestContext, ) -> std::io::Result<(Slice, usize)> { let (dst, nread) = self.file.read_exact_at_eof_ok(start, dst, ctx).await?; self.recorded.borrow_mut().push(RecordedRead { @@ -740,11 +740,11 @@ mod tests { } impl File for MockFile { - async fn read_exact_at_eof_ok<'a, 'b, B: IoBufMut + Send>( - &'b self, + async fn read_exact_at_eof_ok( + &self, start: u64, mut dst: Slice, - _ctx: &'a RequestContext, + _ctx: &RequestContext, ) -> std::io::Result<(Slice, usize)> { let ExpectedRead { expect_pos, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 0416953c1f3a..87f5a0338252 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -5842,7 +5842,7 @@ enum OpenLayerAction { None, } -impl<'a> TimelineWriter<'a> { +impl TimelineWriter<'_> { async fn handle_open_layer_action( &mut self, at: Lsn, diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 5e6290729c0c..8b6cc8ed8413 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -1110,7 +1110,7 @@ impl Timeline { return Err(CompactionError::ShuttingDown); } - let same_key = prev_key.map_or(false, |prev_key| prev_key == key); + let same_key = prev_key == Some(key); // We need to check key boundaries once we reach next key or end of layer with the same key if !same_key || lsn == dup_end_lsn { let mut next_key_size = 0u64; @@ -2904,7 +2904,7 @@ impl CompactionLayer for ResidentDeltaLayer { impl CompactionDeltaLayer for ResidentDeltaLayer { type DeltaEntry<'a> = DeltaEntry<'a>; - async fn load_keys<'a>(&self, ctx: &RequestContext) -> anyhow::Result>> { + async fn load_keys(&self, ctx: &RequestContext) -> anyhow::Result>> { self.0.get_as_delta(ctx).await?.index_entries(ctx).await } } diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 08371177cd24..3e9ce1da8eb8 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -318,7 +318,7 @@ struct NetworkReader<'a, IO> { global_timelines: Arc, } -impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> { +impl NetworkReader<'_, IO> { async fn read_first_message( &mut self, ) -> Result<(WalResidentTimeline, ProposerAcceptorMessage), CopyStreamHandlerEnd> { diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index ccd7940c7212..6ceaf325b049 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -125,10 +125,7 @@ impl TermHistory { ); last_common_idx = Some(i); } - let last_common_idx = match last_common_idx { - None => return 
None, // no common point - Some(lci) => lci, - }; + let last_common_idx = last_common_idx?; // Now find where it ends at both prop and sk and take min. End of // (common) term is the start of the next except it is the last one; // there it is flush_lsn in case of safekeeper or, in case of proposer diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 746177c08978..a89e4741f6ae 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -6873,10 +6873,7 @@ impl Service { let mut plan = Vec::new(); for (node_id, attached) in nodes_by_load { - let available = locked - .nodes - .get(&node_id) - .map_or(false, |n| n.is_available()); + let available = locked.nodes.get(&node_id).is_some_and(|n| n.is_available()); if !available { continue; } From 3d30a7a9348efbb0d1faca6d44dccc1885c5b8c9 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 16 Dec 2024 16:54:47 +0100 Subject: [PATCH 19/56] pageserver: make `RemoteTimelineClient::schedule_index_upload` infallible (#10155) Remove an unnecessary `Result` and address a `FIXME`. --- .../src/tenant/remote_timeline_client.rs | 39 ++++++++----------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 20e0536a00e5..fee11bc742bf 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -749,7 +749,7 @@ impl RemoteTimelineClient { // ahead of what's _actually_ on the remote during index upload. upload_queue.dirty.metadata = metadata.clone(); - self.schedule_index_upload(upload_queue)?; + self.schedule_index_upload(upload_queue); Ok(()) } @@ -770,7 +770,7 @@ impl RemoteTimelineClient { upload_queue.dirty.metadata.apply(update); - self.schedule_index_upload(upload_queue)?; + self.schedule_index_upload(upload_queue); Ok(()) } @@ -809,7 +809,7 @@ impl RemoteTimelineClient { if let Some(archived_at_set) = need_upload_scheduled { let intended_archived_at = archived_at_set.then(|| Utc::now().naive_utc()); upload_queue.dirty.archived_at = intended_archived_at; - self.schedule_index_upload(upload_queue)?; + self.schedule_index_upload(upload_queue); } let need_wait = need_change(&upload_queue.clean.0.archived_at, state).is_some(); @@ -824,7 +824,7 @@ impl RemoteTimelineClient { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; upload_queue.dirty.import_pgdata = state; - self.schedule_index_upload(upload_queue)?; + self.schedule_index_upload(upload_queue); Ok(()) } @@ -843,17 +843,14 @@ impl RemoteTimelineClient { let upload_queue = guard.initialized_mut()?; if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 { - self.schedule_index_upload(upload_queue)?; + self.schedule_index_upload(upload_queue); } Ok(()) } /// Launch an index-file upload operation in the background (internal function) - fn schedule_index_upload( - self: &Arc, - upload_queue: &mut UploadQueueInitialized, - ) -> Result<(), NotInitialized> { + fn schedule_index_upload(self: &Arc, upload_queue: &mut UploadQueueInitialized) { let disk_consistent_lsn = upload_queue.dirty.metadata.disk_consistent_lsn(); // fix up the duplicated field upload_queue.dirty.disk_consistent_lsn = disk_consistent_lsn; @@ -880,7 +877,6 @@ impl RemoteTimelineClient { // Launch the task immediately, if possible self.launch_queued_tasks(upload_queue); - Ok(()) } /// Reparent this timeline to a new parent. 
@@ -909,7 +905,7 @@ impl RemoteTimelineClient { upload_queue.dirty.metadata.reparent(new_parent); upload_queue.dirty.lineage.record_previous_ancestor(&prev); - self.schedule_index_upload(upload_queue)?; + self.schedule_index_upload(upload_queue); Some(self.schedule_barrier0(upload_queue)) } @@ -948,7 +944,7 @@ impl RemoteTimelineClient { assert!(prev.is_none(), "copied layer existed already {layer}"); } - self.schedule_index_upload(upload_queue)?; + self.schedule_index_upload(upload_queue); Some(self.schedule_barrier0(upload_queue)) } @@ -1004,7 +1000,7 @@ impl RemoteTimelineClient { upload_queue.dirty.gc_blocking = current .map(|x| x.with_reason(reason)) .or_else(|| Some(index::GcBlocking::started_now_for(reason))); - self.schedule_index_upload(upload_queue)?; + self.schedule_index_upload(upload_queue); Some(self.schedule_barrier0(upload_queue)) } } @@ -1057,8 +1053,7 @@ impl RemoteTimelineClient { upload_queue.dirty.gc_blocking = current.as_ref().and_then(|x| x.without_reason(reason)); assert!(wanted(upload_queue.dirty.gc_blocking.as_ref())); - // FIXME: bogus ? - self.schedule_index_upload(upload_queue)?; + self.schedule_index_upload(upload_queue); Some(self.schedule_barrier0(upload_queue)) } } @@ -1125,8 +1120,8 @@ impl RemoteTimelineClient { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; - let with_metadata = self - .schedule_unlinking_of_layers_from_index_part0(upload_queue, names.iter().cloned())?; + let with_metadata = + self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names.iter().cloned()); self.schedule_deletion_of_unlinked0(upload_queue, with_metadata); @@ -1153,7 +1148,7 @@ impl RemoteTimelineClient { let names = gc_layers.iter().map(|x| x.layer_desc().layer_name()); - self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names)?; + self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names); self.launch_queued_tasks(upload_queue); @@ -1166,7 +1161,7 @@ impl RemoteTimelineClient { self: &Arc, upload_queue: &mut UploadQueueInitialized, names: I, - ) -> Result, NotInitialized> + ) -> Vec<(LayerName, LayerFileMetadata)> where I: IntoIterator, { @@ -1208,10 +1203,10 @@ impl RemoteTimelineClient { // index_part update, because that needs to be uploaded before we can actually delete the // files. if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 { - self.schedule_index_upload(upload_queue)?; + self.schedule_index_upload(upload_queue); } - Ok(with_metadata) + with_metadata } /// Schedules deletion for layer files which have previously been unlinked from the @@ -1302,7 +1297,7 @@ impl RemoteTimelineClient { let names = compacted_from.iter().map(|x| x.layer_desc().layer_name()); - self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names)?; + self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names); self.launch_queued_tasks(upload_queue); Ok(()) From 2e4c9c570491b6747e65bdeba64dc55f0dec2ea3 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 16 Dec 2024 16:11:39 +0000 Subject: [PATCH 20/56] chore(proxy): remove allow_self_signed from regular proxy (#10157) I noticed that the only place we use this flag is for testing console redirect proxy. Makes sense to me to make this assumption more explicit. 
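To make the new assumption easier to see at a glance, here is a reduced sketch (hypothetical struct and function names, not the proxy's real types) of the resulting split: the regular proxy states the value inline, and only the console redirect path keeps it configurable for its tests.

```rust
// Sketch only, assuming simplified types; the real change is in the
// connect-to-compute parameters shown in the diff below.
struct ConnectParams {
    allow_self_signed_compute: bool,
}

// Regular proxy (TCP / WebSockets): no longer consults the config flag.
fn regular_proxy_params() -> ConnectParams {
    ConnectParams {
        // only used for console redirect testing.
        allow_self_signed_compute: false,
    }
}

// Console redirect proxy: keeps the configured value for its tests.
fn console_redirect_params(configured: bool) -> ConnectParams {
    ConnectParams {
        allow_self_signed_compute: configured,
    }
}
```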
--- proxy/src/proxy/mod.rs | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index de0ec0f79998..5db92d748a5b 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -191,13 +191,6 @@ impl ClientMode { } } - pub(crate) fn allow_self_signed_compute(&self, config: &ProxyConfig) -> bool { - match self { - ClientMode::Tcp => config.allow_self_signed_compute, - ClientMode::Websockets { .. } => false, - } - } - fn hostname<'a, S>(&'a self, s: &'a Stream) -> Option<&'a str> { match self { ClientMode::Tcp => s.sni_hostname(), @@ -355,7 +348,8 @@ pub(crate) async fn handle_client( params_compat, params: ¶ms, locks: &config.connect_compute_locks, - allow_self_signed_compute: mode.allow_self_signed_compute(config), + // only used for console redirect testing. + allow_self_signed_compute: false, }, &user_info, config.wake_compute_retry_config, From 59b7ff89881ac218f35f545ce38f556d08af36dd Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 16 Dec 2024 16:37:15 +0000 Subject: [PATCH 21/56] chore(proxy): disallow unwrap and unimplemented (#10142) As the title says, I updated the lint rules to no longer allow unwrap or unimplemented. Three special cases: * Tests are allowed to use them * std::sync::Mutex lock().unwrap() is common because it's usually correct to continue panicking on poison * `tokio::spawn_blocking(...).await.unwrap()` is common because it will only error if the blocking fn panics, so continuing the panic is also correct I've introduced two extension traits to help with these last two, that are a bit more explicit so they don't need an expect message every time. --- proxy/src/auth/backend/jwt.rs | 1 + proxy/src/auth/backend/mod.rs | 2 + proxy/src/auth/credentials.rs | 1 + proxy/src/cache/endpoints.rs | 4 +- proxy/src/cache/project_info.rs | 1 + proxy/src/cancellation.rs | 4 +- proxy/src/config.rs | 13 ++---- proxy/src/context/parquet.rs | 14 ++++-- proxy/src/control_plane/client/mock.rs | 4 +- proxy/src/ext.rs | 41 +++++++++++++++++ proxy/src/http/health_server.rs | 7 +-- proxy/src/intern.rs | 3 +- proxy/src/lib.rs | 5 ++- proxy/src/logging.rs | 12 ++++- proxy/src/parse.rs | 15 ------- proxy/src/protocol2.rs | 1 + proxy/src/proxy/copy_bidirectional.rs | 1 + proxy/src/proxy/mod.rs | 2 +- proxy/src/proxy/tests/mod.rs | 1 + proxy/src/rate_limiter/leaky_bucket.rs | 2 +- .../src/rate_limiter/limit_algorithm/aimd.rs | 1 + proxy/src/rate_limiter/limiter.rs | 4 +- .../connection_with_credentials_provider.rs | 6 ++- proxy/src/sasl/messages.rs | 5 ++- proxy/src/scram/messages.rs | 1 + proxy/src/scram/mod.rs | 1 + proxy/src/scram/secret.rs | 1 + proxy/src/scram/threadpool.rs | 16 +++---- proxy/src/serverless/backend.rs | 6 ++- proxy/src/serverless/conn_pool.rs | 34 ++++++-------- proxy/src/serverless/conn_pool_lib.rs | 45 +++++++++---------- proxy/src/serverless/http_util.rs | 11 +++-- proxy/src/serverless/json.rs | 6 ++- proxy/src/serverless/local_conn_pool.rs | 11 ++--- proxy/src/serverless/mod.rs | 5 ++- proxy/src/serverless/sql_over_http.rs | 1 + proxy/src/serverless/websocket.rs | 1 + proxy/src/url.rs | 1 + proxy/src/usage_metrics.rs | 1 + 39 files changed, 178 insertions(+), 113 deletions(-) create mode 100644 proxy/src/ext.rs diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index a258090b1582..df716f8455f0 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -776,6 +776,7 @@ impl From<&jose_jwk::Key> for KeyType { } #[cfg(test)] 
+#[expect(clippy::unwrap_used)] mod tests { use std::future::IntoFuture; use std::net::SocketAddr; diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index f38ecf715f7d..50cb94bfa069 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -463,6 +463,8 @@ impl ComputeConnectBackend for Backend<'_, ComputeCredentials> { #[cfg(test)] mod tests { + #![allow(clippy::unimplemented, clippy::unwrap_used)] + use std::net::IpAddr; use std::sync::Arc; use std::time::Duration; diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index f6bce9f2d8aa..eff49a402aaa 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -250,6 +250,7 @@ fn project_name_valid(name: &str) -> bool { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use serde_json::json; use ComputeUserInfoParseError::*; diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs index 20db1fbb147a..0136446d6dfb 100644 --- a/proxy/src/cache/endpoints.rs +++ b/proxy/src/cache/endpoints.rs @@ -12,6 +12,7 @@ use tracing::info; use crate::config::EndpointCacheConfig; use crate::context::RequestContext; +use crate::ext::LockExt; use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}; use crate::metrics::{Metrics, RedisErrors, RedisEventsCount}; use crate::rate_limiter::GlobalRateLimiter; @@ -96,7 +97,7 @@ impl EndpointsCache { // If the limiter allows, we can pretend like it's valid // (incase it is, due to redis channel lag). - if self.limiter.lock().unwrap().check() { + if self.limiter.lock_propagate_poison().check() { return true; } @@ -258,6 +259,7 @@ impl EndpointsCache { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use super::*; diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs index 84430dc812e9..cab0b8b90594 100644 --- a/proxy/src/cache/project_info.rs +++ b/proxy/src/cache/project_info.rs @@ -365,6 +365,7 @@ impl Cache for ProjectInfoCacheImpl { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use super::*; use crate::scram::ServerSecret; diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index ed717507ee40..dd3edd6abc8f 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -13,6 +13,7 @@ use uuid::Uuid; use crate::auth::{check_peer_addr_is_in_list, IpPattern}; use crate::error::ReportableError; +use crate::ext::LockExt; use crate::metrics::{CancellationRequest, CancellationSource, Metrics}; use crate::rate_limiter::LeakyBucketRateLimiter; use crate::redis::cancellation_publisher::{ @@ -114,7 +115,7 @@ impl CancellationHandler
{ IpAddr::V4(ip) => IpNet::V4(Ipv4Net::new_assert(ip, 24).trunc()), // use defaut mask here IpAddr::V6(ip) => IpNet::V6(Ipv6Net::new_assert(ip, 64).trunc()), }; - if !self.limiter.lock().unwrap().check(subnet_key, 1) { + if !self.limiter.lock_propagate_poison().check(subnet_key, 1) { // log only the subnet part of the IP address to know which subnet is rate limited tracing::warn!("Rate limit exceeded. Skipping cancellation message, {subnet_key}"); Metrics::get() @@ -283,6 +284,7 @@ impl
Drop for Session
{ } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use super::*; diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 8bc8e3f96f59..1f991d595eca 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -221,15 +221,10 @@ impl CertResolver { ) -> anyhow::Result<()> { let priv_key = { let key_bytes = std::fs::read(key_path) - .context(format!("Failed to read TLS keys at '{key_path}'"))?; - let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]).collect_vec(); - - ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len()); - PrivateKeyDer::Pkcs8( - keys.pop() - .unwrap() - .context(format!("Failed to parse TLS keys at '{key_path}'"))?, - ) + .with_context(|| format!("Failed to read TLS keys at '{key_path}'"))?; + rustls_pemfile::private_key(&mut &key_bytes[..]) + .with_context(|| format!("Failed to parse TLS keys at '{key_path}'"))? + .with_context(|| format!("Failed to parse TLS keys at '{key_path}'"))? }; let cert_chain_bytes = std::fs::read(cert_path) diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 3105d085260d..5f65b17374f2 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -23,6 +23,7 @@ use utils::backoff; use super::{RequestContextInner, LOG_CHAN}; use crate::config::remote_storage_from_toml; use crate::context::LOG_CHAN_DISCONNECT; +use crate::ext::TaskExt; #[derive(clap::Args, Clone, Debug)] pub struct ParquetUploadArgs { @@ -171,7 +172,9 @@ pub async fn worker( }; let (tx, mut rx) = mpsc::unbounded_channel(); - LOG_CHAN.set(tx.downgrade()).unwrap(); + LOG_CHAN + .set(tx.downgrade()) + .expect("only one worker should set the channel"); // setup row stream that will close on cancellation let cancellation_token2 = cancellation_token.clone(); @@ -207,7 +210,9 @@ pub async fn worker( config.parquet_upload_disconnect_events_remote_storage { let (tx_disconnect, mut rx_disconnect) = mpsc::unbounded_channel(); - LOG_CHAN_DISCONNECT.set(tx_disconnect.downgrade()).unwrap(); + LOG_CHAN_DISCONNECT + .set(tx_disconnect.downgrade()) + .expect("only one worker should set the channel"); // setup row stream that will close on cancellation tokio::spawn(async move { @@ -326,7 +331,7 @@ where Ok::<_, parquet::errors::ParquetError>((rows, w, rg_meta)) }) .await - .unwrap()?; + .propagate_task_panic()?; rows.clear(); Ok((rows, w, rg_meta)) @@ -352,7 +357,7 @@ async fn upload_parquet( Ok((buffer, metadata)) }) .await - .unwrap()?; + .propagate_task_panic()?; let data = buffer.split().freeze(); @@ -409,6 +414,7 @@ async fn upload_parquet( } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use std::net::Ipv4Addr; use std::num::NonZeroUsize; diff --git a/proxy/src/control_plane/client/mock.rs b/proxy/src/control_plane/client/mock.rs index 93edd6547693..5f8bda0f35ae 100644 --- a/proxy/src/control_plane/client/mock.rs +++ b/proxy/src/control_plane/client/mock.rs @@ -102,7 +102,9 @@ impl MockControlPlane { Some(s) => { info!("got allowed_ips: {s}"); s.split(',') - .map(|s| IpPattern::from_str(s).unwrap()) + .map(|s| { + IpPattern::from_str(s).expect("mocked ip pattern should be correct") + }) .collect() } None => vec![], diff --git a/proxy/src/ext.rs b/proxy/src/ext.rs new file mode 100644 index 000000000000..8d00afbf51a4 --- /dev/null +++ b/proxy/src/ext.rs @@ -0,0 +1,41 @@ +use std::panic::resume_unwind; +use std::sync::{Mutex, MutexGuard}; + +use tokio::task::JoinError; + +pub(crate) trait LockExt { + fn lock_propagate_poison(&self) -> MutexGuard<'_, T>; +} + +impl LockExt for Mutex { + /// 
Lock the mutex and panic if the mutex was poisoned. + #[track_caller] + fn lock_propagate_poison(&self) -> MutexGuard<'_, T> { + match self.lock() { + Ok(guard) => guard, + // poison occurs when another thread panicked while holding the lock guard. + // since panicking is often unrecoverable, propagating the poison panic is reasonable. + Err(poison) => panic!("{poison}"), + } + } +} + +pub(crate) trait TaskExt { + fn propagate_task_panic(self) -> T; +} + +impl TaskExt for Result { + /// Unwrap the result and panic if the inner task panicked. + /// Also panics if the task was cancelled + #[track_caller] + fn propagate_task_panic(self) -> T { + match self { + Ok(t) => t, + // Using resume_unwind prevents the panic hook being called twice. + // Since we use this for structured concurrency, there is only + // 1 logical panic, so this is more correct. + Err(e) if e.is_panic() => resume_unwind(e.into_panic()), + Err(e) => panic!("unexpected task error: {e}"), + } + } +} diff --git a/proxy/src/http/health_server.rs b/proxy/src/http/health_server.rs index 978ad9f76131..6ca091feb716 100644 --- a/proxy/src/http/health_server.rs +++ b/proxy/src/http/health_server.rs @@ -14,6 +14,7 @@ use utils::http::error::ApiError; use utils::http::json::json_response; use utils::http::{RouterBuilder, RouterService}; +use crate::ext::{LockExt, TaskExt}; use crate::jemalloc; async fn status_handler(_: Request) -> Result, ApiError> { @@ -76,7 +77,7 @@ async fn prometheus_metrics_handler( let body = tokio::task::spawn_blocking(move || { let _span = span.entered(); - let mut state = state.lock().unwrap(); + let mut state = state.lock_propagate_poison(); let PrometheusHandler { encoder, metrics } = &mut *state; metrics @@ -94,13 +95,13 @@ async fn prometheus_metrics_handler( body }) .await - .unwrap(); + .propagate_task_panic(); let response = Response::builder() .status(200) .header(CONTENT_TYPE, "text/plain; version=0.0.4") .body(Body::from(body)) - .unwrap(); + .expect("response headers should be valid"); Ok(response) } diff --git a/proxy/src/intern.rs b/proxy/src/intern.rs index f56d92a6b31e..79c6020302af 100644 --- a/proxy/src/intern.rs +++ b/proxy/src/intern.rs @@ -83,7 +83,7 @@ impl StringInterner { pub(crate) fn new() -> Self { StringInterner { inner: ThreadedRodeo::with_capacity_memory_limits_and_hasher( - Capacity::new(2500, NonZeroUsize::new(1 << 16).unwrap()), + Capacity::new(2500, NonZeroUsize::new(1 << 16).expect("value is nonzero")), // unbounded MemoryLimits::for_memory_usage(usize::MAX), BuildHasherDefault::::default(), @@ -207,6 +207,7 @@ impl From for ProjectIdInt { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use std::sync::OnceLock; diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index ba69f9cf2d28..a5a72f26d950 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -22,8 +22,8 @@ clippy::string_add, clippy::string_to_string, clippy::todo, - // TODO: consider clippy::unimplemented - // TODO: consider clippy::unwrap_used + clippy::unimplemented, + clippy::unwrap_used, )] // List of permanently allowed lints. 
#![allow( @@ -82,6 +82,7 @@ pub mod console_redirect_proxy; pub mod context; pub mod control_plane; pub mod error; +mod ext; pub mod http; pub mod intern; pub mod jemalloc; diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs index 74d2b9a1d01e..41f10f052ffa 100644 --- a/proxy/src/logging.rs +++ b/proxy/src/logging.rs @@ -18,8 +18,16 @@ pub async fn init() -> anyhow::Result { let env_filter = EnvFilter::builder() .with_default_directive(LevelFilter::INFO.into()) .from_env_lossy() - .add_directive("aws_config=info".parse().unwrap()) - .add_directive("azure_core::policies::transport=off".parse().unwrap()); + .add_directive( + "aws_config=info" + .parse() + .expect("this should be a valid filter directive"), + ) + .add_directive( + "azure_core::policies::transport=off" + .parse() + .expect("this should be a valid filter directive"), + ); let fmt_layer = tracing_subscriber::fmt::layer() .with_ansi(false) diff --git a/proxy/src/parse.rs b/proxy/src/parse.rs index 8c0f25106662..095d6278cc51 100644 --- a/proxy/src/parse.rs +++ b/proxy/src/parse.rs @@ -8,14 +8,6 @@ pub(crate) fn split_cstr(bytes: &[u8]) -> Option<(&CStr, &[u8])> { Some((cstr, other)) } -/// See . -pub(crate) fn split_at_const(bytes: &[u8]) -> Option<(&[u8; N], &[u8])> { - (bytes.len() >= N).then(|| { - let (head, tail) = bytes.split_at(N); - (head.try_into().unwrap(), tail) - }) -} - #[cfg(test)] mod tests { use super::*; @@ -33,11 +25,4 @@ mod tests { assert_eq!(cstr.to_bytes(), b"foo"); assert_eq!(rest, b"bar"); } - - #[test] - fn test_split_at_const() { - assert!(split_at_const::<0>(b"").is_some()); - assert!(split_at_const::<1>(b"").is_none()); - assert!(matches!(split_at_const::<1>(b"ok"), Some((b"o", b"k")))); - } } diff --git a/proxy/src/protocol2.rs b/proxy/src/protocol2.rs index 33a5eb5e1e03..0dc97b709724 100644 --- a/proxy/src/protocol2.rs +++ b/proxy/src/protocol2.rs @@ -396,6 +396,7 @@ impl NetworkEndianIpv6 { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use tokio::io::AsyncReadExt; diff --git a/proxy/src/proxy/copy_bidirectional.rs b/proxy/src/proxy/copy_bidirectional.rs index 4e4af8863484..3336a9556a5b 100644 --- a/proxy/src/proxy/copy_bidirectional.rs +++ b/proxy/src/proxy/copy_bidirectional.rs @@ -257,6 +257,7 @@ impl CopyBuffer { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use tokio::io::AsyncWriteExt; diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 5db92d748a5b..4e5ecda237d5 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -488,7 +488,7 @@ impl NeonOptions { pub(crate) fn neon_option(bytes: &str) -> Option<(&str, &str)> { static RE: OnceCell = OnceCell::new(); - let re = RE.get_or_init(|| Regex::new(r"^neon_(\w+):(.+)").unwrap()); + let re = RE.get_or_init(|| Regex::new(r"^neon_(\w+):(.+)").expect("regex should be correct")); let cap = re.captures(bytes)?; let (_, [k, v]) = cap.extract(); diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 3899ba4267ca..95c518fed9c2 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -1,4 +1,5 @@ //! A group of high-level tests for connection establishing logic and auth. 
+#![allow(clippy::unimplemented, clippy::unwrap_used)] mod mitm; diff --git a/proxy/src/rate_limiter/leaky_bucket.rs b/proxy/src/rate_limiter/leaky_bucket.rs index 45f9630dde0f..bff800f0a2f0 100644 --- a/proxy/src/rate_limiter/leaky_bucket.rs +++ b/proxy/src/rate_limiter/leaky_bucket.rs @@ -83,7 +83,7 @@ impl From for utils::leaky_bucket::LeakyBucketConfig { } #[cfg(test)] -#[allow(clippy::float_cmp)] +#[allow(clippy::float_cmp, clippy::unwrap_used)] mod tests { use std::time::Duration; diff --git a/proxy/src/rate_limiter/limit_algorithm/aimd.rs b/proxy/src/rate_limiter/limit_algorithm/aimd.rs index 3000cc4c2af2..04e136b6d543 100644 --- a/proxy/src/rate_limiter/limit_algorithm/aimd.rs +++ b/proxy/src/rate_limiter/limit_algorithm/aimd.rs @@ -63,6 +63,7 @@ impl LimitAlgorithm for Aimd { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use std::time::Duration; diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index a048721e77d7..6f6a8c9d4781 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -12,6 +12,7 @@ use rand::{Rng, SeedableRng}; use tokio::time::{Duration, Instant}; use tracing::info; +use crate::ext::LockExt; use crate::intern::EndpointIdInt; pub struct GlobalRateLimiter { @@ -246,12 +247,13 @@ impl BucketRateLimiter { let n = self.map.shards().len(); // this lock is ok as the periodic cycle of do_gc makes this very unlikely to collide // (impossible, infact, unless we have 2048 threads) - let shard = self.rand.lock().unwrap().gen_range(0..n); + let shard = self.rand.lock_propagate_poison().gen_range(0..n); self.map.shards()[shard].write().clear(); } } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use std::hash::BuildHasherDefault; use std::time::Duration; diff --git a/proxy/src/redis/connection_with_credentials_provider.rs b/proxy/src/redis/connection_with_credentials_provider.rs index 82139ea1d5e5..0f6e765b02cd 100644 --- a/proxy/src/redis/connection_with_credentials_provider.rs +++ b/proxy/src/redis/connection_with_credentials_provider.rs @@ -69,7 +69,11 @@ impl ConnectionWithCredentialsProvider { pub fn new_with_static_credentials(params: T) -> Self { Self { - credentials: Credentials::Static(params.into_connection_info().unwrap()), + credentials: Credentials::Static( + params + .into_connection_info() + .expect("static configured redis credentials should be a valid format"), + ), con: None, refresh_token_task: None, mutex: tokio::sync::Mutex::new(()), diff --git a/proxy/src/sasl/messages.rs b/proxy/src/sasl/messages.rs index 1373dfba3d9a..4922ece61531 100644 --- a/proxy/src/sasl/messages.rs +++ b/proxy/src/sasl/messages.rs @@ -2,7 +2,7 @@ use pq_proto::{BeAuthenticationSaslMessage, BeMessage}; -use crate::parse::{split_at_const, split_cstr}; +use crate::parse::split_cstr; /// SASL-specific payload of [`PasswordMessage`](pq_proto::FeMessage::PasswordMessage). 
#[derive(Debug)] @@ -19,7 +19,7 @@ impl<'a> FirstMessage<'a> { let (method_cstr, tail) = split_cstr(bytes)?; let method = method_cstr.to_str().ok()?; - let (len_bytes, bytes) = split_at_const(tail)?; + let (len_bytes, bytes) = tail.split_first_chunk()?; let len = u32::from_be_bytes(*len_bytes) as usize; if len != bytes.len() { return None; @@ -51,6 +51,7 @@ impl<'a> ServerMessage<&'a str> { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use super::*; diff --git a/proxy/src/scram/messages.rs b/proxy/src/scram/messages.rs index 5ee3a513527d..0e54e7ded9a7 100644 --- a/proxy/src/scram/messages.rs +++ b/proxy/src/scram/messages.rs @@ -185,6 +185,7 @@ impl fmt::Debug for OwnedServerFirstMessage { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use super::*; diff --git a/proxy/src/scram/mod.rs b/proxy/src/scram/mod.rs index 718445f61d48..b49a9f32eec4 100644 --- a/proxy/src/scram/mod.rs +++ b/proxy/src/scram/mod.rs @@ -57,6 +57,7 @@ fn sha256<'a>(parts: impl IntoIterator) -> [u8; 32] { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use super::threadpool::ThreadPool; use super::{Exchange, ServerSecret}; diff --git a/proxy/src/scram/secret.rs b/proxy/src/scram/secret.rs index 8c6a08d432d8..eb21b26ab40e 100644 --- a/proxy/src/scram/secret.rs +++ b/proxy/src/scram/secret.rs @@ -72,6 +72,7 @@ impl ServerSecret { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use super::*; diff --git a/proxy/src/scram/threadpool.rs b/proxy/src/scram/threadpool.rs index ebc6dd2a3cef..8f1684c75b0e 100644 --- a/proxy/src/scram/threadpool.rs +++ b/proxy/src/scram/threadpool.rs @@ -33,14 +33,11 @@ thread_local! { } impl ThreadPool { - pub fn new(n_workers: u8) -> Arc { + pub fn new(mut n_workers: u8) -> Arc { // rayon would be nice here, but yielding in rayon does not work well afaict. if n_workers == 0 { - return Arc::new(Self { - runtime: None, - metrics: Arc::new(ThreadPoolMetrics::new(n_workers as usize)), - }); + n_workers = 1; } Arc::new_cyclic(|pool| { @@ -66,7 +63,7 @@ impl ThreadPool { }); }) .build() - .unwrap(); + .expect("password threadpool runtime should be configured correctly"); Self { runtime: Some(runtime), @@ -79,7 +76,7 @@ impl ThreadPool { JobHandle( self.runtime .as_ref() - .unwrap() + .expect("runtime is always set") .spawn(JobSpec { pbkdf2, endpoint }), ) } @@ -87,7 +84,10 @@ impl ThreadPool { impl Drop for ThreadPool { fn drop(&mut self) { - self.runtime.take().unwrap().shutdown_background(); + self.runtime + .take() + .expect("runtime is always set") + .shutdown_background(); } } diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 15d883bdb008..449d50b6e78b 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -268,7 +268,11 @@ impl PoolingBackend { if !self.local_pool.initialized(&conn_info) { // only install and grant usage one at a time. 
- let _permit = local_backend.initialize.acquire().await.unwrap(); + let _permit = local_backend + .initialize + .acquire() + .await + .expect("semaphore should never be closed"); // check again for race if !self.local_pool.initialized(&conn_info) { diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index cac5a173cb16..447103edce53 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -186,8 +186,8 @@ impl ClientDataRemote { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { - use std::mem; use std::sync::atomic::AtomicBool; use super::*; @@ -269,39 +269,33 @@ mod tests { assert_eq!(0, pool.get_global_connections_count()); } { - let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); - client.do_drop().unwrap()(); - mem::forget(client); // drop the client + let client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); + drop(client); assert_eq!(1, pool.get_global_connections_count()); } { - let mut closed_client = Client::new( + let closed_client = Client::new( create_inner_with(MockClient::new(true)), conn_info.clone(), ep_pool.clone(), ); - closed_client.do_drop().unwrap()(); - mem::forget(closed_client); // drop the client - // The closed client shouldn't be added to the pool. + drop(closed_client); assert_eq!(1, pool.get_global_connections_count()); } let is_closed: Arc = Arc::new(false.into()); { - let mut client = Client::new( + let client = Client::new( create_inner_with(MockClient(is_closed.clone())), conn_info.clone(), ep_pool.clone(), ); - client.do_drop().unwrap()(); - mem::forget(client); // drop the client - + drop(client); // The client should be added to the pool. assert_eq!(2, pool.get_global_connections_count()); } { - let mut client = Client::new(create_inner(), conn_info, ep_pool); - client.do_drop().unwrap()(); - mem::forget(client); // drop the client + let client = Client::new(create_inner(), conn_info, ep_pool); + drop(client); // The client shouldn't be added to the pool. Because the ep-pool is full. assert_eq!(2, pool.get_global_connections_count()); @@ -319,15 +313,13 @@ mod tests { &pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key().unwrap()), ); { - let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); - client.do_drop().unwrap()(); - mem::forget(client); // drop the client + let client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); + drop(client); assert_eq!(3, pool.get_global_connections_count()); } { - let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); - client.do_drop().unwrap()(); - mem::forget(client); // drop the client + let client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); + drop(client); // The client shouldn't be added to the pool. Because the global pool is full. 
assert_eq!(3, pool.get_global_connections_count()); diff --git a/proxy/src/serverless/conn_pool_lib.rs b/proxy/src/serverless/conn_pool_lib.rs index 2a46c8f9c5cf..44eac77e8f94 100644 --- a/proxy/src/serverless/conn_pool_lib.rs +++ b/proxy/src/serverless/conn_pool_lib.rs @@ -187,19 +187,22 @@ impl EndpointConnPool { pub(crate) fn put(pool: &RwLock, conn_info: &ConnInfo, client: ClientInnerCommon) { let conn_id = client.get_conn_id(); - let pool_name = pool.read().get_name().to_string(); + let (max_conn, conn_count, pool_name) = { + let pool = pool.read(); + ( + pool.global_pool_size_max_conns, + pool.global_connections_count + .load(atomic::Ordering::Relaxed), + pool.get_name().to_string(), + ) + }; + if client.inner.is_closed() { info!(%conn_id, "{}: throwing away connection '{conn_info}' because connection is closed", pool_name); return; } - let global_max_conn = pool.read().global_pool_size_max_conns; - if pool - .read() - .global_connections_count - .load(atomic::Ordering::Relaxed) - >= global_max_conn - { + if conn_count >= max_conn { info!(%conn_id, "{}: throwing away connection '{conn_info}' because pool is full", pool_name); return; } @@ -633,35 +636,29 @@ impl Client { } pub(crate) fn metrics(&self) -> Arc { - let aux = &self.inner.as_ref().unwrap().aux; + let aux = &self + .inner + .as_ref() + .expect("client inner should not be removed") + .aux; USAGE_METRICS.register(Ids { endpoint_id: aux.endpoint_id, branch_id: aux.branch_id, }) } +} - pub(crate) fn do_drop(&mut self) -> Option> { +impl Drop for Client { + fn drop(&mut self) { let conn_info = self.conn_info.clone(); let client = self .inner .take() .expect("client inner should not be removed"); if let Some(conn_pool) = std::mem::take(&mut self.pool).upgrade() { - let current_span = self.span.clone(); + let _current_span = self.span.enter(); // return connection to the pool - return Some(move || { - let _span = current_span.enter(); - EndpointConnPool::put(&conn_pool, &conn_info, client); - }); - } - None - } -} - -impl Drop for Client { - fn drop(&mut self) { - if let Some(drop) = self.do_drop() { - tokio::task::spawn_blocking(drop); + EndpointConnPool::put(&conn_pool, &conn_info, client); } } } diff --git a/proxy/src/serverless/http_util.rs b/proxy/src/serverless/http_util.rs index c0208d4f68f1..d5c948777cae 100644 --- a/proxy/src/serverless/http_util.rs +++ b/proxy/src/serverless/http_util.rs @@ -81,11 +81,14 @@ impl HttpErrorBody { .header(http::header::CONTENT_TYPE, "application/json") // we do not have nested maps with non string keys so serialization shouldn't fail .body( - Full::new(Bytes::from(serde_json::to_string(self).unwrap())) - .map_err(|x| match x {}) - .boxed(), + Full::new(Bytes::from( + serde_json::to_string(self) + .expect("serialising HttpErrorBody should never fail"), + )) + .map_err(|x| match x {}) + .boxed(), ) - .unwrap() + .expect("content-type header should be valid") } } diff --git a/proxy/src/serverless/json.rs b/proxy/src/serverless/json.rs index 25b25c66d3fb..ab012bd020f1 100644 --- a/proxy/src/serverless/json.rs +++ b/proxy/src/serverless/json.rs @@ -204,7 +204,10 @@ fn pg_array_parse_inner( if c == '\\' { escaped = true; - (i, c) = pg_array_chr.next().unwrap(); + let Some(x) = pg_array_chr.next() else { + return Err(JsonConversionError::UnbalancedArray); + }; + (i, c) = x; } match c { @@ -253,6 +256,7 @@ fn pg_array_parse_inner( } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use serde_json::json; diff --git a/proxy/src/serverless/local_conn_pool.rs 
b/proxy/src/serverless/local_conn_pool.rs index b84cde9e252a..c51a2bc9babb 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -179,7 +179,6 @@ pub(crate) fn poll_client( info!(cold_start_info = cold_start_info.as_str(), %conn_info, %session_id, "new connection"); }); let pool = Arc::downgrade(&global_pool); - let pool_clone = pool.clone(); let db_user = conn_info.db_and_user(); let idle = global_pool.get_idle_timeout(); @@ -273,11 +272,7 @@ pub(crate) fn poll_client( }), }; - Client::new( - inner, - conn_info, - Arc::downgrade(&pool_clone.upgrade().unwrap().global_pool), - ) + Client::new(inner, conn_info, Arc::downgrade(&global_pool.global_pool)) } impl ClientInnerCommon { @@ -321,7 +316,8 @@ fn resign_jwt(sk: &SigningKey, payload: &[u8], jti: u64) -> Result(buffer.format(jti)).unwrap(); + let jti = serde_json::from_str::<&RawValue>(buffer.format(jti)) + .expect("itoa formatted integer should be guaranteed valid json"); // update the jti in-place let payload = @@ -368,6 +364,7 @@ fn sign_jwt(sk: &SigningKey, payload: &[u8]) -> String { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use p256::ecdsa::SigningKey; use typed_json::json; diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index 80b42f9e5534..c2623e0ecae3 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -46,6 +46,7 @@ use utils::http::error::ApiError; use crate::cancellation::CancellationHandlerMain; use crate::config::{ProxyConfig, ProxyProtocolV2}; use crate::context::RequestContext; +use crate::ext::TaskExt; use crate::metrics::Metrics; use crate::protocol2::{read_proxy_protocol, ChainRW, ConnectHeader, ConnectionInfo}; use crate::proxy::run_until_cancelled; @@ -84,7 +85,7 @@ pub async fn task_main( cancellation_token.cancelled().await; tokio::task::spawn_blocking(move || conn_pool.shutdown()) .await - .unwrap(); + .propagate_task_panic(); } }); @@ -104,7 +105,7 @@ pub async fn task_main( cancellation_token.cancelled().await; tokio::task::spawn_blocking(move || http_conn_pool.shutdown()) .await - .unwrap(); + .propagate_task_panic(); } }); diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 5e85f5ec4019..3e42787a0964 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -1110,6 +1110,7 @@ impl Discard<'_> { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use super::*; diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index bdb83fe6be05..812fedaf0422 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -178,6 +178,7 @@ pub(crate) async fn serve_websocket( } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use std::pin::pin; diff --git a/proxy/src/url.rs b/proxy/src/url.rs index 270cd7c24da0..d73a84057ae3 100644 --- a/proxy/src/url.rs +++ b/proxy/src/url.rs @@ -50,6 +50,7 @@ impl std::fmt::Display for ApiUrl { } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use super::*; diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index 65e74466f2ec..487504d709ed 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -407,6 +407,7 @@ async fn upload_backup_events( } #[cfg(test)] +#[expect(clippy::unwrap_used)] mod tests { use std::fs; use std::io::BufReader; From 28ccda0a63f9661d961dbb1e5372f4a78d80346d Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 16 Dec 2024 18:10:55 +0100 Subject: [PATCH 
22/56] test_runner: ignore error in `test_timeline_archival_chaos` (#10161) Resolves #10159. --- test_runner/regress/test_timeline_archive.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index addf702893ff..87579f9e9280 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -426,6 +426,7 @@ def test_timeline_archival_chaos(neon_env_builder: NeonEnvBuilder): [ ".*removing local file.*because it has unexpected length.*", ".*__temp.*", + ".*method=POST path=\\S+/timeline .*: Not activating a Stopping timeline.*", # FIXME: there are still anyhow::Error paths in timeline creation/deletion which # generate 500 results when called during shutdown (https://github.com/neondatabase/neon/issues/9768) ".*InternalServerError.*", From aa7ab9b3aca4c399e6caa5fd8fb31e6f0a7723b0 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Mon, 16 Dec 2024 19:56:24 +0100 Subject: [PATCH 23/56] proxy: Allow dumping TLS session keys for debugging (#10163) ## Problem To debug issues with TLS connections there's no easy way to decrypt packets unless a client has special support for logging the keys. ## Summary of changes Add TLS session keys logging to proxy via `SSLKEYLOGFILE` env var gated by flag. --- proxy/src/bin/proxy.rs | 4 ++++ proxy/src/config.rs | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 97c4037009da..e90555e250b8 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -105,6 +105,9 @@ struct ProxyCliArgs { /// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir #[clap(short = 'c', long, alias = "ssl-cert")] tls_cert: Option, + /// Allow writing TLS session keys to the given file pointed to by the environment variable `SSLKEYLOGFILE`. + #[clap(long, alias = "allow-ssl-keylogfile")] + allow_tls_keylogfile: bool, /// path to directory with TLS certificates for client postgres connections #[clap(long)] certs_dir: Option, @@ -555,6 +558,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { key_path, cert_path, args.certs_dir.as_ref(), + args.allow_tls_keylogfile, )?), (None, None) => None, _ => bail!("either both or neither tls-key and tls-cert must be specified"), diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 1f991d595eca..debd77ac3296 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -95,6 +95,7 @@ pub fn configure_tls( key_path: &str, cert_path: &str, certs_dir: Option<&String>, + allow_tls_keylogfile: bool, ) -> anyhow::Result { let mut cert_resolver = CertResolver::new(); @@ -135,6 +136,11 @@ pub fn configure_tls( config.alpn_protocols = vec![PG_ALPN_PROTOCOL.to_vec()]; + if allow_tls_keylogfile { + // KeyLogFile will check for the SSLKEYLOGFILE environment variable. + config.key_log = Arc::new(rustls::KeyLogFile::new()); + } + Ok(TlsConfig { config: Arc::new(config), common_names, From e226d7a3d1901c1a89f640c8d95b2efd057628b1 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Tue, 17 Dec 2024 09:16:54 +0100 Subject: [PATCH 24/56] Fix docker compose with PG17 (#10165) ## Problem It's impossible to run docker compose with compute v17 due to `pg_anon` extension which is not supported under PG17. 
## Summary of changes The auto-loading of `pg_anon` is disabled by default --- .../compute_wrapper/var/db/postgres/specs/spec.json | 5 ----- docker-compose/docker_compose_test.sh | 12 ++++++------ 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json b/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json index 8e582e74e15d..0308cab4515a 100644 --- a/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json +++ b/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json @@ -132,11 +132,6 @@ "name": "cron.database", "value": "postgres", "vartype": "string" - }, - { - "name": "session_preload_libraries", - "value": "anon", - "vartype": "string" } ] }, diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh index c97dfaa901e8..063664d0c67d 100755 --- a/docker-compose/docker_compose_test.sh +++ b/docker-compose/docker_compose_test.sh @@ -35,11 +35,11 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do echo "clean up containers if exists" cleanup PG_TEST_VERSION=$((pg_version < 16 ? 16 : pg_version)) - # The support of pg_anon not yet added to PG17, so we have to remove the corresponding option - if [ $pg_version -eq 17 ]; then + # The support of pg_anon not yet added to PG17, so we have to add the corresponding option for other PG versions + if [ "${pg_version}" -ne 17 ]; then SPEC_PATH="compute_wrapper/var/db/postgres/specs" mv $SPEC_PATH/spec.json $SPEC_PATH/spec.bak - jq 'del(.cluster.settings[] | select (.name == "session_preload_libraries"))' $SPEC_PATH/spec.bak > $SPEC_PATH/spec.json + jq '.cluster.settings += [{"name": "session_preload_libraries","value": "anon","vartype": "string"}]' "${SPEC_PATH}/spec.bak" > "${SPEC_PATH}/spec.json" fi PG_VERSION=$pg_version PG_TEST_VERSION=$PG_TEST_VERSION docker compose --profile test-extensions -f $COMPOSE_FILE up --build -d @@ -106,8 +106,8 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do fi fi cleanup - # The support of pg_anon not yet added to PG17, so we have to remove the corresponding option - if [ $pg_version -eq 17 ]; then - mv $SPEC_PATH/spec.bak $SPEC_PATH/spec.json + # Restore the original spec.json + if [ "$pg_version" -ne 17 ]; then + mv "$SPEC_PATH/spec.bak" "$SPEC_PATH/spec.json" fi done From b0e43c2f88bf81473c931e3dcf50a22d62f3ebd4 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 17 Dec 2024 11:35:00 +0100 Subject: [PATCH 25/56] postgres_ffi: add `WalStreamDecoder::complete_record()` benchmark (#10158) Touches #10097. 
--- Cargo.lock | 2 + libs/postgres_ffi/Cargo.toml | 6 +++ libs/postgres_ffi/benches/README.md | 26 +++++++++++++ libs/postgres_ffi/benches/waldecoder.rs | 49 +++++++++++++++++++++++++ libs/postgres_ffi/src/wal_generator.rs | 16 ++++++++ 5 files changed, 99 insertions(+) create mode 100644 libs/postgres_ffi/benches/README.md create mode 100644 libs/postgres_ffi/benches/waldecoder.rs diff --git a/Cargo.lock b/Cargo.lock index d1f77469699e..5ec5253719b0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4401,11 +4401,13 @@ dependencies = [ "bindgen", "bytes", "crc32c", + "criterion", "env_logger", "log", "memoffset 0.9.0", "once_cell", "postgres", + "pprof", "regex", "serde", "thiserror", diff --git a/libs/postgres_ffi/Cargo.toml b/libs/postgres_ffi/Cargo.toml index e1f5443cbef3..b7a376841d45 100644 --- a/libs/postgres_ffi/Cargo.toml +++ b/libs/postgres_ffi/Cargo.toml @@ -9,9 +9,11 @@ regex.workspace = true bytes.workspace = true anyhow.workspace = true crc32c.workspace = true +criterion.workspace = true once_cell.workspace = true log.workspace = true memoffset.workspace = true +pprof.workspace = true thiserror.workspace = true serde.workspace = true utils.workspace = true @@ -24,3 +26,7 @@ postgres.workspace = true [build-dependencies] anyhow.workspace = true bindgen.workspace = true + +[[bench]] +name = "waldecoder" +harness = false diff --git a/libs/postgres_ffi/benches/README.md b/libs/postgres_ffi/benches/README.md new file mode 100644 index 000000000000..00a8980174fd --- /dev/null +++ b/libs/postgres_ffi/benches/README.md @@ -0,0 +1,26 @@ +## Benchmarks + +To run benchmarks: + +```sh +# All benchmarks. +cargo bench --package postgres_ffi + +# Specific file. +cargo bench --package postgres_ffi --bench waldecoder + +# Specific benchmark. +cargo bench --package postgres_ffi --bench waldecoder complete_record/size=1024 + +# List available benchmarks. +cargo bench --package postgres_ffi --benches -- --list + +# Generate flamegraph profiles using pprof-rs, profiling for 10 seconds. +# Output in target/criterion/*/profile/flamegraph.svg. +cargo bench --package postgres_ffi --bench waldecoder complete_record/size=1024 -- --profile-time 10 +``` + +Additional charts and statistics are available in `target/criterion/report/index.html`. + +Benchmarks are automatically compared against the previous run. To compare against other runs, see +`--baseline` and `--save-baseline`. \ No newline at end of file diff --git a/libs/postgres_ffi/benches/waldecoder.rs b/libs/postgres_ffi/benches/waldecoder.rs new file mode 100644 index 000000000000..c8cf0d322a54 --- /dev/null +++ b/libs/postgres_ffi/benches/waldecoder.rs @@ -0,0 +1,49 @@ +use std::ffi::CStr; + +use criterion::{criterion_group, criterion_main, Bencher, Criterion}; +use postgres_ffi::v17::wal_generator::LogicalMessageGenerator; +use postgres_ffi::v17::waldecoder_handler::WalStreamDecoderHandler; +use postgres_ffi::waldecoder::WalStreamDecoder; +use pprof::criterion::{Output, PProfProfiler}; +use utils::lsn::Lsn; + +const KB: usize = 1024; + +// Register benchmarks with Criterion. +criterion_group!( + name = benches; + config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None))); + targets = bench_complete_record, +); +criterion_main!(benches); + +/// Benchmarks WalStreamDecoder::complete_record() for a logical message of varying size. 
+fn bench_complete_record(c: &mut Criterion) { + let mut g = c.benchmark_group("complete_record"); + for size in [64, KB, 8 * KB, 128 * KB] { + // Kind of weird to change the group throughput per benchmark, but it's the only way + // to vary it per benchmark. It works. + g.throughput(criterion::Throughput::Bytes(size as u64)); + g.bench_function(format!("size={size}"), |b| run_bench(b, size).unwrap()); + } + + fn run_bench(b: &mut Bencher, size: usize) -> anyhow::Result<()> { + const PREFIX: &CStr = c""; + let value_size = LogicalMessageGenerator::make_value_size(size, PREFIX); + let value = vec![1; value_size]; + + let mut decoder = WalStreamDecoder::new(Lsn(0), 170000); + let msg = LogicalMessageGenerator::new(PREFIX, &value) + .next() + .unwrap() + .encode(Lsn(0)); + assert_eq!(msg.len(), size); + + b.iter(|| { + let msg = msg.clone(); // Bytes::clone() is cheap + decoder.complete_record(msg).unwrap(); + }); + + Ok(()) + } +} diff --git a/libs/postgres_ffi/src/wal_generator.rs b/libs/postgres_ffi/src/wal_generator.rs index dc679eea3302..69cc4b771fa1 100644 --- a/libs/postgres_ffi/src/wal_generator.rs +++ b/libs/postgres_ffi/src/wal_generator.rs @@ -231,6 +231,22 @@ impl LogicalMessageGenerator { }; [&header.encode(), prefix, message].concat().into() } + + /// Computes how large a value must be to get a record of the given size. Convenience method to + /// construct records of pre-determined size. Panics if the record size is too small. + pub fn make_value_size(record_size: usize, prefix: &CStr) -> usize { + let xlog_header_size = XLOG_SIZE_OF_XLOG_RECORD; + let lm_header_size = size_of::(); + let prefix_size = prefix.to_bytes_with_nul().len(); + let data_header_size = match record_size - xlog_header_size - 2 { + 0..=255 => 2, + 256..=258 => panic!("impossible record_size {record_size}"), + 259.. => 5, + }; + record_size + .checked_sub(xlog_header_size + lm_header_size + prefix_size + data_header_size) + .expect("record_size too small") + } } impl Iterator for LogicalMessageGenerator { From b5833ef2594b39b6616526b60cbc8c506e5870fc Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 17 Dec 2024 12:24:51 +0000 Subject: [PATCH 26/56] remote_storage: configurable connection pooling for ABS (#10169) ## Problem The ABS SDK's default behavior is to do no connection pooling, i.e. open and close a fresh connection for each request. Under high request rates, this can result in an accumulation of TCP connections in TIME_WAIT or CLOSE_WAIT state, and in extreme cases exhaustion of client ports. Related: https://github.com/neondatabase/cloud/issues/20971 ## Summary of changes - Add a configurable `conn_pool_size` parameter for Azure storage, defaulting to zero (current behavior) - Construct a custom reqwest client using this connection pool size. 
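Expanding on the second bullet above: roughly, the transport wiring amounts to building a `reqwest` client with a bounded idle-connection pool and handing it to the Azure SDK as a custom transport. A sketch, not the exact code in `azure_blob.rs`:

```rust
use std::sync::Arc;

use azure_core::{HttpClient, TransportOptions};

/// Sketch: an Azure SDK transport backed by a reqwest client with a bounded
/// idle-connection pool. A `conn_pool_size` of 0 keeps the previous behavior
/// of opening a fresh connection per request.
fn azure_transport(conn_pool_size: usize) -> TransportOptions {
    let client = reqwest::ClientBuilder::new()
        .pool_max_idle_per_host(conn_pool_size)
        .build()
        .expect("failed to build `reqwest` client");
    let client: Arc<dyn HttpClient> = Arc::new(client);
    TransportOptions::new(client)
}
```

In the remote storage TOML config the knob surfaces as a new `conn_pool_size` key alongside the other Azure settings, as exercised by the config test in the diff.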
--- Cargo.lock | 1 + libs/remote_storage/Cargo.toml | 1 + libs/remote_storage/src/azure_blob.rs | 20 ++++++++++++++++++-- libs/remote_storage/src/config.rs | 14 ++++++++++++++ libs/remote_storage/tests/test_real_azure.rs | 1 + 5 files changed, 35 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5ec5253719b0..b9b89efa02ce 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5064,6 +5064,7 @@ dependencies = [ "once_cell", "pin-project-lite", "rand 0.8.5", + "reqwest", "scopeguard", "serde", "serde_json", diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index 1816825bda7a..33fa6e89f501 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -18,6 +18,7 @@ camino = { workspace = true, features = ["serde1"] } humantime-serde.workspace = true hyper = { workspace = true, features = ["client"] } futures.workspace = true +reqwest.workspace = true serde.workspace = true serde_json.workspace = true tokio = { workspace = true, features = ["sync", "fs", "io-util"] } diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index 19c8251ccd37..c89f50ef2b24 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -8,6 +8,7 @@ use std::io; use std::num::NonZeroU32; use std::pin::Pin; use std::str::FromStr; +use std::sync::Arc; use std::time::Duration; use std::time::SystemTime; @@ -15,6 +16,8 @@ use super::REMOTE_STORAGE_PREFIX_SEPARATOR; use anyhow::Context; use anyhow::Result; use azure_core::request_options::{IfMatchCondition, MaxResults, Metadata, Range}; +use azure_core::HttpClient; +use azure_core::TransportOptions; use azure_core::{Continuable, RetryOptions}; use azure_storage::StorageCredentials; use azure_storage_blobs::blob::CopyStatus; @@ -80,8 +83,13 @@ impl AzureBlobStorage { StorageCredentials::token_credential(token_credential) }; - // we have an outer retry - let builder = ClientBuilder::new(account, credentials).retry(RetryOptions::none()); + let builder = ClientBuilder::new(account, credentials) + // we have an outer retry + .retry(RetryOptions::none()) + // Customize transport to configure conneciton pooling + .transport(TransportOptions::new(Self::reqwest_client( + azure_config.conn_pool_size, + ))); let client = builder.container_client(azure_config.container_name.to_owned()); @@ -106,6 +114,14 @@ impl AzureBlobStorage { }) } + fn reqwest_client(conn_pool_size: usize) -> Arc { + let client = reqwest::ClientBuilder::new() + .pool_max_idle_per_host(conn_pool_size) + .build() + .expect("failed to build `reqwest` client"); + Arc::new(client) + } + pub fn relative_path_to_name(&self, path: &RemotePath) -> String { assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR); let path_string = path.get_path().as_str(); diff --git a/libs/remote_storage/src/config.rs b/libs/remote_storage/src/config.rs index f6ef31077c76..dd49d4d5e710 100644 --- a/libs/remote_storage/src/config.rs +++ b/libs/remote_storage/src/config.rs @@ -114,6 +114,16 @@ fn default_max_keys_per_list_response() -> Option { DEFAULT_MAX_KEYS_PER_LIST_RESPONSE } +fn default_azure_conn_pool_size() -> usize { + // Conservative default: no connection pooling. 
At time of writing this is the Azure + // SDK's default as well, due to historic reports of hard-to-reproduce issues + // (https://github.com/hyperium/hyper/issues/2312) + // + // However, using connection pooling is important to avoid exhausting client ports when + // doing huge numbers of requests (https://github.com/neondatabase/cloud/issues/20971) + 0 +} + impl Debug for S3Config { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("S3Config") @@ -146,6 +156,8 @@ pub struct AzureConfig { pub concurrency_limit: NonZeroUsize, #[serde(default = "default_max_keys_per_list_response")] pub max_keys_per_list_response: Option, + #[serde(default = "default_azure_conn_pool_size")] + pub conn_pool_size: usize, } fn default_remote_storage_azure_concurrency_limit() -> NonZeroUsize { @@ -302,6 +314,7 @@ timeout = '5s'"; container_region = 'westeurope' upload_storage_class = 'INTELLIGENT_TIERING' timeout = '7s' + conn_pool_size = 8 "; let config = parse(toml).unwrap(); @@ -316,6 +329,7 @@ timeout = '5s'"; prefix_in_container: None, concurrency_limit: default_remote_storage_azure_concurrency_limit(), max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, + conn_pool_size: 8, }), timeout: Duration::from_secs(7), small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT diff --git a/libs/remote_storage/tests/test_real_azure.rs b/libs/remote_storage/tests/test_real_azure.rs index 92d579fec866..15004dbf83f8 100644 --- a/libs/remote_storage/tests/test_real_azure.rs +++ b/libs/remote_storage/tests/test_real_azure.rs @@ -218,6 +218,7 @@ async fn create_azure_client( prefix_in_container: Some(format!("test_{millis}_{random:08x}/")), concurrency_limit: NonZeroUsize::new(100).unwrap(), max_keys_per_list_response, + conn_pool_size: 8, }), timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT, From 2dfd3cab8cd7840afee57acffaa45aa803d04f20 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Tue, 17 Dec 2024 17:14:07 +0100 Subject: [PATCH 27/56] fix(compute): Report compute_backpressure_throttling_seconds as counter (#10125) ## Problem It was reported as `gauge`, but it's actually a `counter`. Also add `_total` suffix as that's the convention for counters. 
The corresponding flux-fleet PR: https://github.com/neondatabase/flux-fleet/pull/386 --- compute/etc/neon_collector.jsonnet | 2 +- ...compute_backpressure_throttling_seconds_total.libsonnet} | 6 +++--- ...ql => compute_backpressure_throttling_seconds_total.sql} | 0 3 files changed, 4 insertions(+), 4 deletions(-) rename compute/etc/sql_exporter/{compute_backpressure_throttling_seconds.libsonnet => compute_backpressure_throttling_seconds_total.libsonnet} (61%) rename compute/etc/sql_exporter/{compute_backpressure_throttling_seconds.sql => compute_backpressure_throttling_seconds_total.sql} (100%) diff --git a/compute/etc/neon_collector.jsonnet b/compute/etc/neon_collector.jsonnet index aa6cc1cfc8a9..f8f4cab63ba2 100644 --- a/compute/etc/neon_collector.jsonnet +++ b/compute/etc/neon_collector.jsonnet @@ -3,7 +3,7 @@ metrics: [ import 'sql_exporter/checkpoints_req.libsonnet', import 'sql_exporter/checkpoints_timed.libsonnet', - import 'sql_exporter/compute_backpressure_throttling_seconds.libsonnet', + import 'sql_exporter/compute_backpressure_throttling_seconds_total.libsonnet', import 'sql_exporter/compute_current_lsn.libsonnet', import 'sql_exporter/compute_logical_snapshot_files.libsonnet', import 'sql_exporter/compute_logical_snapshots_bytes.libsonnet', diff --git a/compute/etc/sql_exporter/compute_backpressure_throttling_seconds.libsonnet b/compute/etc/sql_exporter/compute_backpressure_throttling_seconds_total.libsonnet similarity index 61% rename from compute/etc/sql_exporter/compute_backpressure_throttling_seconds.libsonnet rename to compute/etc/sql_exporter/compute_backpressure_throttling_seconds_total.libsonnet index 02c803cfa6e6..31725bd179af 100644 --- a/compute/etc/sql_exporter/compute_backpressure_throttling_seconds.libsonnet +++ b/compute/etc/sql_exporter/compute_backpressure_throttling_seconds_total.libsonnet @@ -1,10 +1,10 @@ { - metric_name: 'compute_backpressure_throttling_seconds', - type: 'gauge', + metric_name: 'compute_backpressure_throttling_seconds_total', + type: 'counter', help: 'Time compute has spent throttled', key_labels: null, values: [ 'throttled', ], - query: importstr 'sql_exporter/compute_backpressure_throttling_seconds.sql', + query: importstr 'sql_exporter/compute_backpressure_throttling_seconds_total.sql', } diff --git a/compute/etc/sql_exporter/compute_backpressure_throttling_seconds.sql b/compute/etc/sql_exporter/compute_backpressure_throttling_seconds_total.sql similarity index 100% rename from compute/etc/sql_exporter/compute_backpressure_throttling_seconds.sql rename to compute/etc/sql_exporter/compute_backpressure_throttling_seconds_total.sql From 007b13b79af0fa75151a8fba58a65d3c4933942f Mon Sep 17 00:00:00 2001 From: Mikhail Kot Date: Tue, 17 Dec 2024 16:43:54 +0000 Subject: [PATCH 28/56] Don't build tests in compute image, use ninja (#10149) Don't build tests in h3 and rdkit: ~15 min speedup. Use Ninja as cmake generator where possible: ~10 min speedup. 
Clean apt cache for smaller images: around 250mb size loss for intermediate layers --- compute/compute-node.Dockerfile | 78 ++++++++++++++++++++------------- 1 file changed, 48 insertions(+), 30 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 33d2a1028521..1e11efeaf88e 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -35,10 +35,12 @@ RUN case $DEBIAN_VERSION in \ ;; \ esac && \ apt update && \ - apt install --no-install-recommends -y git autoconf automake libtool build-essential bison flex libreadline-dev \ + apt install --no-install-recommends --no-install-suggests -y \ + ninja-build git autoconf automake libtool build-essential bison flex libreadline-dev \ zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget ca-certificates pkg-config libssl-dev \ libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd \ - $VERSION_INSTALLS + $VERSION_INSTALLS \ + && apt clean && rm -rf /var/lib/apt/lists/* ######################################################################################### # @@ -113,10 +115,12 @@ ARG DEBIAN_VERSION ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ - apt install --no-install-recommends -y gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \ + apt install --no-install-recommends --no-install-suggests -y \ + gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \ libboost-system-dev libboost-iostreams-dev libboost-program-options-dev libboost-timer-dev \ libcgal-dev libgdal-dev libgmp-dev libmpfr-dev libopenscenegraph-dev libprotobuf-c-dev \ - protobuf-c-compiler xsltproc + protobuf-c-compiler xsltproc \ + && apt clean && rm -rf /var/lib/apt/lists/* # Postgis 3.5.0 requires SFCGAL 1.4+ @@ -143,9 +147,9 @@ RUN case "${DEBIAN_VERSION}" in \ wget https://gitlab.com/sfcgal/SFCGAL/-/archive/v${SFCGAL_VERSION}/SFCGAL-v${SFCGAL_VERSION}.tar.gz -O SFCGAL.tar.gz && \ echo "${SFCGAL_CHECKSUM} SFCGAL.tar.gz" | sha256sum --check && \ mkdir sfcgal-src && cd sfcgal-src && tar xzf ../SFCGAL.tar.gz --strip-components=1 -C . && \ - cmake -DCMAKE_BUILD_TYPE=Release . && make -j $(getconf _NPROCESSORS_ONLN) && \ - DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \ - make clean && cp -R /sfcgal/* / + cmake -DCMAKE_BUILD_TYPE=Release -GNinja . && ninja -j $(getconf _NPROCESSORS_ONLN) && \ + DESTDIR=/sfcgal ninja install -j $(getconf _NPROCESSORS_ONLN) && \ + ninja clean && cp -R /sfcgal/* / ENV PATH="/usr/local/pgsql/bin:$PATH" @@ -213,9 +217,9 @@ RUN case "${PG_VERSION}" in \ echo "${PGROUTING_CHECKSUM} pgrouting.tar.gz" | sha256sum --check && \ mkdir pgrouting-src && cd pgrouting-src && tar xzf ../pgrouting.tar.gz --strip-components=1 -C . && \ mkdir build && cd build && \ - cmake -DCMAKE_BUILD_TYPE=Release .. && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ - make -j $(getconf _NPROCESSORS_ONLN) install && \ + cmake -GNinja -DCMAKE_BUILD_TYPE=Release .. 
&& \ + ninja -j $(getconf _NPROCESSORS_ONLN) && \ + ninja -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control && \ find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\ cp /usr/local/pgsql/share/extension/pgrouting.control /extensions/postgis && \ @@ -235,7 +239,9 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY compute/patches/plv8-3.1.10.patch /plv8-3.1.10.patch RUN apt update && \ - apt install --no-install-recommends -y ninja-build python3-dev libncurses5 binutils clang + apt install --no-install-recommends --no-install-suggests -y \ + ninja-build python3-dev libncurses5 binutils clang \ + && apt clean && rm -rf /var/lib/apt/lists/* # plv8 3.2.3 supports v17 # last release v3.2.3 - Sep 7, 2024 @@ -301,9 +307,10 @@ RUN mkdir -p /h3/usr/ && \ echo "ec99f1f5974846bde64f4513cf8d2ea1b8d172d2218ab41803bf6a63532272bc h3.tar.gz" | sha256sum --check && \ mkdir h3-src && cd h3-src && tar xzf ../h3.tar.gz --strip-components=1 -C . && \ mkdir build && cd build && \ - cmake .. -DCMAKE_BUILD_TYPE=Release && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ - DESTDIR=/h3 make install && \ + cmake .. -GNinja -DBUILD_BENCHMARKS=0 -DCMAKE_BUILD_TYPE=Release \ + -DBUILD_FUZZERS=0 -DBUILD_FILTERS=0 -DBUILD_GENERATORS=0 -DBUILD_TESTING=0 \ + && ninja -j $(getconf _NPROCESSORS_ONLN) && \ + DESTDIR=/h3 ninja install && \ cp -R /h3/usr / && \ rm -rf build @@ -650,14 +657,15 @@ FROM build-deps AS rdkit-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN apt-get update && \ - apt-get install --no-install-recommends -y \ +RUN apt update && \ + apt install --no-install-recommends --no-install-suggests -y \ libboost-iostreams1.74-dev \ libboost-regex1.74-dev \ libboost-serialization1.74-dev \ libboost-system1.74-dev \ libeigen3-dev \ - libboost-all-dev + libboost-all-dev \ + && apt clean && rm -rf /var/lib/apt/lists/* # rdkit Release_2024_09_1 supports v17 # last release Release_2024_09_1 - Sep 27, 2024 @@ -693,6 +701,8 @@ RUN case "${PG_VERSION}" in \ -D RDK_BUILD_MOLINTERCHANGE_SUPPORT=OFF \ -D RDK_BUILD_YAEHMOP_SUPPORT=OFF \ -D RDK_BUILD_STRUCTCHECKER_SUPPORT=OFF \ + -D RDK_TEST_MULTITHREADED=OFF \ + -D RDK_BUILD_CPP_TESTS=OFF \ -D RDK_USE_URF=OFF \ -D RDK_BUILD_PGSQL=ON \ -D RDK_PGSQL_STATIC=ON \ @@ -704,9 +714,10 @@ RUN case "${PG_VERSION}" in \ -D RDK_INSTALL_COMIC_FONTS=OFF \ -D RDK_BUILD_FREETYPE_SUPPORT=OFF \ -D CMAKE_BUILD_TYPE=Release \ + -GNinja \ . 
&& \ - make -j $(getconf _NPROCESSORS_ONLN) && \ - make -j $(getconf _NPROCESSORS_ONLN) install && \ + ninja -j $(getconf _NPROCESSORS_ONLN) && \ + ninja -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/rdkit.control ######################################################################################### @@ -849,8 +860,9 @@ FROM build-deps AS rust-extensions-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN apt-get update && \ - apt-get install --no-install-recommends -y curl libclang-dev && \ +RUN apt update && \ + apt install --no-install-recommends --no-install-suggests -y curl libclang-dev && \ + apt clean && rm -rf /var/lib/apt/lists/* && \ useradd -ms /bin/bash nonroot -b /home ENV HOME=/home/nonroot @@ -885,8 +897,9 @@ FROM build-deps AS rust-extensions-build-pgrx12 ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN apt-get update && \ - apt-get install --no-install-recommends -y curl libclang-dev && \ +RUN apt update && \ + apt install --no-install-recommends --no-install-suggests -y curl libclang-dev && \ + apt clean && rm -rf /var/lib/apt/lists/* && \ useradd -ms /bin/bash nonroot -b /home ENV HOME=/home/nonroot @@ -914,18 +927,22 @@ FROM rust-extensions-build-pgrx12 AS pg-onnx-build # cmake 3.26 or higher is required, so installing it using pip (bullseye-backports has cmake 3.25). # Install it using virtual environment, because Python 3.11 (the default version on Debian 12 (Bookworm)) complains otherwise -RUN apt-get update && apt-get install -y python3 python3-pip python3-venv && \ +RUN apt update && apt install --no-install-recommends --no-install-suggests -y \ + python3 python3-pip python3-venv && \ + apt clean && rm -rf /var/lib/apt/lists/* && \ python3 -m venv venv && \ . venv/bin/activate && \ python3 -m pip install cmake==3.30.5 && \ wget https://github.com/microsoft/onnxruntime/archive/refs/tags/v1.18.1.tar.gz -O onnxruntime.tar.gz && \ mkdir onnxruntime-src && cd onnxruntime-src && tar xzf ../onnxruntime.tar.gz --strip-components=1 -C . && \ - ./build.sh --config Release --parallel --skip_submodule_sync --skip_tests --allow_running_as_root + ./build.sh --config Release --parallel --cmake_generator Ninja \ + --skip_submodule_sync --skip_tests --allow_running_as_root FROM pg-onnx-build AS pgrag-pg-build -RUN apt-get install -y protobuf-compiler && \ +RUN apt update && apt install --no-install-recommends --no-install-suggests -y protobuf-compiler \ + && apt clean && rm -rf /var/lib/apt/lists/* && \ wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.0.0.tar.gz -O pgrag.tar.gz && \ echo "2cbe394c1e74fc8bcad9b52d5fbbfb783aef834ca3ce44626cfd770573700bb4 pgrag.tar.gz" | sha256sum --check && \ mkdir pgrag-src && cd pgrag-src && tar xzf ../pgrag.tar.gz --strip-components=1 -C . 
&& \ @@ -1279,8 +1296,8 @@ COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/fast_ FROM debian:$DEBIAN_FLAVOR AS pgbouncer RUN set -e \ - && apt-get update \ - && apt-get install --no-install-recommends -y \ + && apt update \ + && apt install --no-install-suggests --no-install-recommends -y \ build-essential \ git \ ca-certificates \ @@ -1288,7 +1305,8 @@ RUN set -e \ automake \ libevent-dev \ libtool \ - pkg-config + pkg-config \ + && apt clean && rm -rf /var/lib/apt/lists/* # Use `dist_man_MANS=` to skip manpage generation (which requires python3/pandoc) ENV PGBOUNCER_TAG=pgbouncer_1_22_1 @@ -1519,7 +1537,7 @@ RUN apt update && \ procps \ ca-certificates \ $VERSION_INSTALLS && \ - rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ + apt clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8 # s5cmd 2.2.2 from https://github.com/peak/s5cmd/releases/tag/v2.2.2 From a55853f67fad71c06fd0b856ada7cf8fea4306bd Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 17 Dec 2024 17:51:58 +0100 Subject: [PATCH 29/56] utils: symbolize heap profiles (#10153) ## Problem Jemalloc heap profiles aren't symbolized. This is inconvenient, and doesn't work with Grafana Cloud Profiles. Resolves #9964. ## Summary of changes Symbolize the heap profiles in-process, and strip unnecessary cruft. This uses about 100 MB additional memory to cache the DWARF information, but I believe this is already the case with CPU profiles, which use the same library for symbolization. With cached DWARF information, the symbolization CPU overhead is negligible. Example profiles: * [pageserver.pb.gz](https://github.com/user-attachments/files/18141395/pageserver.pb.gz) * [safekeeper.pb.gz](https://github.com/user-attachments/files/18141396/safekeeper.pb.gz) --- Cargo.lock | 105 +++++++++++------- Cargo.toml | 1 + libs/utils/Cargo.toml | 5 +- libs/utils/src/http/endpoint.rs | 46 ++++++-- libs/utils/src/lib.rs | 2 + libs/utils/src/pprof.rs | 190 ++++++++++++++++++++++++++++++++ 6 files changed, 298 insertions(+), 51 deletions(-) create mode 100644 libs/utils/src/pprof.rs diff --git a/Cargo.lock b/Cargo.lock index b9b89efa02ce..d9ac167042ad 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10,9 +10,9 @@ checksum = "8b5ace29ee3216de37c0546865ad08edef58b0f9e76838ed8959a84a990e58c5" [[package]] name = "addr2line" -version = "0.21.0" +version = "0.24.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a30b2e23b9e17a9f90641c7ab1549cd9b44f296d3ccbf309d2863cfe398a0cb" +checksum = "dfbe277e56a376000877090da837660b4427aad530e3028d44e0bffe4f89a1c1" dependencies = [ "gimli", ] @@ -23,6 +23,12 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" +[[package]] +name = "adler2" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" + [[package]] name = "ahash" version = "0.8.11" @@ -871,17 +877,17 @@ dependencies = [ [[package]] name = "backtrace" -version = "0.3.69" +version = "0.3.74" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2089b7e3f35b9dd2d0ed921ead4f6d318c27680d4a5bd167b3ee120edb105837" +checksum = "8d82cb332cdfaed17ae235a638438ac4d4839913cc2af585c3c6746e8f8bee1a" dependencies = [ "addr2line", - "cc", "cfg-if", "libc", - "miniz_oxide", + "miniz_oxide 0.8.0", 
"object", "rustc-demangle", + "windows-targets 0.52.6", ] [[package]] @@ -1127,7 +1133,7 @@ dependencies = [ "num-traits", "serde", "wasm-bindgen", - "windows-targets 0.52.4", + "windows-targets 0.52.6", ] [[package]] @@ -2107,7 +2113,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3b9429470923de8e8cbd4d2dc513535400b4b3fef0319fb5c4e1f520a7bef743" dependencies = [ "crc32fast", - "miniz_oxide", + "miniz_oxide 0.7.1", ] [[package]] @@ -2308,9 +2314,9 @@ dependencies = [ [[package]] name = "gimli" -version = "0.28.1" +version = "0.31.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253" +checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f" [[package]] name = "git-version" @@ -3404,6 +3410,15 @@ dependencies = [ "adler", ] +[[package]] +name = "miniz_oxide" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2d80299ef12ff69b16a84bb182e3b9df68b5a91574d3d4fa6e41b65deec4df1" +dependencies = [ + "adler2", +] + [[package]] name = "mio" version = "0.8.11" @@ -3638,9 +3653,9 @@ dependencies = [ [[package]] name = "object" -version = "0.32.2" +version = "0.36.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441" +checksum = "aedf0a2d09c573ed1d8d85b30c119153926a2b36dce0ab28322c09a117a4683e" dependencies = [ "memchr", ] @@ -5323,9 +5338,9 @@ dependencies = [ [[package]] name = "rustc-demangle" -version = "0.1.23" +version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" +checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" [[package]] name = "rustc-hash" @@ -7219,6 +7234,7 @@ dependencies = [ "anyhow", "arc-swap", "async-compression", + "backtrace", "bincode", "byteorder", "bytes", @@ -7229,12 +7245,14 @@ dependencies = [ "criterion", "diatomic-waker", "fail", + "flate2", "futures", "git-version", "hex", "hex-literal", "humantime", "hyper 0.14.30", + "itertools 0.10.5", "jemalloc_pprof", "jsonwebtoken", "metrics", @@ -7591,7 +7609,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e48a53791691ab099e5e2ad123536d0fff50652600abaf43bbf952894110d0be" dependencies = [ "windows-core", - "windows-targets 0.52.4", + "windows-targets 0.52.6", ] [[package]] @@ -7600,7 +7618,7 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" dependencies = [ - "windows-targets 0.52.4", + "windows-targets 0.52.6", ] [[package]] @@ -7618,7 +7636,7 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets 0.52.4", + "windows-targets 0.52.6", ] [[package]] @@ -7638,17 +7656,18 @@ dependencies = [ [[package]] name = "windows-targets" -version = "0.52.4" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" dependencies = [ - "windows_aarch64_gnullvm 0.52.4", - "windows_aarch64_msvc 0.52.4", - "windows_i686_gnu 0.52.4", - "windows_i686_msvc 0.52.4", - 
"windows_x86_64_gnu 0.52.4", - "windows_x86_64_gnullvm 0.52.4", - "windows_x86_64_msvc 0.52.4", + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", ] [[package]] @@ -7659,9 +7678,9 @@ checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" [[package]] name = "windows_aarch64_gnullvm" -version = "0.52.4" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" [[package]] name = "windows_aarch64_msvc" @@ -7671,9 +7690,9 @@ checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" [[package]] name = "windows_aarch64_msvc" -version = "0.52.4" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" [[package]] name = "windows_i686_gnu" @@ -7683,9 +7702,15 @@ checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" [[package]] name = "windows_i686_gnu" -version = "0.52.4" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" [[package]] name = "windows_i686_msvc" @@ -7695,9 +7720,9 @@ checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" [[package]] name = "windows_i686_msvc" -version = "0.52.4" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" [[package]] name = "windows_x86_64_gnu" @@ -7707,9 +7732,9 @@ checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" [[package]] name = "windows_x86_64_gnu" -version = "0.52.4" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" [[package]] name = "windows_x86_64_gnullvm" @@ -7719,9 +7744,9 @@ checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" [[package]] name = "windows_x86_64_gnullvm" -version = "0.52.4" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" [[package]] name = "windows_x86_64_msvc" @@ -7731,9 +7756,9 @@ checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" [[package]] name = "windows_x86_64_msvc" -version = "0.52.4" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] name = "winnow" diff --git a/Cargo.toml b/Cargo.toml index 056cd5798f84..885f02ba8190 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -52,6 +52,7 @@ anyhow = { version = "1.0", features = ["backtrace"] } arc-swap = "1.6" async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] } atomic-take = "1.1.0" +backtrace = "0.3.74" flate2 = "1.0.26" async-stream = "0.3" async-trait = "0.1" diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 66500fb141bc..02bf77760a8e 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -15,17 +15,20 @@ arc-swap.workspace = true sentry.workspace = true async-compression.workspace = true anyhow.workspace = true +backtrace.workspace = true bincode.workspace = true bytes.workspace = true camino.workspace = true chrono.workspace = true diatomic-waker.workspace = true +flate2.workspace = true git-version.workspace = true hex = { workspace = true, features = ["serde"] } humantime.workspace = true hyper0 = { workspace = true, features = ["full"] } +itertools.workspace = true fail.workspace = true -futures = { workspace = true} +futures = { workspace = true } jemalloc_pprof.workspace = true jsonwebtoken.workspace = true nix.workspace = true diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index d975b63677ac..9b37b699398e 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -1,15 +1,22 @@ use crate::auth::{AuthError, Claims, SwappableJwtAuth}; use crate::http::error::{api_error_handler, route_error_handler, ApiError}; use crate::http::request::{get_query_param, parse_query_param}; +use crate::pprof; +use ::pprof::protos::Message as _; +use ::pprof::ProfilerGuardBuilder; use anyhow::{anyhow, Context}; +use bytes::{Bytes, BytesMut}; use hyper::header::{HeaderName, AUTHORIZATION, CONTENT_DISPOSITION}; use hyper::http::HeaderValue; use hyper::Method; use hyper::{header::CONTENT_TYPE, Body, Request, Response}; use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder}; use once_cell::sync::Lazy; +use regex::Regex; use routerify::ext::RequestExt; use routerify::{Middleware, RequestInfo, Router, RouterBuilder}; +use tokio::sync::{mpsc, Mutex}; +use tokio_stream::wrappers::ReceiverStream; use tokio_util::io::ReaderStream; use tracing::{debug, info, info_span, warn, Instrument}; @@ -18,11 +25,6 @@ use std::io::Write as _; use std::str::FromStr; use std::time::Duration; -use bytes::{Bytes, BytesMut}; -use pprof::protos::Message as _; -use tokio::sync::{mpsc, Mutex}; -use tokio_stream::wrappers::ReceiverStream; - static SERVE_METRICS_COUNT: Lazy = Lazy::new(|| { register_int_counter!( "libmetrics_metric_handler_requests_total", @@ -365,7 +367,7 @@ pub async fn profile_cpu_handler(req: Request) -> Result, A // Take the profile. let report = tokio::task::spawn_blocking(move || { - let guard = pprof::ProfilerGuardBuilder::default() + let guard = ProfilerGuardBuilder::default() .frequency(frequency_hz) .blocklist(&["libc", "libgcc", "pthread", "vdso"]) .build()?; @@ -457,10 +459,34 @@ pub async fn profile_heap_handler(req: Request) -> Result, } Format::Pprof => { - let data = tokio::task::spawn_blocking(move || prof_ctl.dump_pprof()) - .await - .map_err(|join_err| ApiError::InternalServerError(join_err.into()))? 
- .map_err(ApiError::InternalServerError)?; + let data = tokio::task::spawn_blocking(move || { + let bytes = prof_ctl.dump_pprof()?; + // Symbolize the profile. + // TODO: consider moving this upstream to jemalloc_pprof and avoiding the + // serialization roundtrip. + static STRIP_FUNCTIONS: Lazy> = Lazy::new(|| { + // Functions to strip from profiles. If true, also remove child frames. + vec![ + (Regex::new("^__rust").unwrap(), false), + (Regex::new("^_start$").unwrap(), false), + (Regex::new("^irallocx_prof").unwrap(), true), + (Regex::new("^prof_alloc_prep").unwrap(), true), + (Regex::new("^std::rt::lang_start").unwrap(), false), + (Regex::new("^std::sys::backtrace::__rust").unwrap(), false), + ] + }); + let profile = pprof::decode(&bytes)?; + let profile = pprof::symbolize(profile)?; + let profile = pprof::strip_locations( + profile, + &["libc", "libgcc", "pthread", "vdso"], + &STRIP_FUNCTIONS, + ); + pprof::encode(&profile) + }) + .await + .map_err(|join_err| ApiError::InternalServerError(join_err.into()))? + .map_err(ApiError::InternalServerError)?; Response::builder() .status(200) .header(CONTENT_TYPE, "application/octet-stream") diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index bccd0e048814..2c56dd750f75 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -96,6 +96,8 @@ pub mod circuit_breaker; pub mod try_rcu; +pub mod pprof; + // Re-export used in macro. Avoids adding git-version as dep in target crates. #[doc(hidden)] pub use git_version; diff --git a/libs/utils/src/pprof.rs b/libs/utils/src/pprof.rs new file mode 100644 index 000000000000..90910897bf17 --- /dev/null +++ b/libs/utils/src/pprof.rs @@ -0,0 +1,190 @@ +use flate2::write::{GzDecoder, GzEncoder}; +use flate2::Compression; +use itertools::Itertools as _; +use once_cell::sync::Lazy; +use pprof::protos::{Function, Line, Message as _, Profile}; +use regex::Regex; + +use std::borrow::Cow; +use std::collections::{HashMap, HashSet}; +use std::ffi::c_void; +use std::io::Write as _; + +/// Decodes a gzip-compressed Protobuf-encoded pprof profile. +pub fn decode(bytes: &[u8]) -> anyhow::Result { + let mut gz = GzDecoder::new(Vec::new()); + gz.write_all(bytes)?; + Ok(Profile::parse_from_bytes(&gz.finish()?)?) +} + +/// Encodes a pprof profile as gzip-compressed Protobuf. +pub fn encode(profile: &Profile) -> anyhow::Result> { + let mut gz = GzEncoder::new(Vec::new(), Compression::default()); + profile.write_to_writer(&mut gz)?; + Ok(gz.finish()?) +} + +/// Symbolizes a pprof profile using the current binary. +pub fn symbolize(mut profile: Profile) -> anyhow::Result { + if !profile.function.is_empty() { + return Ok(profile); // already symbolized + } + + // Collect function names. + let mut functions: HashMap = HashMap::new(); + let mut strings: HashMap = profile + .string_table + .into_iter() + .enumerate() + .map(|(i, s)| (s, i as i64)) + .collect(); + + // Helper to look up or register a string. + let mut string_id = |s: &str| -> i64 { + // Don't use .entry() to avoid unnecessary allocations. + if let Some(id) = strings.get(s) { + return *id; + } + let id = strings.len() as i64; + strings.insert(s.to_string(), id); + id + }; + + for loc in &mut profile.location { + if !loc.line.is_empty() { + continue; + } + + // Resolve the line and function for each location. + backtrace::resolve(loc.address as *mut c_void, |symbol| { + let Some(symname) = symbol.name() else { + return; + }; + let mut name = symname.to_string(); + + // Strip the Rust monomorphization suffix from the symbol name. 
+ static SUFFIX_REGEX: Lazy = + Lazy::new(|| Regex::new("::h[0-9a-f]{16}$").expect("invalid regex")); + if let Some(m) = SUFFIX_REGEX.find(&name) { + name.truncate(m.start()); + } + + let function_id = match functions.get(&name) { + Some(function) => function.id, + None => { + let id = functions.len() as u64 + 1; + let system_name = String::from_utf8_lossy(symname.as_bytes()); + let filename = symbol + .filename() + .map(|path| path.to_string_lossy()) + .unwrap_or(Cow::Borrowed("")); + let function = Function { + id, + name: string_id(&name), + system_name: string_id(&system_name), + filename: string_id(&filename), + ..Default::default() + }; + functions.insert(name, function); + id + } + }; + loc.line.push(Line { + function_id, + line: symbol.lineno().unwrap_or(0) as i64, + ..Default::default() + }); + }); + } + + // Store the resolved functions, and mark the mapping as resolved. + profile.function = functions.into_values().sorted_by_key(|f| f.id).collect(); + profile.string_table = strings + .into_iter() + .sorted_by_key(|(_, i)| *i) + .map(|(s, _)| s) + .collect(); + + for mapping in &mut profile.mapping { + mapping.has_functions = true; + mapping.has_filenames = true; + } + + Ok(profile) +} + +/// Strips locations (stack frames) matching the given mappings (substring) or function names +/// (regex). The function bool specifies whether child frames should be stripped as well. +/// +/// The string definitions are left behind in the profile for simplicity, to avoid rewriting all +/// string references. +pub fn strip_locations( + mut profile: Profile, + mappings: &[&str], + functions: &[(Regex, bool)], +) -> Profile { + // Strip mappings. + let mut strip_mappings: HashSet = HashSet::new(); + + profile.mapping.retain(|mapping| { + let Some(name) = profile.string_table.get(mapping.filename as usize) else { + return true; + }; + if mappings.iter().any(|substr| name.contains(substr)) { + strip_mappings.insert(mapping.id); + return false; + } + true + }); + + // Strip functions. + let mut strip_functions: HashMap = HashMap::new(); + + profile.function.retain(|function| { + let Some(name) = profile.string_table.get(function.name as usize) else { + return true; + }; + for (regex, strip_children) in functions { + if regex.is_match(name) { + strip_functions.insert(function.id, *strip_children); + return false; + } + } + true + }); + + // Strip locations. The bool specifies whether child frames should be stripped too. + let mut strip_locations: HashMap = HashMap::new(); + + profile.location.retain(|location| { + for line in &location.line { + if let Some(strip_children) = strip_functions.get(&line.function_id) { + strip_locations.insert(location.id, *strip_children); + return false; + } + } + if strip_mappings.contains(&location.mapping_id) { + strip_locations.insert(location.id, false); + return false; + } + true + }); + + // Strip sample locations. + for sample in &mut profile.sample { + // First, find the uppermost function with child removal and truncate the stack. + if let Some(truncate) = sample + .location_id + .iter() + .rposition(|id| strip_locations.get(id) == Some(&true)) + { + sample.location_id.drain(..=truncate); + } + // Next, strip any individual frames without child removal. 
+ sample + .location_id + .retain(|id| !strip_locations.contains_key(id)); + } + + profile +} From 7dddbb9570ca52d174b7d53ca2495fb272796a7f Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 17 Dec 2024 12:36:55 -0600 Subject: [PATCH 30/56] Add pg_repack extension (#10100) Our solutions engineers and some customers would like to have this extension available. Link: https://github.com/neondatabase/cloud/issues/18890 Signed-off-by: Tristan Partin --- compute/compute-node.Dockerfile | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 1e11efeaf88e..9f1f3b734363 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1185,6 +1185,25 @@ RUN case "${PG_VERSION}" in \ make BUILD_TYPE=release -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_mooncake.control +######################################################################################### +# +# Layer "pg_repack" +# compile pg_repack extension +# +######################################################################################### + +FROM build-deps AS pg-repack-build +ARG PG_VERSION +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +ENV PATH="/usr/local/pgsql/bin/:$PATH" + +RUN wget https://github.com/reorg/pg_repack/archive/refs/tags/ver_1.5.2.tar.gz -O pg_repack.tar.gz && \ + echo '4516cad42251ed3ad53ff619733004db47d5755acac83f75924cd94d1c4fb681 pg_repack.tar.gz' | sha256sum --check && \ + mkdir pg_repack-src && cd pg_repack-src && tar xzf ../pg_repack.tar.gz --strip-components=1 -C . && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install + ######################################################################################### # # Layer "neon-pg-ext-build" @@ -1230,6 +1249,7 @@ COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-ivm-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-partman-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-mooncake-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg-repack-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ RUN make -j $(getconf _NPROCESSORS_ONLN) \ From 93e958341fc47317a0b430b40ae4906da7294d2f Mon Sep 17 00:00:00 2001 From: Ivan Efremov Date: Tue, 17 Dec 2024 21:26:54 +0200 Subject: [PATCH 31/56] [proxy]: Use TLS for cancellation queries (#10152) ## Problem pg_sni_router assumes that all the streams are upgradable to TLS. Cancellation requests were declined because of using NoTls config. ## Summary of changes Provide TLS client config for cancellation requests. Fixes [#21789](https://github.com/orgs/neondatabase/projects/65/views/1?pane=issue&itemId=90911361&issue=neondatabase%7Ccloud%7C21789) --- proxy/src/bin/pg_sni_router.rs | 2 +- proxy/src/cancellation.rs | 61 ++++++++++++++++++++++++++++++++-- proxy/src/compute.rs | 6 ++-- 3 files changed, 63 insertions(+), 6 deletions(-) diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index 623a0fd3b2c9..9538384b9eca 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -229,7 +229,7 @@ async fn ssl_handshake( let (raw, read_buf) = stream.into_inner(); // TODO: Normally, client doesn't send any data before - // server says TLS handshake is ok and read_buf is empy. + // server says TLS handshake is ok and read_buf is empty. 
// However, you could imagine pipelining of postgres // SSLRequest + TLS ClientHello in one hunk similar to // pipelining in our node js driver. We should probably diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index dd3edd6abc8f..a58e3961da86 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -3,8 +3,10 @@ use std::sync::Arc; use dashmap::DashMap; use ipnet::{IpNet, Ipv4Net, Ipv6Net}; -use postgres_client::{CancelToken, NoTls}; +use once_cell::sync::OnceCell; +use postgres_client::{tls::MakeTlsConnect, CancelToken}; use pq_proto::CancelKeyData; +use rustls::crypto::ring; use thiserror::Error; use tokio::net::TcpStream; use tokio::sync::Mutex; @@ -20,6 +22,9 @@ use crate::redis::cancellation_publisher::{ CancellationPublisher, CancellationPublisherMut, RedisPublisherClient, }; +use crate::compute::{load_certs, AcceptEverythingVerifier}; +use crate::postgres_rustls::MakeRustlsConnect; + pub type CancelMap = Arc>>; pub type CancellationHandlerMain = CancellationHandler>>>; pub(crate) type CancellationHandlerMainInternal = Option>>; @@ -174,7 +179,10 @@ impl CancellationHandler

{ source: self.from, kind: crate::metrics::CancellationOutcome::Found, }); - info!("cancelling query per user's request using key {key}"); + info!( + "cancelling query per user's request using key {key}, hostname {}, address: {}", + cancel_closure.hostname, cancel_closure.socket_addr + ); cancel_closure.try_cancel_query().await } @@ -221,6 +229,8 @@ impl CancellationHandler>>> { } } +static TLS_ROOTS: OnceCell> = OnceCell::new(); + /// This should've been a [`std::future::Future`], but /// it's impossible to name a type of an unboxed future /// (we'd need something like `#![feature(type_alias_impl_trait)]`). @@ -229,6 +239,8 @@ pub struct CancelClosure { socket_addr: SocketAddr, cancel_token: CancelToken, ip_allowlist: Vec, + hostname: String, // for pg_sni router + allow_self_signed_compute: bool, } impl CancelClosure { @@ -236,17 +248,60 @@ impl CancelClosure { socket_addr: SocketAddr, cancel_token: CancelToken, ip_allowlist: Vec, + hostname: String, + allow_self_signed_compute: bool, ) -> Self { Self { socket_addr, cancel_token, ip_allowlist, + hostname, + allow_self_signed_compute, } } /// Cancels the query running on user's compute node. pub(crate) async fn try_cancel_query(self) -> Result<(), CancelError> { let socket = TcpStream::connect(self.socket_addr).await?; - self.cancel_token.cancel_query_raw(socket, NoTls).await?; + + let client_config = if self.allow_self_signed_compute { + // Allow all certificates for creating the connection. Used only for tests + let verifier = Arc::new(AcceptEverythingVerifier); + rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider())) + .with_safe_default_protocol_versions() + .expect("ring should support the default protocol versions") + .dangerous() + .with_custom_certificate_verifier(verifier) + } else { + let root_store = TLS_ROOTS + .get_or_try_init(load_certs) + .map_err(|_e| { + CancelError::IO(std::io::Error::new( + std::io::ErrorKind::Other, + "TLS root store initialization failed".to_string(), + )) + })? 
+ .clone(); + rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider())) + .with_safe_default_protocol_versions() + .expect("ring should support the default protocol versions") + .with_root_certificates(root_store) + }; + + let client_config = client_config.with_no_client_auth(); + + let mut mk_tls = crate::postgres_rustls::MakeRustlsConnect::new(client_config); + let tls = >::make_tls_connect( + &mut mk_tls, + &self.hostname, + ) + .map_err(|e| { + CancelError::IO(std::io::Error::new( + std::io::ErrorKind::Other, + e.to_string(), + )) + })?; + + self.cancel_token.cancel_query_raw(socket, tls).await?; debug!("query was cancelled"); Ok(()) } diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 4113b5bb80e3..42df5ff5e3e5 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -319,6 +319,8 @@ impl ConnCfg { secret_key, }, vec![], + host.to_string(), + allow_self_signed_compute, ); let connection = PostgresConnection { @@ -350,7 +352,7 @@ fn filtered_options(options: &str) -> Option { Some(options) } -fn load_certs() -> Result, Vec> { +pub(crate) fn load_certs() -> Result, Vec> { let der_certs = rustls_native_certs::load_native_certs(); if !der_certs.errors.is_empty() { @@ -364,7 +366,7 @@ fn load_certs() -> Result, Vec> = OnceCell::new(); #[derive(Debug)] -struct AcceptEverythingVerifier; +pub(crate) struct AcceptEverythingVerifier; impl ServerCertVerifier for AcceptEverythingVerifier { fn supported_verify_schemes(&self) -> Vec { use rustls::SignatureScheme; From fd230227f2fd370821610f91601c9b02dda23e0d Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 17 Dec 2024 20:04:09 +0000 Subject: [PATCH 32/56] storcon: include preferred AZ in compute notifications (#9953) ## Problem It is unreliable for the control plane to infer the AZ for computes from where the tenant is currently attached, because if a tenant happens to be in a degraded state or a release is ongoing while a compute starts, then the tenant's attached AZ can be a different one to where it will run long-term, and the control plane doesn't check back later to restart the compute. 
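The fix is to carry the preferred AZ in the compute notification itself, so the control plane no longer has to infer it from wherever the tenant happens to be attached. As a sketch (not captured traffic; the tenant id, node id and AZ values below are made up), the notification body for an unsharded tenant then looks roughly like this, mirroring the request struct and the regression-test expectations in this change:

```
// Sketch built with serde_json; all concrete values are illustrative.
use serde_json::json;

fn example_notify_body() -> serde_json::Value {
    json!({
        "tenant_id": "3aa8fcc61f6d357410b7de754b1d9001", // illustrative tenant id
        "stripe_size": null,                             // null for an unsharded tenant
        "shards": [
            { "node_id": 1, "shard_number": 0 }          // currently attached pageserver
        ],
        "preferred_az": "us-east-2a"                     // new field added by this change
    })
}
```

For a sharded tenant the `shards` array carries one entry per shard and `stripe_size` is set; `preferred_az` is per tenant, so it is the same value regardless of shard count.
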
This can land in parallel with https://github.com/neondatabase/neon/pull/9947 ## Summary of changes - Thread through the preferred AZ into the compute hook code via the reconciler - Include the preferred AZ in the body of compute hook notifications --- storage_controller/src/compute_hook.rs | 138 +++++++++++++----- storage_controller/src/reconciler.rs | 15 +- storage_controller/src/service.rs | 25 +++- storage_controller/src/tenant_shard.rs | 1 + test_runner/fixtures/neon_fixtures.py | 5 +- test_runner/regress/test_sharding.py | 3 + .../regress/test_storage_controller.py | 5 + 7 files changed, 139 insertions(+), 53 deletions(-) diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index 2b2ece3f0271..69db48f8d18c 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -1,3 +1,4 @@ +use std::borrow::Cow; use std::error::Error as _; use std::sync::Arc; use std::{collections::HashMap, time::Duration}; @@ -6,6 +7,7 @@ use control_plane::endpoint::{ComputeControlPlane, EndpointStatus}; use control_plane::local_env::LocalEnv; use futures::StreamExt; use hyper::StatusCode; +use pageserver_api::controller_api::AvailabilityZone; use pageserver_api::shard::{ShardCount, ShardNumber, ShardStripeSize, TenantShardId}; use postgres_connection::parse_host_port; use serde::{Deserialize, Serialize}; @@ -28,6 +30,9 @@ struct UnshardedComputeHookTenant { // Which node is this tenant attached to node_id: NodeId, + // The tenant's preferred AZ, so that we may pass this on to the control plane + preferred_az: Option, + // Must hold this lock to send a notification. send_lock: Arc>>, } @@ -36,6 +41,9 @@ struct ShardedComputeHookTenant { shard_count: ShardCount, shards: Vec<(ShardNumber, NodeId)>, + // The tenant's preferred AZ, so that we may pass this on to the control plane + preferred_az: Option, + // Must hold this lock to send a notification. The contents represent // the last successfully sent notification, and are used to coalesce multiple // updates by only sending when there is a chance since our last successful send. @@ -64,17 +72,24 @@ enum ComputeHookTenant { impl ComputeHookTenant { /// Construct with at least one shard's information - fn new(tenant_shard_id: TenantShardId, stripe_size: ShardStripeSize, node_id: NodeId) -> Self { + fn new( + tenant_shard_id: TenantShardId, + stripe_size: ShardStripeSize, + preferred_az: Option, + node_id: NodeId, + ) -> Self { if tenant_shard_id.shard_count.count() > 1 { Self::Sharded(ShardedComputeHookTenant { shards: vec![(tenant_shard_id.shard_number, node_id)], stripe_size, shard_count: tenant_shard_id.shard_count, + preferred_az, send_lock: Arc::default(), }) } else { Self::Unsharded(UnshardedComputeHookTenant { node_id, + preferred_az, send_lock: Arc::default(), }) } @@ -120,15 +135,20 @@ impl ComputeHookTenant { /// Set one shard's location. If stripe size or shard count have changed, Self is reset /// and drops existing content. 
- fn update( - &mut self, - tenant_shard_id: TenantShardId, - stripe_size: ShardStripeSize, - node_id: NodeId, - ) { + fn update(&mut self, shard_update: ShardUpdate) { + let tenant_shard_id = shard_update.tenant_shard_id; + let node_id = shard_update.node_id; + let stripe_size = shard_update.stripe_size; + let preferred_az = shard_update.preferred_az; + match self { Self::Unsharded(unsharded_tenant) if tenant_shard_id.shard_count.count() == 1 => { - unsharded_tenant.node_id = node_id + unsharded_tenant.node_id = node_id; + if unsharded_tenant.preferred_az.as_ref() + != preferred_az.as_ref().map(|az| az.as_ref()) + { + unsharded_tenant.preferred_az = preferred_az.map(|az| az.as_ref().clone()); + } } Self::Sharded(sharded_tenant) if sharded_tenant.stripe_size == stripe_size @@ -146,10 +166,21 @@ impl ComputeHookTenant { .push((tenant_shard_id.shard_number, node_id)); sharded_tenant.shards.sort_by_key(|s| s.0) } + + if sharded_tenant.preferred_az.as_ref() + != preferred_az.as_ref().map(|az| az.as_ref()) + { + sharded_tenant.preferred_az = preferred_az.map(|az| az.as_ref().clone()); + } } _ => { // Shard count changed: reset struct. - *self = Self::new(tenant_shard_id, stripe_size, node_id); + *self = Self::new( + tenant_shard_id, + stripe_size, + preferred_az.map(|az| az.into_owned()), + node_id, + ); } } } @@ -165,6 +196,7 @@ struct ComputeHookNotifyRequestShard { #[derive(Serialize, Deserialize, Debug, Eq, PartialEq)] struct ComputeHookNotifyRequest { tenant_id: TenantId, + preferred_az: Option, stripe_size: Option, shards: Vec, } @@ -238,6 +270,10 @@ impl ComputeHookTenant { node_id: unsharded_tenant.node_id, }], stripe_size: None, + preferred_az: unsharded_tenant + .preferred_az + .as_ref() + .map(|az| az.0.clone()), }), Self::Sharded(sharded_tenant) if sharded_tenant.shards.len() == sharded_tenant.shard_count.count() as usize => @@ -253,6 +289,7 @@ impl ComputeHookTenant { }) .collect(), stripe_size: Some(sharded_tenant.stripe_size), + preferred_az: sharded_tenant.preferred_az.as_ref().map(|az| az.0.clone()), }) } Self::Sharded(sharded_tenant) => { @@ -313,6 +350,17 @@ pub(super) struct ComputeHook { client: reqwest::Client, } +/// Callers may give us a list of these when asking us to send a bulk batch +/// of notifications in the background. This is a 'notification' in the sense of +/// other code notifying us of a shard's status, rather than being the final notification +/// that we send upwards to the control plane for the whole tenant. 
+pub(crate) struct ShardUpdate<'a> { + pub(crate) tenant_shard_id: TenantShardId, + pub(crate) node_id: NodeId, + pub(crate) stripe_size: ShardStripeSize, + pub(crate) preferred_az: Option>, +} + impl ComputeHook { pub(super) fn new(config: Config) -> Self { let authorization_header = config @@ -363,6 +411,7 @@ impl ComputeHook { tenant_id, shards, stripe_size, + preferred_az: _preferred_az, } = reconfigure_request; let compute_pageservers = shards @@ -503,24 +552,30 @@ impl ComputeHook { } /// Synchronous phase: update the per-tenant state for the next intended notification - fn notify_prepare( - &self, - tenant_shard_id: TenantShardId, - node_id: NodeId, - stripe_size: ShardStripeSize, - ) -> MaybeSendResult { + fn notify_prepare(&self, shard_update: ShardUpdate) -> MaybeSendResult { let mut state_locked = self.state.lock().unwrap(); use std::collections::hash_map::Entry; + let tenant_shard_id = shard_update.tenant_shard_id; + let tenant = match state_locked.entry(tenant_shard_id.tenant_id) { - Entry::Vacant(e) => e.insert(ComputeHookTenant::new( - tenant_shard_id, - stripe_size, - node_id, - )), + Entry::Vacant(e) => { + let ShardUpdate { + tenant_shard_id, + node_id, + stripe_size, + preferred_az, + } = shard_update; + e.insert(ComputeHookTenant::new( + tenant_shard_id, + stripe_size, + preferred_az.map(|az| az.into_owned()), + node_id, + )) + } Entry::Occupied(e) => { let tenant = e.into_mut(); - tenant.update(tenant_shard_id, stripe_size, node_id); + tenant.update(shard_update); tenant } }; @@ -608,13 +663,14 @@ impl ComputeHook { /// if something failed. pub(super) fn notify_background( self: &Arc, - notifications: Vec<(TenantShardId, NodeId, ShardStripeSize)>, + notifications: Vec, result_tx: tokio::sync::mpsc::Sender>, cancel: &CancellationToken, ) { let mut maybe_sends = Vec::new(); - for (tenant_shard_id, node_id, stripe_size) in notifications { - let maybe_send_result = self.notify_prepare(tenant_shard_id, node_id, stripe_size); + for shard_update in notifications { + let tenant_shard_id = shard_update.tenant_shard_id; + let maybe_send_result = self.notify_prepare(shard_update); maybe_sends.push((tenant_shard_id, maybe_send_result)) } @@ -678,15 +734,14 @@ impl ComputeHook { /// periods, but we don't retry forever. The **caller** is responsible for handling failures and /// ensuring that they eventually call again to ensure that the compute is eventually notified of /// the proper pageserver nodes for a tenant. - #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), node_id))] - pub(super) async fn notify( + #[tracing::instrument(skip_all, fields(tenant_id=%shard_update.tenant_shard_id.tenant_id, shard_id=%shard_update.tenant_shard_id.shard_slug(), node_id))] + pub(super) async fn notify<'a>( &self, - tenant_shard_id: TenantShardId, - node_id: NodeId, - stripe_size: ShardStripeSize, + shard_update: ShardUpdate<'a>, cancel: &CancellationToken, ) -> Result<(), NotifyError> { - let maybe_send_result = self.notify_prepare(tenant_shard_id, node_id, stripe_size); + let tenant_shard_id = shard_update.tenant_shard_id; + let maybe_send_result = self.notify_prepare(shard_update); self.notify_execute(maybe_send_result, tenant_shard_id, cancel) .await } @@ -739,6 +794,7 @@ pub(crate) mod tests { shard_number: ShardNumber(0), }, ShardStripeSize(12345), + None, NodeId(1), ); @@ -765,30 +821,32 @@ pub(crate) mod tests { // Writing the first shard of a multi-sharded situation (i.e. 
in a split) // resets the tenant state and puts it in an non-notifying state (need to // see all shards) - tenant_state.update( - TenantShardId { + tenant_state.update(ShardUpdate { + tenant_shard_id: TenantShardId { tenant_id, shard_count: ShardCount::new(2), shard_number: ShardNumber(1), }, - ShardStripeSize(32768), - NodeId(1), - ); + stripe_size: ShardStripeSize(32768), + preferred_az: None, + node_id: NodeId(1), + }); assert!(matches!( tenant_state.maybe_send(tenant_id, None), MaybeSendResult::Noop )); // Writing the second shard makes it ready to notify - tenant_state.update( - TenantShardId { + tenant_state.update(ShardUpdate { + tenant_shard_id: TenantShardId { tenant_id, shard_count: ShardCount::new(2), shard_number: ShardNumber(0), }, - ShardStripeSize(32768), - NodeId(1), - ); + stripe_size: ShardStripeSize(32768), + preferred_az: None, + node_id: NodeId(1), + }); let send_result = tenant_state.maybe_send(tenant_id, None); let MaybeSendResult::Transmit((request, mut guard)) = send_result else { diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 3ad386a95b57..475f91eff48d 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -1,13 +1,14 @@ use crate::pageserver_client::PageserverClient; use crate::persistence::Persistence; -use crate::service; -use pageserver_api::controller_api::PlacementPolicy; +use crate::{compute_hook, service}; +use pageserver_api::controller_api::{AvailabilityZone, PlacementPolicy}; use pageserver_api::models::{ LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig, }; use pageserver_api::shard::{ShardIdentity, TenantShardId}; use pageserver_client::mgmt_api; use reqwest::StatusCode; +use std::borrow::Cow; use std::collections::HashMap; use std::sync::Arc; use std::time::{Duration, Instant}; @@ -45,6 +46,7 @@ pub(super) struct Reconciler { pub(crate) reconciler_config: ReconcilerConfig, pub(crate) config: TenantConfig, + pub(crate) preferred_az: Option, /// Observed state from the point of view of the reconciler. /// This gets updated as the reconciliation makes progress. @@ -834,9 +836,12 @@ impl Reconciler { let result = self .compute_hook .notify( - self.tenant_shard_id, - node.get_id(), - self.shard.stripe_size, + compute_hook::ShardUpdate { + tenant_shard_id: self.tenant_shard_id, + node_id: node.get_id(), + stripe_size: self.shard.stripe_size, + preferred_az: self.preferred_az.as_ref().map(Cow::Borrowed), + }, &self.cancel, ) .await; diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index a89e4741f6ae..42b50835f808 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -18,7 +18,7 @@ use crate::{ background_node_operations::{ Drain, Fill, Operation, OperationError, OperationHandler, MAX_RECONCILES_PER_OPERATION, }, - compute_hook::NotifyError, + compute_hook::{self, NotifyError}, drain_utils::{self, TenantShardDrain, TenantShardIterator}, id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, TracingExclusiveGuard}, leadership::Leadership, @@ -656,11 +656,14 @@ impl Service { // emit a compute notification for this. In the case where our observed state does not // yet match our intent, we will eventually reconcile, and that will emit a compute notification. 
if let Some(attached_at) = tenant_shard.stably_attached() { - compute_notifications.push(( - *tenant_shard_id, - attached_at, - tenant_shard.shard.stripe_size, - )); + compute_notifications.push(compute_hook::ShardUpdate { + tenant_shard_id: *tenant_shard_id, + node_id: attached_at, + stripe_size: tenant_shard.shard.stripe_size, + preferred_az: tenant_shard + .preferred_az() + .map(|az| Cow::Owned(az.clone())), + }); } } } @@ -4786,7 +4789,15 @@ impl Service { for (child_id, child_ps, stripe_size) in child_locations { if let Err(e) = self .compute_hook - .notify(child_id, child_ps, stripe_size, &self.cancel) + .notify( + compute_hook::ShardUpdate { + tenant_shard_id: child_id, + node_id: child_ps, + stripe_size, + preferred_az: preferred_az_id.as_ref().map(Cow::Borrowed), + }, + &self.cancel, + ) .await { tracing::warn!("Failed to update compute of {}->{} during split, proceeding anyway to complete split ({e})", diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index f1b921646f44..cba579e8a749 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -1198,6 +1198,7 @@ impl TenantShard { detach, reconciler_config, config: self.config.clone(), + preferred_az: self.preferred_az_id.clone(), observed: self.observed.clone(), original_observed: self.observed.clone(), compute_hook: compute_hook.clone(), diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 13ada1361e63..2553a0c99ab0 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -134,6 +134,9 @@ BASE_PORT: int = 15000 +# By default we create pageservers with this phony AZ +DEFAULT_AZ_ID: str = "us-east-2a" + @pytest.fixture(scope="session") def neon_api_key() -> str: @@ -1093,7 +1096,7 @@ def __init__(self, config: NeonEnvBuilder): "pg_auth_type": pg_auth_type, "http_auth_type": http_auth_type, # Default which can be overriden with `NeonEnvBuilder.pageserver_config_override` - "availability_zone": "us-east-2a", + "availability_zone": DEFAULT_AZ_ID, # Disable pageserver disk syncs in tests: when running tests concurrently, this avoids # the pageserver taking a long time to start up due to syncfs flushing other tests' data "no_sync": True, diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 743ab0088b3d..4c381b563fe4 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -11,6 +11,7 @@ from fixtures.compute_reconfigure import ComputeReconfigure from fixtures.log_helper import log from fixtures.neon_fixtures import ( + DEFAULT_AZ_ID, NeonEnv, NeonEnvBuilder, StorageControllerApiException, @@ -793,6 +794,7 @@ def handler(request: Request): "tenant_id": str(env.initial_tenant), "stripe_size": None, "shards": [{"node_id": int(env.pageservers[0].id), "shard_number": 0}], + "preferred_az": DEFAULT_AZ_ID, } assert notifications[0] == expect @@ -812,6 +814,7 @@ def handler(request: Request): {"node_id": int(env.pageservers[0].id), "shard_number": 0}, {"node_id": int(env.pageservers[0].id), "shard_number": 1}, ], + "preferred_az": DEFAULT_AZ_ID, } log.info(f"Got notification: {notifications[1]}") assert notifications[1] == expect_after diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index ae9b596a1b98..0be800d10305 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -16,6 
+16,7 @@ from fixtures.compute_reconfigure import ComputeReconfigure from fixtures.log_helper import log from fixtures.neon_fixtures import ( + DEFAULT_AZ_ID, NeonEnv, NeonEnvBuilder, NeonPageserver, @@ -599,6 +600,7 @@ def handler(request: Request): "tenant_id": str(env.initial_tenant), "stripe_size": None, "shards": [{"node_id": int(env.pageservers[0].id), "shard_number": 0}], + "preferred_az": DEFAULT_AZ_ID, } assert notifications[0] == expect @@ -616,6 +618,7 @@ def node_evacuated(node_id: int) -> None: "tenant_id": str(env.initial_tenant), "stripe_size": None, "shards": [{"node_id": int(env.pageservers[1].id), "shard_number": 0}], + "preferred_az": DEFAULT_AZ_ID, } def received_migration_notification(): @@ -643,6 +646,7 @@ def received_restart_notification(): {"node_id": int(env.pageservers[1].id), "shard_number": 0}, {"node_id": int(env.pageservers[1].id), "shard_number": 1}, ], + "preferred_az": DEFAULT_AZ_ID, } def received_split_notification(): @@ -714,6 +718,7 @@ def handler(request: Request): "tenant_id": str(env.initial_tenant), "stripe_size": None, "shards": [{"node_id": int(env.pageservers[0].id), "shard_number": 0}], + "preferred_az": DEFAULT_AZ_ID, } assert notifications[0] == expect From 2ee6bc5ec42590e958fc246092f8e98202fc9173 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Tue, 17 Dec 2024 20:06:18 +0000 Subject: [PATCH 33/56] chore(proxy): update vendored postgres libs to edition 2021 (#10139) I ran `cargo fix --edition` in each project prior, and it found nothing that needed fixing. --- libs/proxy/postgres-protocol2/Cargo.toml | 2 +- libs/proxy/postgres-protocol2/src/lib.rs | 3 +-- libs/proxy/postgres-protocol2/src/message/frontend.rs | 1 - libs/proxy/postgres-types2/Cargo.toml | 2 +- libs/proxy/postgres-types2/src/lib.rs | 3 +-- libs/proxy/tokio-postgres2/Cargo.toml | 2 +- libs/proxy/tokio-postgres2/src/lib.rs | 2 +- 7 files changed, 6 insertions(+), 9 deletions(-) diff --git a/libs/proxy/postgres-protocol2/Cargo.toml b/libs/proxy/postgres-protocol2/Cargo.toml index f71c1599c7c2..f66a292d5eac 100644 --- a/libs/proxy/postgres-protocol2/Cargo.toml +++ b/libs/proxy/postgres-protocol2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "postgres-protocol2" version = "0.1.0" -edition = "2018" +edition = "2021" license = "MIT/Apache-2.0" [dependencies] diff --git a/libs/proxy/postgres-protocol2/src/lib.rs b/libs/proxy/postgres-protocol2/src/lib.rs index 947f2f835d4b..6032440f9ad5 100644 --- a/libs/proxy/postgres-protocol2/src/lib.rs +++ b/libs/proxy/postgres-protocol2/src/lib.rs @@ -9,8 +9,7 @@ //! //! This library assumes that the `client_encoding` backend parameter has been //! set to `UTF8`. It will most likely not behave properly if that is not the case. 
-#![doc(html_root_url = "https://docs.rs/postgres-protocol/0.6")] -#![warn(missing_docs, rust_2018_idioms, clippy::all)] +#![warn(missing_docs, clippy::all)] use byteorder::{BigEndian, ByteOrder}; use bytes::{BufMut, BytesMut}; diff --git a/libs/proxy/postgres-protocol2/src/message/frontend.rs b/libs/proxy/postgres-protocol2/src/message/frontend.rs index bc6168f33732..640f35ada3be 100644 --- a/libs/proxy/postgres-protocol2/src/message/frontend.rs +++ b/libs/proxy/postgres-protocol2/src/message/frontend.rs @@ -3,7 +3,6 @@ use byteorder::{BigEndian, ByteOrder}; use bytes::{Buf, BufMut, BytesMut}; -use std::convert::TryFrom; use std::error::Error; use std::io; use std::marker; diff --git a/libs/proxy/postgres-types2/Cargo.toml b/libs/proxy/postgres-types2/Cargo.toml index 58cfb5571f83..57efd94cd31b 100644 --- a/libs/proxy/postgres-types2/Cargo.toml +++ b/libs/proxy/postgres-types2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "postgres-types2" version = "0.1.0" -edition = "2018" +edition = "2021" license = "MIT/Apache-2.0" [dependencies] diff --git a/libs/proxy/postgres-types2/src/lib.rs b/libs/proxy/postgres-types2/src/lib.rs index 18ba032151a9..d4f3afdfd46c 100644 --- a/libs/proxy/postgres-types2/src/lib.rs +++ b/libs/proxy/postgres-types2/src/lib.rs @@ -2,8 +2,7 @@ //! //! This crate is used by the `tokio-postgres` and `postgres` crates. You normally don't need to depend directly on it //! unless you want to define your own `ToSql` or `FromSql` definitions. -#![doc(html_root_url = "https://docs.rs/postgres-types/0.2")] -#![warn(clippy::all, rust_2018_idioms, missing_docs)] +#![warn(clippy::all, missing_docs)] use fallible_iterator::FallibleIterator; use postgres_protocol2::types; diff --git a/libs/proxy/tokio-postgres2/Cargo.toml b/libs/proxy/tokio-postgres2/Cargo.toml index 7130c1b7266f..56e7c4da47ce 100644 --- a/libs/proxy/tokio-postgres2/Cargo.toml +++ b/libs/proxy/tokio-postgres2/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "tokio-postgres2" version = "0.1.0" -edition = "2018" +edition = "2021" license = "MIT/Apache-2.0" [dependencies] diff --git a/libs/proxy/tokio-postgres2/src/lib.rs b/libs/proxy/tokio-postgres2/src/lib.rs index 901ed0c96c68..9155dd82792a 100644 --- a/libs/proxy/tokio-postgres2/src/lib.rs +++ b/libs/proxy/tokio-postgres2/src/lib.rs @@ -1,5 +1,5 @@ //! An asynchronous, pipelined, PostgreSQL client. -#![warn(rust_2018_idioms, clippy::all)] +#![warn(clippy::all)] pub use crate::cancel_token::CancelToken; pub use crate::client::{Client, SocketConfig}; From c52514ab02b96e273f8a99362e2401e4a7bc3982 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Tue, 17 Dec 2024 21:47:44 +0100 Subject: [PATCH 34/56] Fix allure report creation on periodic `pg_regress` testing (#10171) ## Problem The allure report finishes with the error `HttpError: Resource not accessible by integration` while running the `pg_regress` test against a cloud staging project due to a lack of permissions. ## Summary of changes The permissions are added. 
--- .github/workflows/cloud-regress.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/cloud-regress.yml b/.github/workflows/cloud-regress.yml index 55f42ea5332b..09d6acd32561 100644 --- a/.github/workflows/cloud-regress.yml +++ b/.github/workflows/cloud-regress.yml @@ -21,6 +21,8 @@ concurrency: permissions: id-token: write # aws-actions/configure-aws-credentials + statuses: write + contents: write jobs: regress: From aaf980f70d38d6a6a54494aea1b6b16e5328abf5 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 18 Dec 2024 11:34:38 +0200 Subject: [PATCH 35/56] Online checkpoint replication state (#9976) ## Problem See https://neondb.slack.com/archives/C04DGM6SMTM/p1733180965970089 Replication state is checkpointed only by shutdown checkpoint. It means that replication snapshots are not removed till compute shutdown. ## Summary of changes Checkpoint replication state during online checkpoint Related Postgres PR: https://github.com/neondatabase/postgres/pull/546 Co-authored-by: Konstantin Knizhnik --- vendor/postgres-v17 | 2 +- vendor/revisions.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index 65c4e46baf56..7e3f3974bc88 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit 65c4e46baf56ec05412c7dd63d62faff0b33dcfb +Subproject commit 7e3f3974bc8895938308f94d0e96879ffae638cd diff --git a/vendor/revisions.json b/vendor/revisions.json index c8db81c73f7e..bff2f709318a 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,7 +1,7 @@ { "v17": [ "17.2", - "65c4e46baf56ec05412c7dd63d62faff0b33dcfb" + "7e3f3974bc8895938308f94d0e96879ffae638cd" ], "v16": [ "16.6", From 85696297c5dc15b74384441f318c9381d99086e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 18 Dec 2024 13:47:56 +0100 Subject: [PATCH 36/56] Add safekeepers command to storcon_cli for listing (#10151) Add a `safekeepers` subcommand to `storcon_cli` that allows listing the safekeepers. ``` $ curl -X POST --url http://localhost:1234/control/v1/safekeeper/42 --data \ '{"active":true, "id":42, "created_at":"2023-10-25T09:11:25Z", "updated_at":"2024-08-28T11:32:43Z","region_id":"neon_local","host":"localhost","port":5454,"http_port":0,"version":123,"availability_zone_id":"us-east-2b"}' $ cargo run --bin storcon_cli -- --api http://localhost:1234 safekeepers Finished `dev` profile [unoptimized + debuginfo] target(s) in 0.38s Running `target/debug/storcon_cli --api 'http://localhost:1234' safekeepers` +----+---------+-----------+------+-----------+------------+ | Id | Version | Host | Port | Http Port | AZ Id | +==========================================================+ | 42 | 123 | localhost | 5454 | 0 | us-east-2b | +----+---------+-----------+------+-----------+------------+ ``` Also: * Don't return the raw `SafekeeperPersistence` struct that contains the raw database presentation, but instead a new `SafekeeperDescribeResponse` struct. * The `SafekeeperPersistence` struct leaves out the `active` field on purpose because we want to deprecate it and replace it with a `scheduling_policy` one. 
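As a sketch of the wire format (values reused from the registration example above, shape following `SafekeeperDescribeResponse`; this is not captured output), the list endpoint used by the CLI returns roughly:

```
// Sketch built with serde_json of what GET control/v1/safekeeper might return
// for the safekeeper registered above; the deprecated `active` flag is absent.
use serde_json::json;

fn example_safekeeper_list() -> serde_json::Value {
    json!([
        {
            "id": 42,
            "region_id": "neon_local",
            "version": 123,
            "host": "localhost",
            "port": 5454,
            "http_port": 0,
            "availability_zone_id": "us-east-2b"
        }
    ])
}
```
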
Part of https://github.com/neondatabase/neon/issues/9981 --- control_plane/storcon_cli/src/main.rs | 30 ++++++++++++++++++- libs/pageserver_api/src/controller_api.rs | 17 +++++++++++ storage_controller/src/persistence.rs | 13 ++++++++ storage_controller/src/service.rs | 26 +++++++++++----- .../regress/test_storage_controller.py | 2 +- 5 files changed, 78 insertions(+), 10 deletions(-) diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index df07216fde78..6ee1044c1839 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -5,7 +5,8 @@ use clap::{Parser, Subcommand}; use pageserver_api::{ controller_api::{ AvailabilityZone, NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse, - ShardSchedulingPolicy, TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest, + SafekeeperDescribeResponse, ShardSchedulingPolicy, TenantCreateRequest, + TenantDescribeResponse, TenantPolicyRequest, }, models::{ EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary, @@ -211,6 +212,8 @@ enum Command { #[arg(long)] timeout: humantime::Duration, }, + /// List safekeepers known to the storage controller + Safekeepers {}, } #[derive(Parser)] @@ -1020,6 +1023,31 @@ async fn main() -> anyhow::Result<()> { "Fill was cancelled for node {node_id}. Schedulling policy is now {final_policy:?}" ); } + Command::Safekeepers {} => { + let mut resp = storcon_client + .dispatch::<(), Vec>( + Method::GET, + "control/v1/safekeeper".to_string(), + None, + ) + .await?; + + resp.sort_by(|a, b| a.id.cmp(&b.id)); + + let mut table = comfy_table::Table::new(); + table.set_header(["Id", "Version", "Host", "Port", "Http Port", "AZ Id"]); + for sk in resp { + table.add_row([ + format!("{}", sk.id), + format!("{}", sk.version), + sk.host, + format!("{}", sk.port), + format!("{}", sk.http_port), + sk.availability_zone_id.to_string(), + ]); + } + println!("{table}"); + } } Ok(()) diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index ec7b81423a44..faf11e487c55 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -372,6 +372,23 @@ pub struct MetadataHealthListOutdatedResponse { pub health_records: Vec, } +/// Publicly exposed safekeeper description +/// +/// The `active` flag which we have in the DB is not included on purpose: it is deprecated. +#[derive(Serialize, Deserialize, Clone)] +pub struct SafekeeperDescribeResponse { + pub id: NodeId, + pub region_id: String, + /// 1 is special, it means just created (not currently posted to storcon). + /// Zero or negative is not really expected. + /// Otherwise the number from `release-$(number_of_commits_on_branch)` tag. 
+ pub version: i64, + pub host: String, + pub port: i32, + pub http_port: i32, + pub availability_zone_id: String, +} + #[cfg(test)] mod test { use super::*; diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index e17fe78d2584..cc377e606ee8 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -11,6 +11,7 @@ use diesel::Connection; use itertools::Itertools; use pageserver_api::controller_api::AvailabilityZone; use pageserver_api::controller_api::MetadataHealthRecord; +use pageserver_api::controller_api::SafekeeperDescribeResponse; use pageserver_api::controller_api::ShardSchedulingPolicy; use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy}; use pageserver_api::models::TenantConfig; @@ -1241,6 +1242,18 @@ impl SafekeeperPersistence { availability_zone_id: &self.availability_zone_id, } } + pub(crate) fn as_describe_response(&self) -> SafekeeperDescribeResponse { + // omit the `active` flag on purpose: it is deprecated. + SafekeeperDescribeResponse { + id: NodeId(self.id as u64), + region_id: self.region_id.clone(), + version: self.version, + host: self.host.clone(), + port: self.port, + http_port: self.http_port, + availability_zone_id: self.availability_zone_id.clone(), + } + } } #[derive(Insertable, AsChangeset)] diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 42b50835f808..c0c5bc371aed 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -46,10 +46,11 @@ use pageserver_api::{ controller_api::{ AvailabilityZone, MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, NodeShard, NodeShardResponse, PlacementPolicy, - ShardSchedulingPolicy, ShardsPreferredAzsRequest, ShardsPreferredAzsResponse, - TenantCreateRequest, TenantCreateResponse, TenantCreateResponseShard, - TenantDescribeResponse, TenantDescribeResponseShard, TenantLocateResponse, - TenantPolicyRequest, TenantShardMigrateRequest, TenantShardMigrateResponse, + SafekeeperDescribeResponse, ShardSchedulingPolicy, ShardsPreferredAzsRequest, + ShardsPreferredAzsResponse, TenantCreateRequest, TenantCreateResponse, + TenantCreateResponseShard, TenantDescribeResponse, TenantDescribeResponseShard, + TenantLocateResponse, TenantPolicyRequest, TenantShardMigrateRequest, + TenantShardMigrateResponse, }, models::{ SecondaryProgress, TenantConfigPatchRequest, TenantConfigRequest, @@ -7169,15 +7170,24 @@ impl Service { pub(crate) async fn safekeepers_list( &self, - ) -> Result, DatabaseError> { - self.persistence.list_safekeepers().await + ) -> Result, DatabaseError> { + Ok(self + .persistence + .list_safekeepers() + .await? 
+ .into_iter() + .map(|v| v.as_describe_response()) + .collect::>()) } pub(crate) async fn get_safekeeper( &self, id: i64, - ) -> Result { - self.persistence.safekeeper_get(id).await + ) -> Result { + self.persistence + .safekeeper_get(id) + .await + .map(|v| v.as_describe_response()) } pub(crate) async fn upsert_safekeeper( diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 0be800d10305..7062c35e05ab 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -3009,7 +3009,7 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): def eq_safekeeper_records(a: dict[str, Any], b: dict[str, Any]) -> bool: compared = [dict(a), dict(b)] - masked_keys = ["created_at", "updated_at"] + masked_keys = ["created_at", "updated_at", "active"] for d in compared: # keep deleting these in case we are comparing the body as it will be uploaded by real scripts From 1d12efc42886bcb204db4c764ea64413da5c8dba Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 18 Dec 2024 10:37:26 -0500 Subject: [PATCH 37/56] fix(pageserver): allow repartition errors during gc-compaction smoke tests (#10164) ## Problem part of https://github.com/neondatabase/neon/issues/9114 In https://github.com/neondatabase/neon/pull/10127 we fixed the race, but we didn't add the errors to the allowlist. ## Summary of changes * Allow repartition errors in the gc-compaction smoke test. I think it might be worth to refactor the code to allow multiple threads getting a copy of repartition status (i.e., using Rcu) in the future. Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline/compaction.rs | 2 +- test_runner/regress/test_compaction.py | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 8b6cc8ed8413..a4e8f3952265 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -1823,7 +1823,7 @@ impl Timeline { // by estimating the amount of files read for a compaction job. We should also partition on LSN. 
let ((dense_ks, sparse_ks), _) = { let Ok(partition) = self.partitioning.try_lock() else { - bail!("failed to acquire partition lock"); + bail!("failed to acquire partition lock during gc-compaction"); }; partition.clone() }; diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 88873c63c24c..aef9a825ee36 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -134,6 +134,10 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder): } env = neon_env_builder.init_start(initial_tenant_conf=SMOKE_CONF) + env.pageserver.allowed_errors.append( + r".*failed to acquire partition lock during gc-compaction.*" + ) + env.pageserver.allowed_errors.append(r".*repartition() called concurrently.*") tenant_id = env.initial_tenant timeline_id = env.initial_timeline From 1668d39b7cbe7d1bbe48152cb1b4024a6e2da90a Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 18 Dec 2024 16:51:53 +0100 Subject: [PATCH 38/56] safekeeper: fix typo in allowlist for `/profile/heap` (#10186) --- safekeeper/src/http/routes.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 9bc1bf340919..6186f4c3ba05 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -564,7 +564,7 @@ pub fn make_router( if conf.http_auth.is_some() { router = router.middleware(auth_middleware(|request| { const ALLOWLIST_ROUTES: &[&str] = - &["/v1/status", "/metrics", "/profile/cpu", "profile/heap"]; + &["/v1/status", "/metrics", "/profile/cpu", "/profile/heap"]; if ALLOWLIST_ROUTES.contains(&request.uri().path()) { None } else { From d63602cc7822eb5f9670d7e7926bb412eba1ff3a Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 18 Dec 2024 16:03:14 +0000 Subject: [PATCH 39/56] chore(proxy): fully remove allow-self-signed-compute flag (#10168) When https://github.com/neondatabase/cloud/pull/21856 is merged, this flag is no longer necessary. --- proxy/src/bin/local_proxy.rs | 1 - proxy/src/bin/proxy.rs | 7 --- proxy/src/cancellation.rs | 44 +++++++---------- proxy/src/compute.rs | 69 +++------------------------ proxy/src/config.rs | 1 - proxy/src/console_redirect_proxy.rs | 1 - proxy/src/control_plane/mod.rs | 5 +- proxy/src/proxy/connect_compute.rs | 9 +--- proxy/src/proxy/mod.rs | 2 - test_runner/fixtures/neon_fixtures.py | 1 - 10 files changed, 25 insertions(+), 115 deletions(-) diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs index 968682cf0f75..56bbd9485011 100644 --- a/proxy/src/bin/local_proxy.rs +++ b/proxy/src/bin/local_proxy.rs @@ -271,7 +271,6 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig Ok(Box::leak(Box::new(ProxyConfig { tls_config: None, metric_collection: None, - allow_self_signed_compute: false, http_config, authentication_config: AuthenticationConfig { jwks_cache: JwkCache::default(), diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index e90555e250b8..3dcf9ca060c0 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -129,9 +129,6 @@ struct ProxyCliArgs { /// lock for `connect_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable). 
#[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK)] connect_compute_lock: String, - /// Allow self-signed certificates for compute nodes (for testing) - #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] - allow_self_signed_compute: bool, #[clap(flatten)] sql_over_http: SqlOverHttpArgs, /// timeout for scram authentication protocol @@ -564,9 +561,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { _ => bail!("either both or neither tls-key and tls-cert must be specified"), }; - if args.allow_self_signed_compute { - warn!("allowing self-signed compute certificates"); - } let backup_metric_collection_config = config::MetricBackupCollectionConfig { interval: args.metric_backup_collection_interval, remote_storage_config: args.metric_backup_collection_remote_storage.clone(), @@ -641,7 +635,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { let config = ProxyConfig { tls_config, metric_collection, - allow_self_signed_compute: args.allow_self_signed_compute, http_config, authentication_config, proxy_protocol_v2: args.proxy_protocol_v2, diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index a58e3961da86..ebaea173ae48 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -4,7 +4,8 @@ use std::sync::Arc; use dashmap::DashMap; use ipnet::{IpNet, Ipv4Net, Ipv6Net}; use once_cell::sync::OnceCell; -use postgres_client::{tls::MakeTlsConnect, CancelToken}; +use postgres_client::tls::MakeTlsConnect; +use postgres_client::CancelToken; use pq_proto::CancelKeyData; use rustls::crypto::ring; use thiserror::Error; @@ -14,17 +15,16 @@ use tracing::{debug, info}; use uuid::Uuid; use crate::auth::{check_peer_addr_is_in_list, IpPattern}; +use crate::compute::load_certs; use crate::error::ReportableError; use crate::ext::LockExt; use crate::metrics::{CancellationRequest, CancellationSource, Metrics}; +use crate::postgres_rustls::MakeRustlsConnect; use crate::rate_limiter::LeakyBucketRateLimiter; use crate::redis::cancellation_publisher::{ CancellationPublisher, CancellationPublisherMut, RedisPublisherClient, }; -use crate::compute::{load_certs, AcceptEverythingVerifier}; -use crate::postgres_rustls::MakeRustlsConnect; - pub type CancelMap = Arc>>; pub type CancellationHandlerMain = CancellationHandler>>>; pub(crate) type CancellationHandlerMainInternal = Option>>; @@ -240,7 +240,6 @@ pub struct CancelClosure { cancel_token: CancelToken, ip_allowlist: Vec, hostname: String, // for pg_sni router - allow_self_signed_compute: bool, } impl CancelClosure { @@ -249,45 +248,34 @@ impl CancelClosure { cancel_token: CancelToken, ip_allowlist: Vec, hostname: String, - allow_self_signed_compute: bool, ) -> Self { Self { socket_addr, cancel_token, ip_allowlist, hostname, - allow_self_signed_compute, } } /// Cancels the query running on user's compute node. pub(crate) async fn try_cancel_query(self) -> Result<(), CancelError> { let socket = TcpStream::connect(self.socket_addr).await?; - let client_config = if self.allow_self_signed_compute { - // Allow all certificates for creating the connection. 
Used only for tests - let verifier = Arc::new(AcceptEverythingVerifier); - rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider())) - .with_safe_default_protocol_versions() - .expect("ring should support the default protocol versions") - .dangerous() - .with_custom_certificate_verifier(verifier) - } else { - let root_store = TLS_ROOTS - .get_or_try_init(load_certs) - .map_err(|_e| { - CancelError::IO(std::io::Error::new( - std::io::ErrorKind::Other, - "TLS root store initialization failed".to_string(), - )) - })? - .clone(); + let root_store = TLS_ROOTS + .get_or_try_init(load_certs) + .map_err(|_e| { + CancelError::IO(std::io::Error::new( + std::io::ErrorKind::Other, + "TLS root store initialization failed".to_string(), + )) + })? + .clone(); + + let client_config = rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider())) .with_safe_default_protocol_versions() .expect("ring should support the default protocol versions") .with_root_certificates(root_store) - }; - - let client_config = client_config.with_no_client_auth(); + .with_no_client_auth(); let mut mk_tls = crate::postgres_rustls::MakeRustlsConnect::new(client_config); let tls = >::make_tls_connect( diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 42df5ff5e3e5..8dc9b59e81c5 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -10,7 +10,6 @@ use postgres_client::tls::MakeTlsConnect; use postgres_client::{CancelToken, RawConnection}; use postgres_protocol::message::backend::NoticeResponseBody; use pq_proto::StartupMessageParams; -use rustls::client::danger::ServerCertVerifier; use rustls::crypto::ring; use rustls::pki_types::InvalidDnsNameError; use thiserror::Error; @@ -251,7 +250,6 @@ impl ConnCfg { pub(crate) async fn connect( &self, ctx: &RequestContext, - allow_self_signed_compute: bool, aux: MetricsAuxInfo, timeout: Duration, ) -> Result { @@ -259,25 +257,17 @@ impl ConnCfg { let (socket_addr, stream, host) = self.connect_raw(timeout).await?; drop(pause); - let client_config = if allow_self_signed_compute { - // Allow all certificates for creating the connection - let verifier = Arc::new(AcceptEverythingVerifier); - rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider())) - .with_safe_default_protocol_versions() - .expect("ring should support the default protocol versions") - .dangerous() - .with_custom_certificate_verifier(verifier) - } else { - let root_store = TLS_ROOTS - .get_or_try_init(load_certs) - .map_err(ConnectionError::TlsCertificateError)? - .clone(); + let root_store = TLS_ROOTS + .get_or_try_init(load_certs) + .map_err(ConnectionError::TlsCertificateError)? 
+ .clone(); + + let client_config = rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider())) .with_safe_default_protocol_versions() .expect("ring should support the default protocol versions") .with_root_certificates(root_store) - }; - let client_config = client_config.with_no_client_auth(); + .with_no_client_auth(); let mut mk_tls = crate::postgres_rustls::MakeRustlsConnect::new(client_config); let tls = >::make_tls_connect( @@ -320,7 +310,6 @@ impl ConnCfg { }, vec![], host.to_string(), - allow_self_signed_compute, ); let connection = PostgresConnection { @@ -365,50 +354,6 @@ pub(crate) fn load_certs() -> Result, Vec> = OnceCell::new(); -#[derive(Debug)] -pub(crate) struct AcceptEverythingVerifier; -impl ServerCertVerifier for AcceptEverythingVerifier { - fn supported_verify_schemes(&self) -> Vec { - use rustls::SignatureScheme; - // The schemes for which `SignatureScheme::supported_in_tls13` returns true. - vec![ - SignatureScheme::ECDSA_NISTP521_SHA512, - SignatureScheme::ECDSA_NISTP384_SHA384, - SignatureScheme::ECDSA_NISTP256_SHA256, - SignatureScheme::RSA_PSS_SHA512, - SignatureScheme::RSA_PSS_SHA384, - SignatureScheme::RSA_PSS_SHA256, - SignatureScheme::ED25519, - ] - } - fn verify_server_cert( - &self, - _end_entity: &rustls::pki_types::CertificateDer<'_>, - _intermediates: &[rustls::pki_types::CertificateDer<'_>], - _server_name: &rustls::pki_types::ServerName<'_>, - _ocsp_response: &[u8], - _now: rustls::pki_types::UnixTime, - ) -> Result { - Ok(rustls::client::danger::ServerCertVerified::assertion()) - } - fn verify_tls12_signature( - &self, - _message: &[u8], - _cert: &rustls::pki_types::CertificateDer<'_>, - _dss: &rustls::DigitallySignedStruct, - ) -> Result { - Ok(rustls::client::danger::HandshakeSignatureValid::assertion()) - } - fn verify_tls13_signature( - &self, - _message: &[u8], - _cert: &rustls::pki_types::CertificateDer<'_>, - _dss: &rustls::DigitallySignedStruct, - ) -> Result { - Ok(rustls::client::danger::HandshakeSignatureValid::assertion()) - } -} - #[cfg(test)] mod tests { use super::*; diff --git a/proxy/src/config.rs b/proxy/src/config.rs index debd77ac3296..33d1d2e9e4a0 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -25,7 +25,6 @@ use crate::types::Host; pub struct ProxyConfig { pub tls_config: Option, pub metric_collection: Option, - pub allow_self_signed_compute: bool, pub http_config: HttpConfig, pub authentication_config: AuthenticationConfig, pub proxy_protocol_v2: ProxyProtocolV2, diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index 02398fb7778c..c477822e853c 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -213,7 +213,6 @@ pub(crate) async fn handle_client( params_compat: true, params: ¶ms, locks: &config.connect_compute_locks, - allow_self_signed_compute: config.allow_self_signed_compute, }, &user_info, config.wake_compute_retry_config, diff --git a/proxy/src/control_plane/mod.rs b/proxy/src/control_plane/mod.rs index c0718920b493..0ca1a6aae0eb 100644 --- a/proxy/src/control_plane/mod.rs +++ b/proxy/src/control_plane/mod.rs @@ -73,12 +73,9 @@ impl NodeInfo { pub(crate) async fn connect( &self, ctx: &RequestContext, - allow_self_signed_compute: bool, timeout: Duration, ) -> Result { - self.config - .connect(ctx, allow_self_signed_compute, self.aux.clone(), timeout) - .await + self.config.connect(ctx, self.aux.clone(), timeout).await } pub(crate) fn reuse_settings(&mut self, other: Self) { diff --git 
a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index 6da4c90a535b..4a30d2398558 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -73,9 +73,6 @@ pub(crate) struct TcpMechanism<'a> { /// connect_to_compute concurrency lock pub(crate) locks: &'static ApiLocks, - - /// Whether we should accept self-signed certificates (for testing) - pub(crate) allow_self_signed_compute: bool, } #[async_trait] @@ -93,11 +90,7 @@ impl ConnectMechanism for TcpMechanism<'_> { ) -> Result { let host = node_info.config.get_host(); let permit = self.locks.get_permit(&host).await?; - permit.release_result( - node_info - .connect(ctx, self.allow_self_signed_compute, timeout) - .await, - ) + permit.release_result(node_info.connect(ctx, timeout).await) } fn update_connect_config(&self, config: &mut compute::ConnCfg) { diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 4e5ecda237d5..dbe174cab7d5 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -348,8 +348,6 @@ pub(crate) async fn handle_client( params_compat, params: ¶ms, locks: &config.connect_compute_locks, - // only used for console redirect testing. - allow_self_signed_compute: false, }, &user_info, config.wake_compute_retry_config, diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 2553a0c99ab0..9f78ad120b91 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3222,7 +3222,6 @@ def extra_args(self) -> list[str]: # Link auth backend params *["--auth-backend", "link"], *["--uri", NeonProxy.link_auth_uri], - *["--allow-self-signed-compute", "true"], ] class ProxyV1(AuthBackend): From 835287ba3aa1e6a4fea2c1929fbd601de9354218 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 18 Dec 2024 16:29:47 +0000 Subject: [PATCH 40/56] neon_local: add a `flock` to protect against concurrent execution (#10185) ## Problem `neon_local` has always been unsafe to run concurrently with itself: it uses simple text files for persistent state, and concurrent runs will step on each other. In some test environments we intentionally handle this with mutexes in python land, but it's fragile to try and always remember to do that. ## Summary of changes - Add a `flock` based mutex around the `main` function of neon_local, using the repo directory as the file to lock - Clean up an Option<> around control_plane_api, this is a drive-by change because it was one of the fields that had a weird effect when previous concurrent stuff stamped on it. 
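For comparison with the python-land mutexes mentioned above, here is a minimal sketch of the same advisory `flock` pattern expressed in Python; the `.neon` repo directory path is an assumption for illustration, not taken from `neon_local` itself:

```
# Illustrative sketch of an advisory flock on the repo directory (Linux).
import fcntl
import os
from contextlib import contextmanager

@contextmanager
def repo_lock(repo_dir=".neon"):  # assumed path, for illustration only
    fd = os.open(repo_dir, os.O_RDONLY)  # a directory fd can be flock()ed on Linux
    try:
        fcntl.flock(fd, fcntl.LOCK_EX)  # blocks until any concurrent holder releases it
        yield
    finally:
        fcntl.flock(fd, fcntl.LOCK_UN)
        os.close(fd)

# with repo_lock():
#     ...  # run a command that mutates neon_local state
```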
--- control_plane/src/bin/neon_local.rs | 59 ++++++++++++++++--------- control_plane/src/local_env.rs | 10 ++--- control_plane/src/pageserver.rs | 28 ++++++------ control_plane/src/storage_controller.rs | 4 +- 4 files changed, 58 insertions(+), 43 deletions(-) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 1ea443b026a8..c73debae4c40 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -19,6 +19,7 @@ use control_plane::storage_controller::{ NeonStorageControllerStartArgs, NeonStorageControllerStopArgs, StorageController, }; use control_plane::{broker, local_env}; +use nix::fcntl::{flock, FlockArg}; use pageserver_api::config::{ DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT, DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT, @@ -36,6 +37,8 @@ use safekeeper_api::{ }; use std::borrow::Cow; use std::collections::{BTreeSet, HashMap}; +use std::fs::File; +use std::os::fd::AsRawFd; use std::path::PathBuf; use std::process::exit; use std::str::FromStr; @@ -689,6 +692,21 @@ struct TimelineTreeEl { pub children: BTreeSet, } +/// A flock-based guard over the neon_local repository directory +struct RepoLock { + _file: File, +} + +impl RepoLock { + fn new() -> Result { + let repo_dir = File::open(local_env::base_path())?; + let repo_dir_fd = repo_dir.as_raw_fd(); + flock(repo_dir_fd, FlockArg::LockExclusive)?; + + Ok(Self { _file: repo_dir }) + } +} + // Main entry point for the 'neon_local' CLI utility // // This utility helps to manage neon installation. That includes following: @@ -700,9 +718,14 @@ fn main() -> Result<()> { let cli = Cli::parse(); // Check for 'neon init' command first. - let subcommand_result = if let NeonLocalCmd::Init(args) = cli.command { - handle_init(&args).map(|env| Some(Cow::Owned(env))) + let (subcommand_result, _lock) = if let NeonLocalCmd::Init(args) = cli.command { + (handle_init(&args).map(|env| Some(Cow::Owned(env))), None) } else { + // This tool uses a collection of simple files to store its state, and consequently + // it is not generally safe to run multiple commands concurrently. Rather than expect + // all callers to know this, use a lock file to protect against concurrent execution. 
+ let _repo_lock = RepoLock::new().unwrap(); + // all other commands need an existing config let env = LocalEnv::load_config(&local_env::base_path()).context("Error loading config")?; let original_env = env.clone(); @@ -728,11 +751,12 @@ fn main() -> Result<()> { NeonLocalCmd::Mappings(subcmd) => handle_mappings(&subcmd, env), }; - if &original_env != env { + let subcommand_result = if &original_env != env { subcommand_result.map(|()| Some(Cow::Borrowed(env))) } else { subcommand_result.map(|()| None) - } + }; + (subcommand_result, Some(_repo_lock)) }; match subcommand_result { @@ -922,7 +946,7 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result { } else { // User (likely interactive) did not provide a description of the environment, give them the default NeonLocalInitConf { - control_plane_api: Some(Some(DEFAULT_PAGESERVER_CONTROL_PLANE_API.parse().unwrap())), + control_plane_api: Some(DEFAULT_PAGESERVER_CONTROL_PLANE_API.parse().unwrap()), broker: NeonBroker { listen_addr: DEFAULT_BROKER_ADDR.parse().unwrap(), }, @@ -1718,18 +1742,15 @@ async fn handle_start_all_impl( broker::start_broker_process(env, &retry_timeout).await }); - // Only start the storage controller if the pageserver is configured to need it - if env.control_plane_api.is_some() { - js.spawn(async move { - let storage_controller = StorageController::from_env(env); - storage_controller - .start(NeonStorageControllerStartArgs::with_default_instance_id( - retry_timeout, - )) - .await - .map_err(|e| e.context("start storage_controller")) - }); - } + js.spawn(async move { + let storage_controller = StorageController::from_env(env); + storage_controller + .start(NeonStorageControllerStartArgs::with_default_instance_id( + retry_timeout, + )) + .await + .map_err(|e| e.context("start storage_controller")) + }); for ps_conf in &env.pageservers { js.spawn(async move { @@ -1774,10 +1795,6 @@ async fn neon_start_status_check( const RETRY_INTERVAL: Duration = Duration::from_millis(100); const NOTICE_AFTER_RETRIES: Duration = Duration::from_secs(5); - if env.control_plane_api.is_none() { - return Ok(()); - } - let storcon = StorageController::from_env(env); let retries = retry_timeout.as_millis() / RETRY_INTERVAL.as_millis(); diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 032c88a829f7..489f9c850963 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -76,7 +76,7 @@ pub struct LocalEnv { // Control plane upcall API for pageserver: if None, we will not run storage_controller If set, this will // be propagated into each pageserver's configuration. - pub control_plane_api: Option, + pub control_plane_api: Url, // Control plane upcall API for storage controller. If set, this will be propagated into the // storage controller's configuration. 
@@ -133,7 +133,7 @@ pub struct NeonLocalInitConf { pub storage_controller: Option, pub pageservers: Vec, pub safekeepers: Vec, - pub control_plane_api: Option>, + pub control_plane_api: Option, pub control_plane_compute_hook_api: Option>, } @@ -535,7 +535,7 @@ impl LocalEnv { storage_controller, pageservers, safekeepers, - control_plane_api, + control_plane_api: control_plane_api.unwrap(), control_plane_compute_hook_api, branch_name_mappings, } @@ -638,7 +638,7 @@ impl LocalEnv { storage_controller: self.storage_controller.clone(), pageservers: vec![], // it's skip_serializing anyway safekeepers: self.safekeepers.clone(), - control_plane_api: self.control_plane_api.clone(), + control_plane_api: Some(self.control_plane_api.clone()), control_plane_compute_hook_api: self.control_plane_compute_hook_api.clone(), branch_name_mappings: self.branch_name_mappings.clone(), }, @@ -768,7 +768,7 @@ impl LocalEnv { storage_controller: storage_controller.unwrap_or_default(), pageservers: pageservers.iter().map(Into::into).collect(), safekeepers, - control_plane_api: control_plane_api.unwrap_or_default(), + control_plane_api: control_plane_api.unwrap(), control_plane_compute_hook_api: control_plane_compute_hook_api.unwrap_or_default(), branch_name_mappings: Default::default(), }; diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 9d3f0183458f..ef5b3d65934c 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -95,21 +95,19 @@ impl PageServerNode { let mut overrides = vec![pg_distrib_dir_param, broker_endpoint_param]; - if let Some(control_plane_api) = &self.env.control_plane_api { - overrides.push(format!( - "control_plane_api='{}'", - control_plane_api.as_str() - )); - - // Storage controller uses the same auth as pageserver: if JWT is enabled - // for us, we will also need it to talk to them. - if matches!(conf.http_auth_type, AuthType::NeonJWT) { - let jwt_token = self - .env - .generate_auth_token(&Claims::new(None, Scope::GenerationsApi)) - .unwrap(); - overrides.push(format!("control_plane_api_token='{}'", jwt_token)); - } + overrides.push(format!( + "control_plane_api='{}'", + self.env.control_plane_api.as_str() + )); + + // Storage controller uses the same auth as pageserver: if JWT is enabled + // for us, we will also need it to talk to them. + if matches!(conf.http_auth_type, AuthType::NeonJWT) { + let jwt_token = self + .env + .generate_auth_token(&Claims::new(None, Scope::GenerationsApi)) + .unwrap(); + overrides.push(format!("control_plane_api_token='{}'", jwt_token)); } if !conf.other.contains_key("remote_storage") { diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index b70bd2e1b5ef..22d2420ed4dc 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -338,7 +338,7 @@ impl StorageController { .port(), ) } else { - let listen_url = self.env.control_plane_api.clone().unwrap(); + let listen_url = self.env.control_plane_api.clone(); let listen = format!( "{}:{}", @@ -708,7 +708,7 @@ impl StorageController { } else { // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out // for general purpose API access. - let listen_url = self.env.control_plane_api.clone().unwrap(); + let listen_url = self.env.control_plane_api.clone(); Url::from_str(&format!( "http://{}:{}/{path}", listen_url.host_str().unwrap(), From 3d1c3a80ae0fd2babe42d7fc2293cb2a058ff8cb Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." 
<4198311+skyzh@users.noreply.github.com> Date: Wed, 18 Dec 2024 13:09:02 -0500 Subject: [PATCH 41/56] feat(pageserver): add compact queue http endpoint (#10173) ## Problem We cannot get the size of the compaction queue and access the info. Part of #9114 ## Summary of changes * Add an API endpoint to get the compaction queue. * gc_compaction test case now waits until the compaction finishes. --------- Signed-off-by: Alex Chi Z --- libs/pageserver_api/src/models.rs | 64 +++++++++++++++++++++++++ pageserver/src/http/routes.rs | 36 +++++++++++++- pageserver/src/tenant.rs | 23 +++++++-- pageserver/src/tenant/timeline.rs | 63 ++---------------------- test_runner/fixtures/pageserver/http.py | 13 ++++- test_runner/regress/test_compaction.py | 6 +++ 6 files changed, 139 insertions(+), 66 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 5690b643f062..f3fc9fad760a 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -6,6 +6,7 @@ pub mod utilization; use camino::Utf8PathBuf; pub use utilization::PageserverUtilization; +use core::ops::Range; use std::{ collections::HashMap, fmt::Display, @@ -28,6 +29,7 @@ use utils::{ }; use crate::{ + key::Key, reltag::RelTag, shard::{ShardCount, ShardStripeSize, TenantShardId}, }; @@ -210,6 +212,68 @@ pub enum TimelineState { Broken { reason: String, backtrace: String }, } +#[serde_with::serde_as] +#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)] +pub struct CompactLsnRange { + pub start: Lsn, + pub end: Lsn, +} + +#[serde_with::serde_as] +#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)] +pub struct CompactKeyRange { + #[serde_as(as = "serde_with::DisplayFromStr")] + pub start: Key, + #[serde_as(as = "serde_with::DisplayFromStr")] + pub end: Key, +} + +impl From> for CompactLsnRange { + fn from(range: Range) -> Self { + Self { + start: range.start, + end: range.end, + } + } +} + +impl From> for CompactKeyRange { + fn from(range: Range) -> Self { + Self { + start: range.start, + end: range.end, + } + } +} + +impl From for Range { + fn from(range: CompactLsnRange) -> Self { + range.start..range.end + } +} + +impl From for Range { + fn from(range: CompactKeyRange) -> Self { + range.start..range.end + } +} + +impl CompactLsnRange { + pub fn above(lsn: Lsn) -> Self { + Self { + start: lsn, + end: Lsn::MAX, + } + } +} + +#[derive(Debug, Clone, Serialize)] +pub struct CompactInfoResponse { + pub compact_key_range: Option, + pub compact_lsn_range: Option, + pub sub_compaction: bool, +} + #[derive(Serialize, Deserialize, Clone)] pub struct TimelineCreateRequest { pub new_timeline_id: TimelineId, diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index db7d29385641..60ef4c3702f4 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -97,8 +97,8 @@ use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError}; use crate::DEFAULT_PG_VERSION; use crate::{disk_usage_eviction_task, tenant}; use pageserver_api::models::{ - StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, TimelineGcRequest, - TimelineInfo, + CompactInfoResponse, StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, + TimelineGcRequest, TimelineInfo, }; use utils::{ auth::SwappableJwtAuth, @@ -2039,6 +2039,34 @@ async fn timeline_cancel_compact_handler( .await } +// Get compact info of a timeline +async fn timeline_compact_info_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, 
ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); + async { + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + let res = tenant.get_scheduled_compaction_tasks(timeline_id); + let mut resp = Vec::new(); + for item in res { + resp.push(CompactInfoResponse { + compact_key_range: item.compact_key_range, + compact_lsn_range: item.compact_lsn_range, + sub_compaction: item.sub_compaction, + }); + } + json_response(StatusCode::OK, resp) + } + .instrument(info_span!("timeline_compact_info", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) + .await +} + // Run compaction immediately on given timeline. async fn timeline_compact_handler( mut request: Request, @@ -3400,6 +3428,10 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/do_gc", |r| api_handler(r, timeline_gc_handler), ) + .get( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact", + |r| api_handler(r, timeline_compact_info_handler), + ) .put( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact", |r| api_handler(r, timeline_compact_handler), diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 99289d5f15f7..2e4c47c6e40f 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3122,6 +3122,23 @@ impl Tenant { } } + pub(crate) fn get_scheduled_compaction_tasks( + &self, + timeline_id: TimelineId, + ) -> Vec { + use itertools::Itertools; + let guard = self.scheduled_compaction_tasks.lock().unwrap(); + guard + .get(&timeline_id) + .map(|tline_pending_tasks| { + tline_pending_tasks + .iter() + .map(|x| x.options.clone()) + .collect_vec() + }) + .unwrap_or_default() + } + /// Schedule a compaction task for a timeline. 
pub(crate) async fn schedule_compaction( &self, @@ -5759,13 +5776,13 @@ mod tests { use timeline::{CompactOptions, DeltaLayerTestDesc}; use utils::id::TenantId; + #[cfg(feature = "testing")] + use models::CompactLsnRange; #[cfg(feature = "testing")] use pageserver_api::record::NeonWalRecord; #[cfg(feature = "testing")] use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn}; #[cfg(feature = "testing")] - use timeline::CompactLsnRange; - #[cfg(feature = "testing")] use timeline::GcInfo; static TEST_KEY: Lazy = @@ -9634,7 +9651,7 @@ mod tests { #[cfg(feature = "testing")] #[tokio::test] async fn test_simple_bottom_most_compaction_on_branch() -> anyhow::Result<()> { - use timeline::CompactLsnRange; + use models::CompactLsnRange; let harness = TenantHarness::create("test_simple_bottom_most_compaction_on_branch").await?; let (tenant, ctx) = harness.load().await; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 87f5a0338252..e71cb4db80b9 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -31,9 +31,9 @@ use pageserver_api::{ }, keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning}, models::{ - CompactionAlgorithm, CompactionAlgorithmSettings, DownloadRemoteLayersTaskInfo, - DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, - LsnLease, TimelineState, + CompactKeyRange, CompactLsnRange, CompactionAlgorithm, CompactionAlgorithmSettings, + DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, + InMemoryLayerInfo, LayerMapInfo, LsnLease, TimelineState, }, reltag::BlockNumber, shard::{ShardIdentity, ShardNumber, TenantShardId}, @@ -788,63 +788,6 @@ pub(crate) struct CompactRequest { pub sub_compaction_max_job_size_mb: Option, } -#[serde_with::serde_as] -#[derive(Debug, Clone, serde::Deserialize)] -pub(crate) struct CompactLsnRange { - pub start: Lsn, - pub end: Lsn, -} - -#[serde_with::serde_as] -#[derive(Debug, Clone, serde::Deserialize)] -pub(crate) struct CompactKeyRange { - #[serde_as(as = "serde_with::DisplayFromStr")] - pub start: Key, - #[serde_as(as = "serde_with::DisplayFromStr")] - pub end: Key, -} - -impl From> for CompactLsnRange { - fn from(range: Range) -> Self { - Self { - start: range.start, - end: range.end, - } - } -} - -impl From> for CompactKeyRange { - fn from(range: Range) -> Self { - Self { - start: range.start, - end: range.end, - } - } -} - -impl From for Range { - fn from(range: CompactLsnRange) -> Self { - range.start..range.end - } -} - -impl From for Range { - fn from(range: CompactKeyRange) -> Self { - range.start..range.end - } -} - -impl CompactLsnRange { - #[cfg(test)] - #[cfg(feature = "testing")] - pub fn above(lsn: Lsn) -> Self { - Self { - start: lsn, - end: Lsn::MAX, - } - } -} - #[derive(Debug, Clone, Default)] pub(crate) struct CompactOptions { pub flags: EnumSet, diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index eabdeb10539c..378e56862252 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -738,6 +738,18 @@ def timeline_offload( res_json = res.json() assert res_json is None + def timeline_compact_info( + self, + tenant_id: TenantId | TenantShardId, + timeline_id: TimelineId, + ) -> Any: + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/compact", + ) + self.verbose_error(res) + res_json = res.json() + return res_json + def timeline_compact( self, 
tenant_id: TenantId | TenantShardId, @@ -749,7 +761,6 @@ def timeline_compact( enhanced_gc_bottom_most_compaction=False, body: dict[str, Any] | None = None, ): - self.is_testing_enabled_or_skip() query = {} if force_repartition: query["force_repartition"] = "true" diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index aef9a825ee36..ae48a8fc27d0 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -176,6 +176,12 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder): workload.churn_rows(row_count, env.pageserver.id) + def compaction_finished(): + queue_depth = len(ps_http.timeline_compact_info(tenant_id, timeline_id)) + assert queue_depth == 0 + + wait_until(compaction_finished, timeout=60) + # ensure gc_compaction is scheduled and it's actually running (instead of skipping due to no layers picked) env.pageserver.assert_log_contains( "scheduled_compact_timeline.*picked .* layers for compaction" From 6d3e8096fcad394d387b59fa300f07fb19613760 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 18 Dec 2024 13:10:05 -0500 Subject: [PATCH 42/56] refactor(test): tighten up test_gc_feedback (#10126) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem In https://github.com/neondatabase/neon/pull/8103 we changed the test case to have more test coverage of gc_compaction. Now that we have `test_gc_compaction_smoke`, we can revert this test case to serve its original purpose and revert the parameter changes. part of https://github.com/neondatabase/neon/issues/9114 ## Summary of changes * Revert pitr_interval from 60s to 10s. * Assert the physical/logical size ratio in the benchmark. 
--------- Signed-off-by: Alex Chi Z Co-authored-by: Arpad Müller --- test_runner/performance/test_gc_feedback.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/test_runner/performance/test_gc_feedback.py b/test_runner/performance/test_gc_feedback.py index 07f244da0cef..acb7b56fd073 100644 --- a/test_runner/performance/test_gc_feedback.py +++ b/test_runner/performance/test_gc_feedback.py @@ -22,7 +22,7 @@ def gc_feedback_impl(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma "checkpoint_distance": f"{1024 ** 2}", "compaction_target_size": f"{1024 ** 2}", # set PITR interval to be small, so we can do GC - "pitr_interval": "60 s", + "pitr_interval": "10 s", # "compaction_threshold": "3", # "image_creation_threshold": "2", } @@ -32,6 +32,7 @@ def gc_feedback_impl(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma n_steps = 10 n_update_iters = 100 step_size = 10000 + branch_created = 0 with endpoint.cursor() as cur: cur.execute("SET statement_timeout='1000s'") cur.execute( @@ -66,6 +67,7 @@ def gc_feedback_impl(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma if mode == "with_snapshots": if step == n_steps / 2: env.create_branch("child") + branch_created += 1 max_num_of_deltas_above_image = 0 max_total_num_of_deltas = 0 @@ -142,6 +144,15 @@ def gc_feedback_impl(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma with layer_map_path.open("w") as f: f.write(json.dumps(client.timeline_layer_map_info(tenant_id, timeline_id))) + # We should have collected all garbage + if mode == "normal": + # in theory we should get physical size ~= logical size, but given that gc interval is 10s, + # and the layer has indexes that might contribute to the fluctuation, we allow a small margin + # of 1 here, and the end ratio we are asserting is 1 (margin) + 1 (expected) = 2. + assert physical_size / logical_size < 2 + elif mode == "with_snapshots": + assert physical_size / logical_size < (2 + branch_created) + @pytest.mark.timeout(10000) def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker): From 61fcf64c22b7464aa9beb5f22ec9ded96891a12f Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 18 Dec 2024 21:15:38 +0200 Subject: [PATCH 43/56] Fix flukyness of test_physical_and_logical_replicaiton.py (#10176) ## Problem See https://github.com/neondatabase/neon/issues/10037 test_physical_and_logical_replication.py sometimes failed. 
## Summary of changes Add `wait_replica_caughtup` to wait for replica sync Co-authored-by: Konstantin Knizhnik --- test_runner/regress/test_physical_and_logical_replicaiton.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test_runner/regress/test_physical_and_logical_replicaiton.py b/test_runner/regress/test_physical_and_logical_replicaiton.py index ad2d0871b82c..3f9824ee677a 100644 --- a/test_runner/regress/test_physical_and_logical_replicaiton.py +++ b/test_runner/regress/test_physical_and_logical_replicaiton.py @@ -2,7 +2,7 @@ import time -from fixtures.neon_fixtures import NeonEnv, logical_replication_sync +from fixtures.neon_fixtures import NeonEnv, logical_replication_sync, wait_replica_caughtup def test_physical_and_logical_replication_slot_not_copied(neon_simple_env: NeonEnv, vanilla_pg): @@ -38,6 +38,8 @@ def test_physical_and_logical_replication_slot_not_copied(neon_simple_env: NeonE for pk in range(n_records): p_cur.execute("insert into t (pk) values (%s)", (pk,)) + wait_replica_caughtup(primary, secondary) + s_cur.execute("select count(*) from t") assert s_cur.fetchall()[0][0] == n_records From cc138b56f983c8fc66e737c5e02898121fdd72ec Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Thu, 19 Dec 2024 04:45:06 -0500 Subject: [PATCH 44/56] fix(pageserver): run psql in thread to avoid blocking (#10177) ## Problem ref https://github.com/neondatabase/neon/issues/10170 ref https://github.com/neondatabase/neon/issues/9994 The psql command will block the main thread, causing other async tasks to timeout (i.e., HTTP connect). Therefore, we need to move it to an I/O executor thread. ## Summary of changes * run psql connection in a thread --------- Signed-off-by: Alex Chi Z Co-authored-by: John Spray --- .../regress/test_pageserver_layer_rolling.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/test_runner/regress/test_pageserver_layer_rolling.py b/test_runner/regress/test_pageserver_layer_rolling.py index 706da1e35e00..fcc465f90a55 100644 --- a/test_runner/regress/test_pageserver_layer_rolling.py +++ b/test_runner/regress/test_pageserver_layer_rolling.py @@ -22,7 +22,10 @@ async def run_worker_for_tenant( - env: NeonEnv, entries: int, tenant: TenantId, offset: int | None = None + env: NeonEnv, + entries: int, + tenant: TenantId, + offset: int | None = None, ) -> Lsn: if offset is None: offset = 0 @@ -37,12 +40,20 @@ async def run_worker_for_tenant( finally: await conn.close(timeout=10) - last_flush_lsn = Lsn(ep.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + loop = asyncio.get_running_loop() + sql = await loop.run_in_executor( + None, lambda ep: ep.safe_psql("SELECT pg_current_wal_flush_lsn()"), ep + ) + last_flush_lsn = Lsn(sql[0][0]) return last_flush_lsn async def run_worker(env: NeonEnv, tenant_conf, entries: int) -> tuple[TenantId, TimelineId, Lsn]: - tenant, timeline = env.create_tenant(conf=tenant_conf) + loop = asyncio.get_running_loop() + # capture tenant_conf by specifying `tenant_conf=tenant_conf`, otherwise it will be evaluated to some random value + tenant, timeline = await loop.run_in_executor( + None, lambda tenant_conf, env: env.create_tenant(conf=tenant_conf), tenant_conf, env + ) last_flush_lsn = await run_worker_for_tenant(env, entries, tenant) return tenant, timeline, last_flush_lsn From a1b0558493930dca4f5c86db658ad295b9beabd6 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 19 Dec 2024 11:04:17 +0100 Subject: [PATCH 45/56] fast import: importer: 
use aws s3 cli (#10162) ## Problem s5cmd doesn't pick up the pod service account ``` 2024/12/16 16:26:01 Ignoring, HTTP credential provider invalid endpoint host, "169.254.170.23", only loopback hosts are allowed. ERROR "ls s3://neon-dev-bulk-import-us-east-2/import-pgdata/fast-import/v1/br-wandering-hall-w2xobawv": NoCredentialProviders: no valid providers in chain. Deprecated. For verbose messaging see aws.Config.CredentialsChainVerboseErrors ``` ## Summary of changes Switch to offical CLI. ## Testing Tested the pre-merge image in staging, using `job_image` override in project settings. https://neondb.slack.com/archives/C033RQ5SPDH/p1734554944391949?thread_ts=1734368383.258759&cid=C033RQ5SPDH ## Future Work Switch back to s5cmd once https://github.com/peak/s5cmd/pull/769 gets merged. ## Refs - fixes https://github.com/neondatabase/cloud/issues/21876 --------- Co-authored-by: Gleb Novikov --- compute/compute-node.Dockerfile | 24 ++++++++++--------- compute_tools/src/bin/fast_import.rs | 10 ++++---- .../fast_import/{s5cmd.rs => aws_s3_sync.rs} | 13 ++++------ 3 files changed, 23 insertions(+), 24 deletions(-) rename compute_tools/src/bin/fast_import/{s5cmd.rs => aws_s3_sync.rs} (50%) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 9f1f3b734363..5e7b4e8287ce 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1556,28 +1556,30 @@ RUN apt update && \ locales \ procps \ ca-certificates \ + curl \ + unzip \ $VERSION_INSTALLS && \ apt clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8 -# s5cmd 2.2.2 from https://github.com/peak/s5cmd/releases/tag/v2.2.2 -# used by fast_import +# aws cli is used by fast_import (curl and unzip above are at this time only used for this installation step) ARG TARGETARCH -ADD https://github.com/peak/s5cmd/releases/download/v2.2.2/s5cmd_2.2.2_linux_$TARGETARCH.deb /tmp/s5cmd.deb RUN set -ex; \ - \ - # Determine the expected checksum based on TARGETARCH if [ "${TARGETARCH}" = "amd64" ]; then \ - CHECKSUM="392c385320cd5ffa435759a95af77c215553d967e4b1c0fffe52e4f14c29cf85"; \ + TARGETARCH_ALT="x86_64"; \ + CHECKSUM="c9a9df3770a3ff9259cb469b6179e02829687a464e0824d5c32d378820b53a00"; \ elif [ "${TARGETARCH}" = "arm64" ]; then \ - CHECKSUM="939bee3cf4b5604ddb00e67f8c157b91d7c7a5b553d1fbb6890fad32894b7b46"; \ + TARGETARCH_ALT="aarch64"; \ + CHECKSUM="8181730be7891582b38b028112e81b4899ca817e8c616aad807c9e9d1289223a"; \ else \ echo "Unsupported architecture: ${TARGETARCH}"; exit 1; \ fi; \ - \ - # Compute and validate the checksum - echo "${CHECKSUM} /tmp/s5cmd.deb" | sha256sum -c - -RUN dpkg -i /tmp/s5cmd.deb && rm /tmp/s5cmd.deb + curl -L "https://awscli.amazonaws.com/awscli-exe-linux-${TARGETARCH_ALT}-2.17.5.zip" -o /tmp/awscliv2.zip; \ + echo "${CHECKSUM} /tmp/awscliv2.zip" | sha256sum -c -; \ + unzip /tmp/awscliv2.zip -d /tmp/awscliv2; \ + /tmp/awscliv2/aws/install; \ + rm -rf /tmp/awscliv2.zip /tmp/awscliv2; \ + true ENV LANG=en_US.utf8 USER postgres diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs index b6db3eb11abc..793ec4cf1094 100644 --- a/compute_tools/src/bin/fast_import.rs +++ b/compute_tools/src/bin/fast_import.rs @@ -34,12 +34,12 @@ use nix::unistd::Pid; use tracing::{info, info_span, warn, Instrument}; use utils::fs_ext::is_directory_empty; +#[path = "fast_import/aws_s3_sync.rs"] +mod aws_s3_sync; #[path = "fast_import/child_stdio_to_log.rs"] mod child_stdio_to_log; 
#[path = "fast_import/s3_uri.rs"] mod s3_uri; -#[path = "fast_import/s5cmd.rs"] -mod s5cmd; #[derive(clap::Parser)] struct Args { @@ -326,7 +326,7 @@ pub(crate) async fn main() -> anyhow::Result<()> { } info!("upload pgdata"); - s5cmd::sync(Utf8Path::new(&pgdata_dir), &s3_prefix.append("/")) + aws_s3_sync::sync(Utf8Path::new(&pgdata_dir), &s3_prefix.append("/pgdata/")) .await .context("sync dump directory to destination")?; @@ -334,10 +334,10 @@ pub(crate) async fn main() -> anyhow::Result<()> { { let status_dir = working_directory.join("status"); std::fs::create_dir(&status_dir).context("create status directory")?; - let status_file = status_dir.join("status"); + let status_file = status_dir.join("pgdata"); std::fs::write(&status_file, serde_json::json!({"done": true}).to_string()) .context("write status file")?; - s5cmd::sync(&status_file, &s3_prefix.append("/status/pgdata")) + aws_s3_sync::sync(&status_dir, &s3_prefix.append("/status/")) .await .context("sync status directory to destination")?; } diff --git a/compute_tools/src/bin/fast_import/s5cmd.rs b/compute_tools/src/bin/fast_import/aws_s3_sync.rs similarity index 50% rename from compute_tools/src/bin/fast_import/s5cmd.rs rename to compute_tools/src/bin/fast_import/aws_s3_sync.rs index d2d9a79736fe..5fa58c8f875f 100644 --- a/compute_tools/src/bin/fast_import/s5cmd.rs +++ b/compute_tools/src/bin/fast_import/aws_s3_sync.rs @@ -4,24 +4,21 @@ use camino::Utf8Path; use super::s3_uri::S3Uri; pub(crate) async fn sync(local: &Utf8Path, remote: &S3Uri) -> anyhow::Result<()> { - let mut builder = tokio::process::Command::new("s5cmd"); - // s5cmd uses aws-sdk-go v1, hence doesn't support AWS_ENDPOINT_URL - if let Some(val) = std::env::var_os("AWS_ENDPOINT_URL") { - builder.arg("--endpoint-url").arg(val); - } + let mut builder = tokio::process::Command::new("aws"); builder + .arg("s3") .arg("sync") .arg(local.as_str()) .arg(remote.to_string()); let st = builder .spawn() - .context("spawn s5cmd")? + .context("spawn aws s3 sync")? .wait() .await - .context("wait for s5cmd")?; + .context("wait for aws s3 sync")?; if st.success() { Ok(()) } else { - Err(anyhow::anyhow!("s5cmd failed")) + Err(anyhow::anyhow!("aws s3 sync failed")) } } From 43dc03459d42ec4c7ca028e48f5a0d8994ecf983 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Thu, 19 Dec 2024 11:25:44 +0100 Subject: [PATCH 46/56] Run pgbench on 10 GB scale factor on database with n relations (e.g. 10k) (#10172) ## Problem We want to verify how much / if pgbench throughput and latency on Neon suffers if the database contains many other relations, too. ## Summary of changes Modify the benchmarking.yml pgbench-compare job to - create an addiitional project at scale factor 10 GiB - before running pgbench add n tables (initially 10k) to the database - then compare the pgbench throughput and latency to the existing pgbench-compare at 10 Gib scale factor We use a realistic template for the n relations that is a partitioned table with some realistic data types, indexes and constraints - similar to a table that we use internally. 
Example run: https://github.com/neondatabase/neon/actions/runs/12377565956/job/34547386959 --- .github/workflows/benchmarking.yml | 25 ++- .../many_relations/create_many_relations.sql | 199 ++++++++++++++++++ .../performance/test_perf_many_relations.py | 66 ++++++ 3 files changed, 288 insertions(+), 2 deletions(-) create mode 100644 test_runner/performance/many_relations/create_many_relations.sql create mode 100644 test_runner/performance/test_perf_many_relations.py diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index bbdcf5ef493f..ab0f2a615517 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -308,6 +308,7 @@ jobs: "image": [ "'"$image_default"'" ], "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_default"', "image": "'"$image_default"'" }, { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new-many-tables","db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned-bookworm" }, { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned-bookworm" }, @@ -410,7 +411,7 @@ jobs: aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Neon Project - if: contains(fromJson('["neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform) + if: contains(fromJson('["neonvm-captest-new", "neonvm-captest-new-many-tables", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform) id: create-neon-project uses: ./.github/actions/neon-project-create with: @@ -429,7 +430,7 @@ jobs: neonvm-captest-sharding-reuse) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }} ;; - neonvm-captest-new | neonvm-captest-freetier | neonvm-azure-captest-new | neonvm-azure-captest-freetier) + neonvm-captest-new | neonvm-captest-new-many-tables | neonvm-captest-freetier | neonvm-azure-captest-new | neonvm-azure-captest-freetier) CONNSTR=${{ steps.create-neon-project.outputs.dsn }} ;; rds-aurora) @@ -446,6 +447,26 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT + # we want to compare Neon project OLTP throughput and latency at scale factor 10 GB + # without (neonvm-captest-new) + # and with (neonvm-captest-new-many-tables) many relations in the database + - name: Create many relations before the run + if: contains(fromJson('["neonvm-captest-new-many-tables"]'), matrix.platform) + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ env.BUILD_TYPE }} + test_selection: performance + run_in_parallel: false + save_perf_report: ${{ env.SAVE_PERF_REPORT }} + extra_params: -m remote_cluster --timeout 21600 -k test_perf_many_relations + pg_version: ${{ env.DEFAULT_PG_VERSION }} + aws-oicd-role-arn: ${{ 
vars.DEV_AWS_OIDC_ROLE_ARN }} + env: + BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} + VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + TEST_NUM_RELATIONS: 10000 + - name: Benchmark init uses: ./.github/actions/run-python-test-set with: diff --git a/test_runner/performance/many_relations/create_many_relations.sql b/test_runner/performance/many_relations/create_many_relations.sql new file mode 100644 index 000000000000..1b3673c9e185 --- /dev/null +++ b/test_runner/performance/many_relations/create_many_relations.sql @@ -0,0 +1,199 @@ +-- create a schema that simulates Neon control plane operations table +-- however use partitioned operations tables with many (e.g. 500) child partition tables per table +-- in summary we create multiple of these partitioned operations tables (with 500 childs each) - until we reach the requested number of tables + + +-- first we need some other tables that can be referenced by the operations table + +-- Table for branches +CREATE TABLE public.branches ( + id text PRIMARY KEY +); + +-- Table for endpoints +CREATE TABLE public.endpoints ( + id text PRIMARY KEY +); + +-- Table for projects +CREATE TABLE public.projects ( + id text PRIMARY KEY +); + +INSERT INTO public.branches (id) +VALUES ('branch_1'); + +-- Insert one row into endpoints +INSERT INTO public.endpoints (id) +VALUES ('endpoint_1'); + +-- Insert one row into projects +INSERT INTO public.projects (id) +VALUES ('project_1'); + +-- now we create a procedure that can create n operations tables +-- we do that in a procedure to save roundtrip latency when scaling the test to many tables +-- prefix is the base table name, e.g. 'operations_scale_1000' if we create 1000 tables +CREATE OR REPLACE PROCEDURE create_partitioned_tables(prefix text, n INT) +LANGUAGE plpgsql AS $$ +DECLARE + table_name TEXT; -- Variable to hold table names dynamically + i INT; -- Counter for the loop +BEGIN + -- Loop to create n partitioned tables + FOR i IN 1..n LOOP + table_name := format('%s_%s', prefix, i); + + -- Create the partitioned table + EXECUTE format( + 'CREATE TABLE public.%s ( + project_id character varying NOT NULL, + id uuid NOT NULL, + status integer, + action character varying NOT NULL, + error character varying, + created_at timestamp with time zone NOT NULL DEFAULT now(), + updated_at timestamp with time zone NOT NULL DEFAULT now(), + spec jsonb, + retry_at timestamp with time zone, + failures_count integer DEFAULT 0, + metadata jsonb NOT NULL DEFAULT ''{}''::jsonb, + executor_id text NOT NULL, + attempt_duration_ms integer, + metrics jsonb DEFAULT ''{}''::jsonb, + branch_id text, + endpoint_id text, + next_operation_id uuid, + compute_id text, + connection_attempt_at timestamp with time zone, + concurrency_key text, + queue_id text, + CONSTRAINT %s_pkey PRIMARY KEY (id, created_at), + CONSTRAINT %s_branch_id_fk FOREIGN KEY (branch_id) REFERENCES branches(id) ON DELETE CASCADE, + CONSTRAINT %s_endpoint_id_fk FOREIGN KEY (endpoint_id) REFERENCES endpoints(id) ON DELETE CASCADE, + CONSTRAINT %s_next_operation_id_fk FOREIGN KEY (next_operation_id, created_at) REFERENCES %s(id, created_at), + CONSTRAINT %s_project_id_fk FOREIGN KEY (project_id) REFERENCES projects(id) ON DELETE CASCADE + ) PARTITION BY RANGE (created_at)', + table_name, table_name, table_name, table_name, table_name, table_name, table_name + ); + + -- Add indexes for the partitioned table + EXECUTE format('CREATE INDEX index_%s_on_next_operation_id 
ON public.%s (next_operation_id)', table_name, table_name); + EXECUTE format('CREATE INDEX index_%s_on_project_id ON public.%s (project_id)', table_name, table_name); + EXECUTE format('CREATE INDEX %s_branch_id ON public.%s (branch_id)', table_name, table_name); + EXECUTE format('CREATE INDEX %s_branch_id_created_idx ON public.%s (branch_id, created_at)', table_name, table_name); + EXECUTE format('CREATE INDEX %s_created_at_idx ON public.%s (created_at)', table_name, table_name); + EXECUTE format('CREATE INDEX %s_created_at_project_id_id_cond_idx ON public.%s (created_at, project_id, id)', table_name, table_name); + EXECUTE format('CREATE INDEX %s_endpoint_id ON public.%s (endpoint_id)', table_name, table_name); + EXECUTE format( + 'CREATE INDEX %s_for_redo_worker_idx ON public.%s (executor_id) WHERE status <> 1', + table_name, table_name + ); + EXECUTE format( + 'CREATE INDEX %s_project_id_status_index ON public.%s ((project_id::text), status)', + table_name, table_name + ); + EXECUTE format( + 'CREATE INDEX %s_status_not_finished ON public.%s (status) WHERE status <> 1', + table_name, table_name + ); + EXECUTE format('CREATE INDEX %s_updated_at_desc_idx ON public.%s (updated_at DESC)', table_name, table_name); + EXECUTE format( + 'CREATE INDEX %s_with_failures ON public.%s (failures_count) WHERE failures_count > 0', + table_name, table_name + ); + END LOOP; +END; +$$; + +-- next we create a procedure that can add the child partitions (one per day) to each of the operations tables +CREATE OR REPLACE PROCEDURE create_operations_partitions( + table_name TEXT, + start_date DATE, + end_date DATE +) +LANGUAGE plpgsql AS $$ +DECLARE + partition_date DATE; + partition_name TEXT; + counter INT := 0; -- Counter to track the number of tables created in the current transaction +BEGIN + partition_date := start_date; + + -- Create partitions in batches + WHILE partition_date < end_date LOOP + partition_name := format('%s_%s', table_name, to_char(partition_date,'YYYY_MM_DD')); + + EXECUTE format( + 'CREATE TABLE IF NOT EXISTS public.%s PARTITION OF public.%s + FOR VALUES FROM (''%s'') TO (''%s'')', + partition_name, + table_name, + partition_date, + partition_date + INTERVAL '1 day' + ); + + counter := counter + 1; + + -- Commit and reset counter after every 100 partitions + IF counter >= 100 THEN + COMMIT; + counter := 0; -- Reset the counter + END IF; + + -- Advance to the next day + partition_date := partition_date + INTERVAL '1 day'; + END LOOP; + + -- Final commit for remaining partitions + IF counter > 0 THEN + COMMIT; + END IF; + + -- Insert synthetic rows into each partition + EXECUTE format( + 'INSERT INTO %I ( + project_id, + branch_id, + endpoint_id, + id, + status, + action, + created_at, + updated_at, + spec, + metadata, + executor_id, + failures_count + ) + SELECT + ''project_1'', -- project_id + ''branch_1'', -- branch_id + ''endpoint_1'', -- endpoint_id + ''e8bba687-0df9-4291-bfcd-7d5f6aa7c158'', -- unique id + 1, -- status + ''SYNTHETIC_ACTION'', -- action + gs::timestamp + interval ''0 ms'', -- created_at + gs::timestamp + interval ''1 minute'', -- updated_at + ''{"key": "value"}'', -- spec (JSONB) + ''{"metadata_key": "metadata_value"}'', -- metadata (JSONB) + ''executor_1'', -- executor_id + 0 -- failures_count + FROM generate_series(%L, %L::DATE - INTERVAL ''1 day'', INTERVAL ''1 day'') AS gs', + table_name, start_date, end_date + ); + + -- Commit the inserted rows + COMMIT; +END; +$$; + +-- we can now create partitioned tables using something like +-- CALL 
create_partitioned_tables('operations_scale_1000' ,10); + +-- and we can create the child partitions for a table using something like +-- CALL create_operations_partitions( +-- 'operations_scale_1000_1', +-- '2000-01-01', -- Start date +-- ('2000-01-01'::DATE + INTERVAL '1 day' * 500)::DATE -- End date (start date + number of days) +-- ); diff --git a/test_runner/performance/test_perf_many_relations.py b/test_runner/performance/test_perf_many_relations.py new file mode 100644 index 000000000000..0ee0efe8b9c1 --- /dev/null +++ b/test_runner/performance/test_perf_many_relations.py @@ -0,0 +1,66 @@ +import os +from pathlib import Path + +import pytest +from fixtures.compare_fixtures import RemoteCompare +from fixtures.log_helper import log + + +def get_num_relations(default: int = 1000) -> list[int]: + # We parametrize each run with scale specifying the number of wanted child partitions. + # Databases are pre-created and passed through BENCHMARK_CONNSTR env variable. + scales = os.getenv("TEST_NUM_RELATIONS", default=str(default)) + rv = [] + for s in scales.split(","): + scale = int(s) + rv.append(scale) + return rv + + +@pytest.mark.parametrize("num_relations", get_num_relations()) +@pytest.mark.remote_cluster +def test_perf_many_relations(remote_compare: RemoteCompare, num_relations: int): + """ + Test creating many relations in a single database. + We use partitioned tables with child tables, indexes and constraints to have a realistic schema. + Also we include some common data types like text, uuid, timestamp, JSONB, etc. + + see many_relations/create_many_relations.sql + """ + env = remote_compare + + # prepare some base tables and the plpgsql procedures that we use to create the tables + sql_file = Path(__file__).parent / "many_relations" / "create_many_relations.sql" + env.pg_bin.run_capture(["psql", env.pg.connstr(), "-f", str(sql_file)]) + + num_parent_tables = num_relations // 500 + 1 + log.info(f"Creating {num_relations} relations in {num_parent_tables} parent tables") + + log.info(f"Creating {num_parent_tables} parent tables") + sql = f"CALL create_partitioned_tables('operations_scale_{num_relations}', {num_parent_tables})" + log.info(sql) + env.pg_bin.run_capture(["psql", env.pg.connstr(), "-c", sql]) + + current_table = 0 + num_relations_remaining = num_relations + + # now run and measure the actual relation creation + while num_relations_remaining > 0: + current_table += 1 + parent_table_name = f"operations_scale_{num_relations}_{current_table}" + if num_relations_remaining > 500: + num_relations_to_create = 500 + else: + num_relations_to_create = num_relations_remaining + num_relations_remaining -= num_relations_to_create + log.info( + f"Creating {num_relations_to_create} child tables in partitioned parent table '{parent_table_name}'" + ) + sql = f"CALL create_operations_partitions( '{parent_table_name}', '2000-01-01', ('2000-01-01'::DATE + INTERVAL '1 day' * {num_relations_to_create})::DATE)" + log.info(sql) + with env.zenbenchmark.record_duration( + f"CREATE_TABLE/{current_table}/{num_relations_to_create}" + ): + env.pg_bin.run_capture( + ["psql", env.pg.connstr(options="-cstatement_timeout=1000s "), "-c", sql] + ) From b135194090369d8e5452c9ee1c6e7c37cc9ba8bd Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Thu, 19 Dec 2024 11:37:08 +0100 Subject: [PATCH 47/56] proxy: Delay SASL complete message until auth is done (#10189) The final SASL complete message can be bundled with the remainder of the auth flow messages until ReadyForQuery. 
neondatabase/cloud#19184 --- proxy/src/auth/backend/mod.rs | 3 +++ proxy/src/sasl/stream.rs | 8 +++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index 50cb94bfa069..0c9a7f7825de 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -678,6 +678,9 @@ mod tests { .await .unwrap(); + // flush the final server message + stream.flush().await.unwrap(); + handle.await.unwrap(); } diff --git a/proxy/src/sasl/stream.rs b/proxy/src/sasl/stream.rs index f1c916daa2b7..ac7755656683 100644 --- a/proxy/src/sasl/stream.rs +++ b/proxy/src/sasl/stream.rs @@ -50,6 +50,12 @@ impl SaslStream<'_, S> { self.stream.write_message(&msg.to_reply()).await?; Ok(()) } + + // Queue a SASL message for the client. + fn send_noflush(&mut self, msg: &ServerMessage<&str>) -> io::Result<()> { + self.stream.write_message_noflush(&msg.to_reply())?; + Ok(()) + } } /// SASL authentication outcome. @@ -85,7 +91,7 @@ impl SaslStream<'_, S> { continue; } Step::Success(result, reply) => { - self.send(&ServerMessage::Final(&reply)).await?; + self.send_noflush(&ServerMessage::Final(&reply))?; Outcome::Success(result) } Step::Failure(reason) => Outcome::Failure(reason), From 65042cbadd0426c43499bb7675e671b5c6e980e9 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 19 Dec 2024 10:58:49 +0000 Subject: [PATCH 48/56] tests: use high IO concurrency in `test_pgdata_import_smoke`, use `effective_io_concurrency=2` in tests by default (#10114) ## Problem `test_pgdata_import_smoke` writes two gigabytes of pages and then reads them back serially. This is CPU bottlenecked and results in a long runtime, and sensitivity to CPU load from other tests on the same machine. Closes: https://github.com/neondatabase/neon/issues/10071 ## Summary of changes - Use effective_io_concurrency=32 when doing sequential scans through 2GiB of pages in test_pgdata_import_smoke. This is a ~10x runtime decrease in the parts of the test that do sequential scans. - Also set `effective_io_concurrency=2` for tests, as I noticed while debugging that we were doing all getpage requests serially, which is bad for checking the stability of the batching code. --- control_plane/src/endpoint.rs | 4 ++++ test_runner/regress/test_import_pgdata.py | 14 +++++++++++--- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 1fdf32605161..5ebf842813f1 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -316,6 +316,10 @@ impl Endpoint { // and can cause errors like 'no unpinned buffers available', see // conf.append("shared_buffers", "1MB"); + // Postgres defaults to effective_io_concurrency=1, which does not exercise the pageserver's + // batching logic. 
Set this to 2 so that we exercise the code a bit without letting + // individual tests do a lot of concurrent work on underpowered test machines + conf.append("effective_io_concurrency", "2"); conf.append("fsync", "off"); conf.append("max_connections", "100"); conf.append("wal_level", "logical"); diff --git a/test_runner/regress/test_import_pgdata.py b/test_runner/regress/test_import_pgdata.py index 29229b73c156..6ea2393a9d81 100644 --- a/test_runner/regress/test_import_pgdata.py +++ b/test_runner/regress/test_import_pgdata.py @@ -84,6 +84,8 @@ def handler(request: Request) -> Response: elif rel_block_size == RelBlockSize.TWO_STRPES_PER_SHARD: target_relblock_size = (shard_count or 1) * stripe_size * 8192 * 2 elif rel_block_size == RelBlockSize.MULTIPLE_RELATION_SEGMENTS: + # Postgres uses a 1GiB segment size, fixed at compile time, so we must use >2GB of data + # to exercise multiple segments. target_relblock_size = int(((2.333 * 1024 * 1024 * 1024) // 8192) * 8192) else: raise ValueError @@ -111,9 +113,15 @@ def handler(request: Request) -> Response: def validate_vanilla_equivalence(ep): # TODO: would be nicer to just compare pgdump - assert ep.safe_psql("select count(*), sum(data::bigint)::bigint from t") == [ - (expect_nrows, expect_sum) - ] + + # Enable IO concurrency for batching on large sequential scan, to avoid making + # this test unnecessarily onerous on CPU + assert ep.safe_psql_many( + [ + "set effective_io_concurrency=32;", + "select count(*), sum(data::bigint)::bigint from t", + ] + ) == [[], [(expect_nrows, expect_sum)]] validate_vanilla_equivalence(vanilla_pg) From afda6d4700ca0b521ecb412f0bc81e80e5903dbd Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 19 Dec 2024 12:55:05 +0000 Subject: [PATCH 49/56] storage_scrubber: don't report half-created timelines as corruption (#10198) ## Problem test_timeline_archival_chaos does timeline creation with failure injection, and thereby sometimes leaves timelines in a part created state. This was being reported as corruption by the scrubber on test teardown, because it considered a layer without an index to be an invalid state. This was incorrect: the scrubber should accept this state, it occurs legitimately during timeline creation. Closes: https://github.com/neondatabase/neon/issues/9988 ## Summary of changes - Report a timeline with layers but no index as Relic rather than MissingIndexPart. - We retain the MissingIndexPart variant for the case where an index _was_ found in the listing, but was not found by a subsequent GET, i.e. racing with deletion. --- storage_scrubber/src/checks.rs | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index f759f54d19b7..32c86052efaa 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -310,7 +310,7 @@ pub(crate) enum BlobDataParseResult { index_part_generation: Generation, s3_layers: HashSet<(LayerName, Generation)>, }, - /// The remains of a deleted Timeline (i.e. an initdb archive only) + /// The remains of an uncleanly deleted Timeline or aborted timeline creation(e.g. an initdb archive only, or some layer without an index) Relic, Incorrect { errors: Vec, @@ -346,7 +346,7 @@ pub(crate) async fn list_timeline_blobs( match res { ListTimelineBlobsResult::Ready(data) => Ok(data), ListTimelineBlobsResult::MissingIndexPart(_) => { - // Retry if index is missing. 
+ // Retry if listing raced with removal of an index let data = list_timeline_blobs_impl(remote_client, id, root_target) .await? .into_data(); @@ -358,7 +358,7 @@ pub(crate) async fn list_timeline_blobs( enum ListTimelineBlobsResult { /// Blob data is ready to be intepreted. Ready(RemoteTimelineBlobData), - /// List timeline blobs has layer files but is missing [`IndexPart`]. + /// The listing contained an index but when we tried to fetch it, we couldn't MissingIndexPart(RemoteTimelineBlobData), } @@ -467,19 +467,19 @@ async fn list_timeline_blobs_impl( match index_part_object.as_ref() { Some(selected) => index_part_keys.retain(|k| k != selected), None => { - // It is possible that the branch gets deleted after we got some layer files listed - // and we no longer have the index file in the listing. - errors.push( + // This case does not indicate corruption, but it should be very unusual. It can + // happen if: + // - timeline creation is in progress (first layer is written before index is written) + // - timeline deletion happened while a stale pageserver was still attached, it might upload + // a layer after the deletion is done. + tracing::info!( "S3 list response got no index_part.json file but still has layer files" - .to_string(), ); - return Ok(ListTimelineBlobsResult::MissingIndexPart( - RemoteTimelineBlobData { - blob_data: BlobDataParseResult::Incorrect { errors, s3_layers }, - unused_index_keys: index_part_keys, - unknown_keys, - }, - )); + return Ok(ListTimelineBlobsResult::Ready(RemoteTimelineBlobData { + blob_data: BlobDataParseResult::Relic, + unused_index_keys: index_part_keys, + unknown_keys, + })); } } From 502d512fe2ca9f07392428d70d5262cf3f5103e2 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 19 Dec 2024 14:04:42 +0000 Subject: [PATCH 50/56] safekeeper: lift benchmarking utils into safekeeper crate (#10200) ## Problem The benchmarking utilities are also useful for testing. We want to write tests in the safekeeper crate. ## Summary of changes This commit lifts the utils to the safekeeper crate. They are compiled if the benchmarking features is enabled or if in test mode. --- libs/postgres_ffi/src/wal_generator.rs | 6 ++-- safekeeper/Cargo.toml | 2 ++ safekeeper/benches/receive_wal.rs | 23 +++++++-------- safekeeper/src/lib.rs | 3 ++ .../benchutils.rs => src/test_utils.rs} | 28 ++++++++++--------- .../tests/walproposer_sim/walproposer_disk.rs | 2 +- 6 files changed, 36 insertions(+), 28 deletions(-) rename safekeeper/{benches/benchutils.rs => src/test_utils.rs} (78%) diff --git a/libs/postgres_ffi/src/wal_generator.rs b/libs/postgres_ffi/src/wal_generator.rs index 69cc4b771fa1..a72b035e17bc 100644 --- a/libs/postgres_ffi/src/wal_generator.rs +++ b/libs/postgres_ffi/src/wal_generator.rs @@ -106,11 +106,11 @@ impl WalGenerator { const TIMELINE_ID: u32 = 1; /// Creates a new WAL generator with the given record generator. - pub fn new(record_generator: R) -> WalGenerator { + pub fn new(record_generator: R, start_lsn: Lsn) -> WalGenerator { Self { record_generator, - lsn: Lsn(0), - prev_lsn: Lsn(0), + lsn: start_lsn, + prev_lsn: start_lsn, } } diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 086407603f80..3ebb7097f200 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -9,6 +9,7 @@ default = [] # Enables test-only APIs, incuding failpoints. 
In particular, enables the `fail_point!` macro, # which adds some runtime cost to run tests on outage conditions testing = ["fail/failpoints"] +benchmarking = [] [dependencies] async-stream.workspace = true @@ -77,3 +78,4 @@ tracing-subscriber = { workspace = true, features = ["json"] } [[bench]] name = "receive_wal" harness = false +required-features = ["benchmarking"] diff --git a/safekeeper/benches/receive_wal.rs b/safekeeper/benches/receive_wal.rs index 313d945b942f..996c4d9b8c87 100644 --- a/safekeeper/benches/receive_wal.rs +++ b/safekeeper/benches/receive_wal.rs @@ -1,11 +1,7 @@ //! WAL ingestion benchmarks. -#[path = "benchutils.rs"] -mod benchutils; - use std::io::Write as _; -use benchutils::Env; use bytes::BytesMut; use camino_tempfile::tempfile; use criterion::{criterion_group, criterion_main, BatchSize, Bencher, Criterion}; @@ -16,6 +12,7 @@ use safekeeper::receive_wal::{self, WalAcceptor}; use safekeeper::safekeeper::{ AcceptorProposerMessage, AppendRequest, AppendRequestHeader, ProposerAcceptorMessage, }; +use safekeeper::test_utils::Env; use tokio::io::AsyncWriteExt as _; use utils::id::{NodeId, TenantTimelineId}; use utils::lsn::Lsn; @@ -76,12 +73,15 @@ fn bench_process_msg(c: &mut Criterion) { assert!(size >= prefixlen); let message = vec![0; size - prefixlen]; - let walgen = &mut WalGenerator::new(LogicalMessageGenerator::new(prefix, &message)); + let walgen = &mut WalGenerator::new(LogicalMessageGenerator::new(prefix, &message), Lsn(0)); // Set up the Safekeeper. let env = Env::new(fsync)?; - let mut safekeeper = - runtime.block_on(env.make_safekeeper(NodeId(1), TenantTimelineId::generate()))?; + let mut safekeeper = runtime.block_on(env.make_safekeeper( + NodeId(1), + TenantTimelineId::generate(), + Lsn(0), + ))?; b.iter_batched_ref( // Pre-construct WAL records and requests. Criterion will batch them. @@ -134,7 +134,8 @@ fn bench_wal_acceptor(c: &mut Criterion) { let runtime = tokio::runtime::Runtime::new()?; // needs multithreaded let env = Env::new(fsync)?; - let walgen = &mut WalGenerator::new(LogicalMessageGenerator::new(c"prefix", b"message")); + let walgen = + &mut WalGenerator::new(LogicalMessageGenerator::new(c"prefix", b"message"), Lsn(0)); // Create buffered channels that can fit all requests, to avoid blocking on channels. let (msg_tx, msg_rx) = tokio::sync::mpsc::channel(n); @@ -145,7 +146,7 @@ fn bench_wal_acceptor(c: &mut Criterion) { // TODO: WalAcceptor doesn't actually need a full timeline, only // Safekeeper::process_msg(). Consider decoupling them to simplify the setup. let tli = env - .make_timeline(NodeId(1), TenantTimelineId::generate()) + .make_timeline(NodeId(1), TenantTimelineId::generate(), Lsn(0)) .await? .wal_residence_guard() .await?; @@ -239,7 +240,7 @@ fn bench_wal_acceptor_throughput(c: &mut Criterion) { assert!(size >= prefixlen); let message = vec![0; size - prefixlen]; - let walgen = &mut WalGenerator::new(LogicalMessageGenerator::new(prefix, &message)); + let walgen = &mut WalGenerator::new(LogicalMessageGenerator::new(prefix, &message), Lsn(0)); // Construct and spawn the WalAcceptor task. let env = Env::new(fsync)?; @@ -249,7 +250,7 @@ fn bench_wal_acceptor_throughput(c: &mut Criterion) { runtime.block_on(async { let tli = env - .make_timeline(NodeId(1), TenantTimelineId::generate()) + .make_timeline(NodeId(1), TenantTimelineId::generate(), Lsn(0)) .await? 
.wal_residence_guard() .await?; diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index abe6e00a665b..7acf355e6a71 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -43,6 +43,9 @@ pub mod wal_reader_stream; pub mod wal_service; pub mod wal_storage; +#[cfg(any(test, feature = "benchmarking"))] +pub mod test_utils; + mod timelines_global_map; use std::sync::Arc; pub use timelines_global_map::GlobalTimelines; diff --git a/safekeeper/benches/benchutils.rs b/safekeeper/src/test_utils.rs similarity index 78% rename from safekeeper/benches/benchutils.rs rename to safekeeper/src/test_utils.rs index 48d796221b43..c40a8bae5a7d 100644 --- a/safekeeper/benches/benchutils.rs +++ b/safekeeper/src/test_utils.rs @@ -1,18 +1,18 @@ use std::sync::Arc; +use crate::rate_limit::RateLimiter; +use crate::safekeeper::{ProposerAcceptorMessage, ProposerElected, SafeKeeper, TermHistory}; +use crate::state::{TimelinePersistentState, TimelineState}; +use crate::timeline::{get_timeline_dir, SharedState, StateSK, Timeline}; +use crate::timelines_set::TimelinesSet; +use crate::wal_backup::remote_timeline_path; +use crate::{control_file, wal_storage, SafeKeeperConf}; use camino_tempfile::Utf8TempDir; -use safekeeper::rate_limit::RateLimiter; -use safekeeper::safekeeper::{ProposerAcceptorMessage, ProposerElected, SafeKeeper, TermHistory}; -use safekeeper::state::{TimelinePersistentState, TimelineState}; -use safekeeper::timeline::{get_timeline_dir, SharedState, StateSK, Timeline}; -use safekeeper::timelines_set::TimelinesSet; -use safekeeper::wal_backup::remote_timeline_path; -use safekeeper::{control_file, wal_storage, SafeKeeperConf}; use tokio::fs::create_dir_all; use utils::id::{NodeId, TenantTimelineId}; use utils::lsn::Lsn; -/// A Safekeeper benchmarking environment. Uses a tempdir for storage, removed on drop. +/// A Safekeeper testing or benchmarking environment. Uses a tempdir for storage, removed on drop. pub struct Env { /// Whether to enable fsync. pub fsync: bool, @@ -21,7 +21,7 @@ pub struct Env { } impl Env { - /// Creates a new benchmarking environment in a temporary directory. fsync controls whether to + /// Creates a new test or benchmarking environment in a temporary directory. fsync controls whether to /// enable fsyncing. 
pub fn new(fsync: bool) -> anyhow::Result { let tempdir = camino_tempfile::tempdir()?; @@ -47,6 +47,7 @@ impl Env { &self, node_id: NodeId, ttid: TenantTimelineId, + start_lsn: Lsn, ) -> anyhow::Result> { let conf = self.make_conf(node_id); @@ -67,9 +68,9 @@ impl Env { safekeeper .process_msg(&ProposerAcceptorMessage::Elected(ProposerElected { term: 1, - start_streaming_at: Lsn(0), - term_history: TermHistory(vec![(1, Lsn(0)).into()]), - timeline_start_lsn: Lsn(0), + start_streaming_at: start_lsn, + term_history: TermHistory(vec![(1, start_lsn).into()]), + timeline_start_lsn: start_lsn, })) .await?; @@ -82,12 +83,13 @@ impl Env { &self, node_id: NodeId, ttid: TenantTimelineId, + start_lsn: Lsn, ) -> anyhow::Result> { let conf = Arc::new(self.make_conf(node_id)); let timeline_dir = get_timeline_dir(&conf, &ttid); let remote_path = remote_timeline_path(&ttid)?; - let safekeeper = self.make_safekeeper(node_id, ttid).await?; + let safekeeper = self.make_safekeeper(node_id, ttid, start_lsn).await?; let shared_state = SharedState::new(StateSK::Loaded(safekeeper)); let timeline = Timeline::new( diff --git a/safekeeper/tests/walproposer_sim/walproposer_disk.rs b/safekeeper/tests/walproposer_sim/walproposer_disk.rs index aefb3919a1b3..7dc7f485487b 100644 --- a/safekeeper/tests/walproposer_sim/walproposer_disk.rs +++ b/safekeeper/tests/walproposer_sim/walproposer_disk.rs @@ -18,7 +18,7 @@ impl DiskWalProposer { internal_available_lsn: Lsn(0), prev_lsn: Lsn(0), disk: BlockStorage::new(), - wal_generator: WalGenerator::new(LogicalMessageGenerator::new(c"", &[])), + wal_generator: WalGenerator::new(LogicalMessageGenerator::new(c"", &[]), Lsn(0)), }), }) } From 628451d68ec30c0fb591e14a64f696b031d7ca88 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 19 Dec 2024 14:04:46 +0000 Subject: [PATCH 51/56] safekeeper: short-circuit interpreted wal sender (#10202) ## Problem The safekeeper may currently send a batch to the pageserver even if it hasn't decoded a new record. I think this is quite unlikely in the field, but it is worth addressing. ## Summary of changes Don't send anything if we haven't decoded a full record. Once this merges and is released, the `InterpretedWalRecords` struct can be updated to remove the Option wrapper for `next_record_lsn`. 
--- safekeeper/src/send_interpreted_wal.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/safekeeper/src/send_interpreted_wal.rs b/safekeeper/src/send_interpreted_wal.rs index 25890304221e..7d215176dd91 100644 --- a/safekeeper/src/send_interpreted_wal.rs +++ b/safekeeper/src/send_interpreted_wal.rs @@ -94,9 +94,14 @@ impl InterpretedWalSender<'_, IO> { } } + let max_next_record_lsn = match max_next_record_lsn { + Some(lsn) => lsn, + None => { continue; } + }; + let batch = InterpretedWalRecords { records, - next_record_lsn: max_next_record_lsn + next_record_lsn: Some(max_next_record_lsn), }; tx.send(Batch {wal_end_lsn, available_wal_end_lsn, records: batch}).await.unwrap(); From 04517c6ff3db53b7145aff41a9de208648194a6d Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 19 Dec 2024 17:22:39 +0200 Subject: [PATCH 52/56] Do not reload config file on PS reconnect (#10204) ## Problem See https://github.com/neondatabase/neon/issues/10184 and https://neondb.slack.com/archives/C04DGM6SMTM/p1733997259898819 Reloading the config file inside a parallel worker causes its termination. ## Summary of changes Remove the call to `HandleMainLoopInterrupts()`. The pageserver URL update is propagated by the postmaster through shared memory, so we should not reload the config for it. Co-authored-by: Konstantin Knizhnik --- pgxn/neon/libpagestore.c | 1 - 1 file changed, 1 deletion(-) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 6513ba4dd67c..88d0a5292bf7 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -827,7 +827,6 @@ pageserver_send(shardno_t shard_no, NeonRequest *request) { while (!pageserver_connect(shard_no, shard->n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR)) { - HandleMainLoopInterrupts(); shard->n_reconnect_attempts += 1; } shard->n_reconnect_attempts = 0; From b89e02f3e8efccfb900685f2d6c1fe18d13cb956 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Thu, 19 Dec 2024 13:04:53 -0500 Subject: [PATCH 53/56] fix(pageserver): consider partial compaction layer map in layer check (#10044) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem In https://github.com/neondatabase/neon/pull/9897 we temporarily disabled the layer valid check because the current one only considers the end result of all compaction algorithms, but partial gc-compaction would temporarily produce an "invalid" layer map. Part of https://github.com/neondatabase/neon/issues/9114 ## Summary of changes Allow LSN splits to overlap in the slow path check. Currently, the valid check is only used in the storage scrubber (a background job) and during gc-compaction (without taking the layer lock). Therefore, it's fine for such checks to be a bit less efficient but more accurate. 
--------- Signed-off-by: Alex Chi Z Co-authored-by: Arpad Müller --- pageserver/src/tenant/checks.rs | 47 +++++++++++++------ pageserver/src/tenant/timeline/compaction.rs | 48 ++++++++++++++++---- 2 files changed, 71 insertions(+), 24 deletions(-) diff --git a/pageserver/src/tenant/checks.rs b/pageserver/src/tenant/checks.rs index 1e8fa8d1d64e..f98356242e1c 100644 --- a/pageserver/src/tenant/checks.rs +++ b/pageserver/src/tenant/checks.rs @@ -1,12 +1,15 @@ use std::collections::BTreeSet; use itertools::Itertools; +use pageserver_compaction::helpers::overlaps_with; use super::storage_layer::LayerName; /// Checks whether a layer map is valid (i.e., is a valid result of the current compaction algorithm if nothing goes wrong). /// -/// The function checks if we can split the LSN range of a delta layer only at the LSNs of the delta layers. For example, +/// The function implements a fast path check and a slow path check. +/// +/// The fast path checks if we can split the LSN range of a delta layer only at the LSNs of the delta layers. For example, /// /// ```plain /// | | | | @@ -25,31 +28,47 @@ use super::storage_layer::LayerName; /// | | | 4 | | | /// /// If layer 2 and 4 contain the same single key, this is also a valid layer map. +/// +/// However, if a partial compaction is still going on, it is possible that we get a layer map not satisfying the above condition. +/// Therefore, we fallback to simply check if any of the two delta layers overlap. (See "A slow path...") pub fn check_valid_layermap(metadata: &[LayerName]) -> Option { let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?) let mut all_delta_layers = Vec::new(); for name in metadata { if let LayerName::Delta(layer) = name { - if layer.key_range.start.next() != layer.key_range.end { - all_delta_layers.push(layer.clone()); - } + all_delta_layers.push(layer.clone()); } } for layer in &all_delta_layers { - let lsn_range = &layer.lsn_range; - lsn_split_point.insert(lsn_range.start); - lsn_split_point.insert(lsn_range.end); + if layer.key_range.start.next() != layer.key_range.end { + let lsn_range = &layer.lsn_range; + lsn_split_point.insert(lsn_range.start); + lsn_split_point.insert(lsn_range.end); + } } - for layer in &all_delta_layers { + for (idx, layer) in all_delta_layers.iter().enumerate() { + if layer.key_range.start.next() == layer.key_range.end { + continue; + } let lsn_range = layer.lsn_range.clone(); let intersects = lsn_split_point.range(lsn_range).collect_vec(); if intersects.len() > 1 { - let err = format!( - "layer violates the layer map LSN split assumption: layer {} intersects with LSN [{}]", - layer, - intersects.into_iter().map(|lsn| lsn.to_string()).join(", ") - ); - return Some(err); + // A slow path to check if the layer intersects with any other delta layer. 
+ for (other_idx, other_layer) in all_delta_layers.iter().enumerate() { + if other_idx == idx { + // do not check self intersects with self + continue; + } + if overlaps_with(&layer.lsn_range, &other_layer.lsn_range) + && overlaps_with(&layer.key_range, &other_layer.key_range) + { + let err = format!( + "layer violates the layer map LSN split assumption: layer {} intersects with layer {}", + layer, other_layer + ); + return Some(err); + } + } } } None diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index a4e8f3952265..01f2a5b170d2 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -29,6 +29,7 @@ use utils::id::TimelineId; use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; use crate::page_cache; use crate::statvfs::Statvfs; +use crate::tenant::checks::check_valid_layermap; use crate::tenant::remote_timeline_client::WaitCompletionError; use crate::tenant::storage_layer::batch_split_writer::{ BatchWriterResult, SplitDeltaLayerWriter, SplitImageLayerWriter, @@ -2156,15 +2157,14 @@ impl Timeline { // Step 1: construct a k-merge iterator over all layers. // Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point. - // disable the check for now because we need to adjust the check for partial compactions, will enable later. - // let layer_names = job_desc - // .selected_layers - // .iter() - // .map(|layer| layer.layer_desc().layer_name()) - // .collect_vec(); - // if let Some(err) = check_valid_layermap(&layer_names) { - // warn!("gc-compaction layer map check failed because {}, this is normal if partial compaction is not finished yet", err); - // } + let layer_names = job_desc + .selected_layers + .iter() + .map(|layer| layer.layer_desc().layer_name()) + .collect_vec(); + if let Some(err) = check_valid_layermap(&layer_names) { + bail!("gc-compaction layer map check failed because {}, cannot proceed with compaction due to potential data loss", err); + } // The maximum LSN we are processing in this compaction loop let end_lsn = job_desc .selected_layers @@ -2546,8 +2546,36 @@ impl Timeline { ); // Step 3: Place back to the layer map. + + // First, do a sanity check to ensure the newly-created layer map does not contain overlaps. + let all_layers = { + let guard = self.layers.read().await; + let layer_map = guard.layer_map()?; + layer_map.iter_historic_layers().collect_vec() + }; + + let mut final_layers = all_layers + .iter() + .map(|layer| layer.layer_name()) + .collect::>(); + for layer in &layer_selection { + final_layers.remove(&layer.layer_desc().layer_name()); + } + for layer in &compact_to { + final_layers.insert(layer.layer_desc().layer_name()); + } + let final_layers = final_layers.into_iter().collect_vec(); + + // TODO: move this check before we call `finish` on image layer writers. However, this will require us to get the layer name before we finish + // the writer, so potentially, we will need a function like `ImageLayerBatchWriter::get_all_pending_layer_keys` to get all the keys that are + // in the writer before finalizing the persistent layers. Now we would leave some dangling layers on the disk if the check fails. 
+ if let Some(err) = check_valid_layermap(&final_layers) { + bail!("gc-compaction layer map check failed after compaction because {}, compaction result not applied to the layer map due to potential data loss", err); + } + + // Between the sanity check and this compaction update, there could be new layers being flushed, but it should be fine because we only + // operate on L1 layers. { - // TODO: sanity check if the layer map is valid (i.e., should not have overlaps) let mut guard = self.layers.write().await; guard .open_mut()? From 197a89ab3dee0ff90b060c92032a1a8e0b3213a8 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 19 Dec 2024 20:32:32 +0200 Subject: [PATCH 54/56] =?UTF-8?q?Increase=20default=20stotrage=20controlle?= =?UTF-8?q?r=20heartbeat=20interval=20from=20100msec=20=E2=80=A6=20(#10206?= =?UTF-8?q?)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem Currently default value of storage controller heartbeat interval is 100msec. It means that 10 times per second it establish connection to PS. And it seems to be quite expensive. At MacOS right now storage_controller consumes 70% CPU and trusts - 30%. So together they completely utilize one core. A lot of us has Macs. Let's save environment a little bit and do not waste electricity and contribute to global warming. By the way, on prod we have interval 10seconds ## Summary of changes Increase heartbeat interval from 100msec to 1 second. Co-authored-by: Konstantin Knizhnik --- control_plane/src/local_env.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 489f9c850963..5b82acb3a53f 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -180,7 +180,7 @@ impl NeonStorageControllerConf { const DEFAULT_MAX_WARMING_UP_INTERVAL: std::time::Duration = std::time::Duration::from_secs(30); // Very tight heartbeat interval to speed up tests - const DEFAULT_HEARTBEAT_INTERVAL: std::time::Duration = std::time::Duration::from_millis(100); + const DEFAULT_HEARTBEAT_INTERVAL: std::time::Duration = std::time::Duration::from_millis(1000); } impl Default for NeonStorageControllerConf { From 9c53b41245e3aecba30c2e05df4eeabe45fd39ac Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Thu, 19 Dec 2024 13:40:20 -0500 Subject: [PATCH 55/56] fix(pageserver): update remote latest_gc_cutoff after gc-compaction (#10209) ## Problem close https://github.com/neondatabase/neon/issues/10208 part of #9114 ## Summary of changes * Ensure remote `latest_gc_cutoff` is up-to-date before removing any files for gc-compaction. Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline/compaction.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 01f2a5b170d2..94c65631b206 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -2581,6 +2581,13 @@ impl Timeline { .open_mut()? .finish_gc_compaction(&layer_selection, &compact_to, &self.metrics) }; + + // Schedule an index-only upload to update the `latest_gc_cutoff` in the index_part.json. + // Otherwise, after restart, the index_part only contains the old `latest_gc_cutoff` and + // find_gc_cutoffs will try accessing things below the cutoff. TODO: ideally, this should + // be batched into `schedule_compaction_update`. 
+ let disk_consistent_lsn = self.disk_consistent_lsn.load(); + self.schedule_uploads(disk_consistent_lsn, None)?; self.remote_client .schedule_compaction_update(&layer_selection, &compact_to)?; From 7fe6b2442f10b36c91df81569f9a1ddf2ba531d1 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 20 Dec 2024 06:01:57 +0000 Subject: [PATCH 56/56] Storage release 2024-12-20