Skip to content

Commit

Permalink
Fix autocommit footguns in performance tests
Browse files Browse the repository at this point in the history
psycopg2 has the following warning related to autocommit:

> By default, any query execution, including a simple SELECT will start
> a transaction: for long-running programs, if no further action is
> taken, the session will remain “idle in transaction”, an undesirable
> condition for several reasons (locks are held by the session, tables
> bloat…). For long lived scripts, either ensure to terminate a
> transaction as soon as possible or use an autocommit connection.

In the 2.9 release notes, psycopg2 also made the following change:

> `with connection` starts a transaction on autocommit transactions too

Some of these connections are indeed long-lived, so we were retaining
tons of WAL on the endpoints because we had a transaction pinned in the
past.

Link: https://www.psycopg.org/docs/news.html#what-s-new-in-psycopg-2-9
Link: psycopg/psycopg2#941
Signed-off-by: Tristan Partin <[email protected]>
  • Loading branch information
tristan957 authored Nov 12, 2024
1 parent 2256a57 commit d8f5d43
Show file tree
Hide file tree
Showing 2 changed files with 99 additions and 75 deletions.
113 changes: 61 additions & 52 deletions test_runner/performance/test_logical_replication.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,12 +149,16 @@ def test_subscriber_lag(
check_pgbench_still_running(pub_workload, "pub")
check_pgbench_still_running(sub_workload, "sub")

with (
psycopg2.connect(pub_connstr) as pub_conn,
psycopg2.connect(sub_connstr) as sub_conn,
):
with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
lag = measure_logical_replication_lag(sub_cur, pub_cur)
pub_conn = psycopg2.connect(pub_connstr)
sub_conn = psycopg2.connect(sub_connstr)
pub_conn.autocommit = True
sub_conn.autocommit = True

with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
lag = measure_logical_replication_lag(sub_cur, pub_cur)

pub_conn.close()
sub_conn.close()

log.info(f"Replica lagged behind master by {lag} seconds")
zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER)
Expand Down Expand Up @@ -206,6 +210,7 @@ def test_publisher_restart(
sub_conn = psycopg2.connect(sub_connstr)
pub_conn.autocommit = True
sub_conn.autocommit = True

with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
pub_cur.execute("SELECT 1 FROM pg_catalog.pg_publication WHERE pubname = 'pub1'")
pub_exists = len(pub_cur.fetchall()) != 0
Expand All @@ -222,6 +227,7 @@ def test_publisher_restart(
sub_cur.execute(f"create subscription sub1 connection '{pub_connstr}' publication pub1")

initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur)

pub_conn.close()
sub_conn.close()

Expand All @@ -248,12 +254,17 @@ def test_publisher_restart(
["pgbench", "-c10", pgbench_duration, "-Mprepared"],
env=pub_env,
)
with (
psycopg2.connect(pub_connstr) as pub_conn,
psycopg2.connect(sub_connstr) as sub_conn,
):
with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
lag = measure_logical_replication_lag(sub_cur, pub_cur)

pub_conn = psycopg2.connect(pub_connstr)
sub_conn = psycopg2.connect(sub_connstr)
pub_conn.autocommit = True
sub_conn.autocommit = True

with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
lag = measure_logical_replication_lag(sub_cur, pub_cur)

pub_conn.close()
sub_conn.close()

log.info(f"Replica lagged behind master by {lag} seconds")
zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER)
Expand Down Expand Up @@ -288,58 +299,56 @@ def test_snap_files(
env = benchmark_project_pub.pgbench_env
connstr = benchmark_project_pub.connstr

with psycopg2.connect(connstr) as conn:
conn.autocommit = True
with conn.cursor() as cur:
cur.execute("SELECT rolsuper FROM pg_roles WHERE rolname = 'neondb_owner'")
is_super = cast("bool", cur.fetchall()[0][0])
assert is_super, "This benchmark won't work if we don't have superuser"
conn = psycopg2.connect(connstr)
conn.autocommit = True

with conn.cursor() as cur:
cur.execute("SELECT rolsuper FROM pg_roles WHERE rolname = 'neondb_owner'")
is_super = cast("bool", cur.fetchall()[0][0])
assert is_super, "This benchmark won't work if we don't have superuser"

conn.close()

pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s100"], env=env)

conn = psycopg2.connect(connstr)
conn.autocommit = True
cur = conn.cursor()
cur.execute("ALTER SYSTEM SET neon.logical_replication_max_snap_files = -1")

with psycopg2.connect(connstr) as conn:
conn.autocommit = True
with conn.cursor() as cur:
cur.execute("SELECT pg_reload_conf()")

with psycopg2.connect(connstr) as conn:
conn.autocommit = True
with conn.cursor() as cur:
cur.execute(
"""
DO $$
BEGIN
IF EXISTS (
SELECT 1
FROM pg_replication_slots
WHERE slot_name = 'slotter'
) THEN
PERFORM pg_drop_replication_slot('slotter');
END IF;
END $$;

with conn.cursor() as cur:
cur.execute(
"""
)
cur.execute("SELECT pg_create_logical_replication_slot('slotter', 'test_decoding')")
DO $$
BEGIN
IF EXISTS (
SELECT 1
FROM pg_replication_slots
WHERE slot_name = 'slotter'
) THEN
PERFORM pg_drop_replication_slot('slotter');
END IF;
END $$;
"""
)
cur.execute("SELECT pg_create_logical_replication_slot('slotter', 'test_decoding')")

conn.close()

workload = pg_bin.run_nonblocking(["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=env)
try:
start = time.time()
prev_measurement = time.time()
while time.time() - start < test_duration_min * 60:
with psycopg2.connect(connstr) as conn:
with conn.cursor() as cur:
cur.execute(
"SELECT count(*) FROM (SELECT pg_log_standby_snapshot() FROM generate_series(1, 10000) g) s"
)
check_pgbench_still_running(workload)
cur.execute(
"SELECT pg_replication_slot_advance('slotter', pg_current_wal_lsn())"
)
conn = psycopg2.connect(connstr)
conn.autocommit = True

with conn.cursor() as cur:
cur.execute(
"SELECT count(*) FROM (SELECT pg_log_standby_snapshot() FROM generate_series(1, 10000) g) s"
)
check_pgbench_still_running(workload)
cur.execute("SELECT pg_replication_slot_advance('slotter', pg_current_wal_lsn())")

conn.close()

# Measure storage
if time.time() - prev_measurement > test_interval_min * 60:
Expand Down
61 changes: 38 additions & 23 deletions test_runner/performance/test_physical_replication.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,15 +102,21 @@ def test_ro_replica_lag(
check_pgbench_still_running(master_workload)
check_pgbench_still_running(replica_workload)
time.sleep(sync_interval_min * 60)

conn_master = psycopg2.connect(master_connstr)
conn_replica = psycopg2.connect(replica_connstr)
conn_master.autocommit = True
conn_replica.autocommit = True

with (
psycopg2.connect(master_connstr) as conn_master,
psycopg2.connect(replica_connstr) as conn_replica,
conn_master.cursor() as cur_master,
conn_replica.cursor() as cur_replica,
):
with (
conn_master.cursor() as cur_master,
conn_replica.cursor() as cur_replica,
):
lag = measure_replication_lag(cur_master, cur_replica)
lag = measure_replication_lag(cur_master, cur_replica)

conn_master.close()
conn_replica.close()

log.info(f"Replica lagged behind master by {lag} seconds")
zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER)
finally:
Expand Down Expand Up @@ -219,11 +225,15 @@ def test_replication_start_stop(
pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s10"], env=master_env)

# Sync replicas
with psycopg2.connect(master_connstr) as conn_master:
with conn_master.cursor() as cur_master:
for i in range(num_replicas):
conn_replica = psycopg2.connect(replica_connstr[i])
measure_replication_lag(cur_master, conn_replica.cursor())
conn_master = psycopg2.connect(master_connstr)
conn_master.autocommit = True

with conn_master.cursor() as cur_master:
for i in range(num_replicas):
conn_replica = psycopg2.connect(replica_connstr[i])
measure_replication_lag(cur_master, conn_replica.cursor())

conn_master.close()

master_pgbench = pg_bin.run_nonblocking(
[
Expand Down Expand Up @@ -277,17 +287,22 @@ def replica_enabled(iconfig: int = iconfig):

time.sleep(configuration_test_time_sec)

with psycopg2.connect(master_connstr) as conn_master:
with conn_master.cursor() as cur_master:
for ireplica in range(num_replicas):
replica_conn = psycopg2.connect(replica_connstr[ireplica])
lag = measure_replication_lag(cur_master, replica_conn.cursor())
zenbenchmark.record(
f"Replica {ireplica} lag", lag, "s", MetricReport.LOWER_IS_BETTER
)
log.info(
f"Replica {ireplica} lagging behind master by {lag} seconds after configuration {iconfig:>b}"
)
conn_master = psycopg2.connect(master_connstr)
conn_master.autocommit = True

with conn_master.cursor() as cur_master:
for ireplica in range(num_replicas):
replica_conn = psycopg2.connect(replica_connstr[ireplica])
lag = measure_replication_lag(cur_master, replica_conn.cursor())
zenbenchmark.record(
f"Replica {ireplica} lag", lag, "s", MetricReport.LOWER_IS_BETTER
)
log.info(
f"Replica {ireplica} lagging behind master by {lag} seconds after configuration {iconfig:>b}"
)

conn_master.close()

master_pgbench.terminate()
except Exception as e:
error_occurred = True
Expand Down

1 comment on commit d8f5d43

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

5473 tests run: 5240 passed, 3 failed, 230 skipped (full report)


Failures on Postgres 16

  • test_sharded_ingest[github-actions-selfhosted-1]: release-x86-64
  • test_storage_controller_many_tenants[github-actions-selfhosted]: release-x86-64
  • test_compaction_l0_memory[github-actions-selfhosted]: release-x86-64
# Run all failed tests locally:
scripts/pytest -vv -n $(nproc) -k "test_sharded_ingest[release-pg16-github-actions-selfhosted-1] or test_storage_controller_many_tenants[release-pg16-github-actions-selfhosted] or test_compaction_l0_memory[release-pg16-github-actions-selfhosted]"
Flaky tests (1)

Postgres 17

Code coverage* (full report)

  • functions: 31.8% (7891 of 24834 functions)
  • lines: 49.5% (62449 of 126257 lines)

* collected from Rust tests only


The comment gets automatically updated with the latest test results
d8f5d43 at 2024-11-12T23:36:16.273Z :recycle:

Please sign in to comment.