From 0fc0e9751e94039b0226bcc84e83807d8465b672 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Fri, 22 Jul 2022 11:45:28 +0300 Subject: [PATCH 01/56] Neon rebased to REL_15_STABLE (up to 0a9045c9ff) Most significant changes are: - `xlog.c` refactoring - some code was moved to `xlogreader.c` and `xlogprefetcher.c`. - `ThisTimeLineID` refactoring (4a92a1c3d1c and e997a0c6428), which affects walproposer code - `XLogFileInit` refactoring, Multiple commits changed the function signature. - resolve initdb and pg_waldump neon-specific options that conflictes with the ones from PostgreSQL. - --- .dockerignore | 5 + Dockerfile | 73 + configure | 86 + configure.ac | 13 + contrib/neon/Makefile | 26 + contrib/neon/inmem_smgr.c | 287 ++ contrib/neon/libpagestore.c | 440 ++ contrib/neon/neon--1.0.sql | 17 + contrib/neon/neon.c | 66 + contrib/neon/neon.control | 4 + contrib/neon/pagestore_client.h | 221 + contrib/neon/pagestore_smgr.c | 1698 +++++++ contrib/neon/relsize_cache.c | 186 + contrib/neon_test_utils/Makefile | 25 + .../neon_test_utils/neon_test_utils--1.0.sql | 29 + .../neon_test_utils/neon_test_utils.control | 5 + contrib/neon_test_utils/neontest.c | 304 ++ src/Makefile | 1 + src/Makefile.global.in | 1 + src/backend/access/brin/brin_xlog.c | 3 +- src/backend/access/gin/gininsert.c | 7 + src/backend/access/gin/ginxlog.c | 19 +- src/backend/access/gist/gistbuild.c | 17 +- src/backend/access/heap/heapam.c | 26 +- src/backend/access/heap/heapam_handler.c | 2 +- src/backend/access/heap/vacuumlazy.c | 5 +- src/backend/access/heap/visibilitymap.c | 15 +- src/backend/access/spgist/spginsert.c | 8 +- src/backend/access/transam/xlog.c | 200 +- src/backend/access/transam/xloginsert.c | 15 + src/backend/access/transam/xlogprefetcher.c | 3 +- src/backend/access/transam/xlogreader.c | 119 +- src/backend/access/transam/xlogrecovery.c | 94 +- src/backend/access/transam/xlogutils.c | 20 +- src/backend/bootstrap/bootstrap.c | 13 +- src/backend/catalog/storage.c | 10 +- 
src/backend/commands/sequence.c | 6 +- src/backend/commands/tablecmds.c | 2 +- src/backend/main/main.c | 7 + src/backend/port/sysv_shmem.c | 15 + src/backend/postmaster/Makefile | 5 + src/backend/postmaster/bgworker.c | 4 + src/backend/postmaster/postmaster.c | 6 + src/backend/postmaster/seccomp.c | 249 + src/backend/replication/Makefile | 4 +- .../replication/libpqwalproposer/Makefile | 37 + .../libpqwalproposer/libpqwalproposer.c | 416 ++ src/backend/replication/walproposer.c | 2357 +++++++++ src/backend/replication/walproposer_utils.c | 404 ++ src/backend/replication/walreceiver.c | 7 + src/backend/replication/walsender.c | 325 +- src/backend/storage/buffer/bufmgr.c | 51 +- src/backend/storage/buffer/localbuf.c | 14 +- src/backend/storage/freespace/freespace.c | 14 +- src/backend/storage/ipc/ipci.c | 4 + src/backend/storage/page/bufpage.c | 1 - src/backend/storage/smgr/md.c | 4 +- src/backend/storage/smgr/smgr.c | 205 +- src/backend/tcop/Makefile | 2 + src/backend/tcop/postgres.c | 36 +- src/backend/tcop/zenith_wal_redo.c | 813 +++ src/backend/utils/activity/wait_event.c | 3 + src/backend/utils/adt/dbsize.c | 61 +- src/backend/utils/misc/guc.c | 84 + src/bin/initdb/initdb.c | 4 + src/bin/pg_waldump/pg_waldump.c | 204 +- src/include/access/heapam_xlog.h | 6 +- src/include/access/xlog.h | 21 + src/include/access/xlogdefs.h | 8 + src/include/access/xloginsert.h | 4 + src/include/access/xlogreader.h | 4 + src/include/access/xlogutils.h | 2 + src/include/miscadmin.h | 3 + src/include/pg_config.h.in | 3 + src/include/postmaster/seccomp.h | 26 + src/include/replication/walproposer.h | 565 +++ src/include/replication/walsender.h | 4 +- src/include/storage/bufmgr.h | 4 + src/include/storage/smgr.h | 72 +- src/include/tcop/tcopprot.h | 4 + src/include/utils/rel.h | 2 +- src/include/utils/wait_event.h | 3 +- src/test/regress/expected/alter_table_1.out | 4487 +++++++++++++++++ src/test/regress/expected/create_table_1.out | 1315 +++++ src/test/regress/expected/sequence_1.out 
| 824 +++ src/test/regress/output/tablespace_1.source | 941 ++++ src/tools/pgindent/typedefs.list | 15 + 87 files changed, 17403 insertions(+), 317 deletions(-) create mode 100644 .dockerignore create mode 100644 Dockerfile create mode 100644 contrib/neon/Makefile create mode 100644 contrib/neon/inmem_smgr.c create mode 100644 contrib/neon/libpagestore.c create mode 100644 contrib/neon/neon--1.0.sql create mode 100644 contrib/neon/neon.c create mode 100644 contrib/neon/neon.control create mode 100644 contrib/neon/pagestore_client.h create mode 100644 contrib/neon/pagestore_smgr.c create mode 100644 contrib/neon/relsize_cache.c create mode 100644 contrib/neon_test_utils/Makefile create mode 100644 contrib/neon_test_utils/neon_test_utils--1.0.sql create mode 100644 contrib/neon_test_utils/neon_test_utils.control create mode 100644 contrib/neon_test_utils/neontest.c create mode 100644 src/backend/postmaster/seccomp.c create mode 100644 src/backend/replication/libpqwalproposer/Makefile create mode 100644 src/backend/replication/libpqwalproposer/libpqwalproposer.c create mode 100644 src/backend/replication/walproposer.c create mode 100644 src/backend/replication/walproposer_utils.c create mode 100644 src/backend/tcop/zenith_wal_redo.c create mode 100644 src/include/postmaster/seccomp.h create mode 100644 src/include/replication/walproposer.h create mode 100644 src/test/regress/expected/alter_table_1.out create mode 100644 src/test/regress/expected/create_table_1.out create mode 100644 src/test/regress/expected/sequence_1.out create mode 100644 src/test/regress/output/tablespace_1.source diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 00000000000..530192a3b20 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,5 @@ +.git +.vscode +.circleci +tmp_install +compute_build diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000000..11681c9cb16 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,73 @@ +# Allow specifiyng the different compute-tools tag, 
so we were able to always use +# the locally built image. +ARG COMPUTE_TOOLS_TAG=latest + +# +# Image with pre-built tools +# +FROM neondatabase/compute-tools:$COMPUTE_TOOLS_TAG AS compute-deps +# Only to get ready compute_ctl binary as deppendency + +# +# Image with Postgres build deps +# +FROM debian:buster-slim AS build-deps + +RUN apt-get update && apt-get -yq install automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \ + libcurl4-openssl-dev libossp-uuid-dev + +# +# Image with built Postgres +# +FROM build-deps AS pg-build + +# Add user postgres +RUN adduser postgres +RUN mkdir /pg && chown postgres:postgres /pg + +# Copy source files +COPY . /pg/ + +# Build and install Postgres locally +RUN mkdir /pg/compute_build && cd /pg/compute_build && \ + ../configure CFLAGS='-O2 -g3' --prefix=$(pwd)/postgres_bin --enable-debug --with-uuid=ossp && \ + # Install main binaries and contribs + make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \ + make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \ + make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/neon install && \ + # Install headers + make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install + +USER postgres +WORKDIR /pg + +# +# Final compute node image to be exported +# +FROM debian:buster-slim + +# libreadline-dev is required to run psql +RUN apt-get update && apt-get -yq install libreadline-dev libossp-uuid-dev + +# Add user postgres +RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ + echo "postgres:test_console_pass" | chpasswd && \ + mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \ + chown -R postgres:postgres /var/db/postgres && \ + chmod 0750 /var/db/postgres/compute + +# Copy ready Postgres binaries +COPY --from=pg-build /pg/compute_build/postgres_bin /usr/local + +# Copy binaries from compute-tools +COPY --from=compute-deps /usr/local/bin/compute_ctl /usr/local/bin/compute_ctl + 
+# XXX: temporary symlink for compatibility with old control-plane +RUN ln -s /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl + +# Add postgres shared objects to the search path +RUN echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig + +USER postgres + +ENTRYPOINT ["/usr/local/bin/compute_ctl"] diff --git a/configure b/configure index 57607d79dff..376394da894 100755 --- a/configure +++ b/configure @@ -717,6 +717,7 @@ with_libxml with_uuid with_readline with_systemd +with_libseccomp with_selinux with_ldap with_krb_srvnam @@ -864,6 +865,7 @@ with_bsd_auth with_ldap with_bonjour with_selinux +with_libseccomp with_systemd with_readline with_libedit_preferred @@ -1573,6 +1575,7 @@ Optional Packages: --with-ldap build with LDAP support --with-bonjour build with Bonjour support --with-selinux build with SELinux support + --with-libseccomp build with libseccomp support --with-systemd build with systemd support --without-readline do not use GNU Readline nor BSD Libedit for editing --with-libedit-preferred @@ -8631,6 +8634,39 @@ fi { $as_echo "$as_me:${as_lineno-$LINENO}: result: $with_selinux" >&5 $as_echo "$with_selinux" >&6; } +# +# libseccomp +# +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to build with libseccomp support" >&5 +$as_echo_n "checking whether to build with libseccomp support... " >&6; } + + + +# Check whether --with-libseccomp was given. +if test "${with_libseccomp+set}" = set; then : + withval=$with_libseccomp; + case $withval in + yes) + : + ;; + no) + : + ;; + *) + as_fn_error $? 
"no argument expected for --with-libseccomp option" "$LINENO" 5 + ;; + esac + +else + with_libseccomp=no + +fi + + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $with_libseccomp" >&5 +$as_echo "$with_libseccomp" >&6; } + # # Systemd # @@ -14587,6 +14623,56 @@ else fi +fi + +if test "$with_libseccomp" = yes ; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for seccomp_init in -lseccomp" >&5 +$as_echo_n "checking for seccomp_init in -lseccomp... " >&6; } +if ${ac_cv_lib_seccomp_seccomp_init+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-lseccomp $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char seccomp_init (); +int +main () +{ +return seccomp_init (); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_lib_seccomp_seccomp_init=yes +else + ac_cv_lib_seccomp_seccomp_init=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_seccomp_seccomp_init" >&5 +$as_echo "$ac_cv_lib_seccomp_seccomp_init" >&6; } +if test "x$ac_cv_lib_seccomp_seccomp_init" = xyes; then : + cat >>confdefs.h <<_ACEOF +#define HAVE_LIBSECCOMP 1 +_ACEOF + + LIBS="-lseccomp $LIBS" + +else + as_fn_error $? 
"library 'libseccomp' is required for Seccomp BPF support" "$LINENO" 5 +fi + fi # for contrib/uuid-ossp diff --git a/configure.ac b/configure.ac index c216ac4447a..0a5e5110335 100644 --- a/configure.ac +++ b/configure.ac @@ -927,6 +927,14 @@ PGAC_ARG_BOOL(with, selinux, no, [build with SELinux support]) AC_SUBST(with_selinux) AC_MSG_RESULT([$with_selinux]) +# +# libseccomp +# +AC_MSG_CHECKING([whether to build with libseccomp support]) +PGAC_ARG_BOOL(with, libseccomp, no, [build with libseccomp support]) +AC_SUBST(with_libseccomp) +AC_MSG_RESULT([$with_libseccomp]) + # # Systemd # @@ -1613,6 +1621,11 @@ dnl If you want to use Apple's own Bonjour code on another platform, dnl just add -ldns_sd to LIBS manually. fi +if test "$with_libseccomp" = yes ; then + AC_CHECK_LIB(seccomp, seccomp_init, [], + [AC_MSG_ERROR([library 'libseccomp' is required for Seccomp BPF support])]) +fi + # for contrib/uuid-ossp if test "$with_uuid" = bsd ; then AC_CHECK_HEADERS(uuid.h, diff --git a/contrib/neon/Makefile b/contrib/neon/Makefile new file mode 100644 index 00000000000..b6f3cf400ff --- /dev/null +++ b/contrib/neon/Makefile @@ -0,0 +1,26 @@ +# contrib/neon/Makefile + + +MODULE_big = neon +OBJS = \ + $(WIN32RES) \ + inmem_smgr.o libpagestore.o pagestore_smgr.o relsize_cache.o neon.o + +PG_CPPFLAGS = -I$(libpq_srcdir) +SHLIB_LINK_INTERNAL = $(libpq) + +EXTENSION = neon +DATA = neon--1.0.sql +PGFILEDESC = "neon - cloud storage for PostgreSQL" + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +SHLIB_PREREQS = submake-libpq +subdir = contrib/neon +top_builddir = ../.. 
+include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/contrib/neon/inmem_smgr.c b/contrib/neon/inmem_smgr.c new file mode 100644 index 00000000000..ca3a254a143 --- /dev/null +++ b/contrib/neon/inmem_smgr.c @@ -0,0 +1,287 @@ +/*------------------------------------------------------------------------- + * + * inmem_smgr.c + * + * This is an implementation of the SMGR interface, used in the WAL redo + * process (see src/backend/tcop/zenith_wal_redo.c). It has no persistent + * storage, the pages that are written out are kept in a small number of + * in-memory buffers. + * + * Normally, replaying a WAL record only needs to access a handful of + * buffers, which fit in the normal buffer cache, so this is just for + * "overflow" storage when the buffer cache is not large enough. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * contrib/neon/inmem_smgr.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/xlog.h" +#include "access/xlogutils.h" +#include "pagestore_client.h" +#include "storage/block.h" +#include "storage/buf_internals.h" +#include "storage/relfilenode.h" +#include "storage/smgr.h" + +/* Size of the in-memory smgr */ +#define MAX_PAGES 64 + +/* If more than WARN_PAGES are used, print a warning in the log */ +#define WARN_PAGES 32 + +static BufferTag page_tag[MAX_PAGES]; +static char page_body[MAX_PAGES][BLCKSZ]; +static int used_pages; + +static int +locate_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno) +{ + /* We only hold a small number of pages, so linear search */ + for (int i = 0; i < used_pages; i++) + { + if (RelFileNodeEquals(reln->smgr_rnode.node, page_tag[i].rnode) + && forknum == page_tag[i].forkNum + && blkno == page_tag[i].blockNum) + { + return i; + } + } + return 
-1; +} + +/* + * inmem_init() -- Initialize private state + */ +void +inmem_init(void) +{ + used_pages = 0; +} + +/* + * inmem_exists() -- Does the physical file exist? + */ +bool +inmem_exists(SMgrRelation reln, ForkNumber forknum) +{ + for (int i = 0; i < used_pages; i++) + { + if (RelFileNodeEquals(reln->smgr_rnode.node, page_tag[i].rnode) + && forknum == page_tag[i].forkNum) + { + return true; + } + } + return false; +} + +/* + * inmem_create() -- Create a new relation on zenithd storage + * + * If isRedo is true, it's okay for the relation to exist already. + */ +void +inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo) +{ +} + +/* + * inmem_unlink() -- Unlink a relation. + */ +void +inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo) +{ +} + +/* + * inmem_extend() -- Add a block to the specified relation. + * + * The semantics are nearly the same as mdwrite(): write at the + * specified position. However, this is to be used for the case of + * extending a relation (i.e., blocknum is at or beyond the current + * EOF). Note that we assume writing a block beyond current EOF + * causes intervening file space to become filled with zeroes. + */ +void +inmem_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, + char *buffer, bool skipFsync) +{ + /* same as smgwrite() for us */ + inmem_write(reln, forknum, blkno, buffer, skipFsync); +} + +/* + * inmem_open() -- Initialize newly-opened relation. + */ +void +inmem_open(SMgrRelation reln) +{ +} + +/* + * inmem_close() -- Close the specified relation, if it isn't closed already. + */ +void +inmem_close(SMgrRelation reln, ForkNumber forknum) +{ +} + +/* + * inmem_prefetch() -- Initiate asynchronous read of the specified block of a relation + */ +bool +inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) +{ + return true; +} + +/* + * inmem_writeback() -- Tell the kernel to write pages back to storage. 
+ */ +void +inmem_writeback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks) +{ +} + +/* + * inmem_read() -- Read the specified block from a relation. + */ +void +inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, + char *buffer) +{ + int pg; + + pg = locate_page(reln, forknum, blkno); + if (pg < 0) + memset(buffer, 0, BLCKSZ); + else + memcpy(buffer, page_body[pg], BLCKSZ); +} + +/* + * inmem_write() -- Write the supplied block at the appropriate location. + * + * This is to be used only for updating already-existing blocks of a + * relation (ie, those before the current EOF). To extend a relation, + * use mdextend(). + */ +void +inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + char *buffer, bool skipFsync) +{ + int pg; + + pg = locate_page(reln, forknum, blocknum); + if (pg < 0) + { + /* + * We assume the buffer cache is large enough to hold all the buffers + * needed for most operations. Overflowing to this "in-mem smgr" in rare + * cases is OK. But if we find that we're using more than WARN_PAGES, + * print a warning so that we get alerted and get to investigate why + * we're accessing so many buffers. + */ + elog(used_pages >= WARN_PAGES ? WARNING : DEBUG1, + "inmem_write() called for %u/%u/%u.%u blk %u: used_pages %u", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, + blocknum, + used_pages); + if (used_pages == MAX_PAGES) + elog(ERROR, "Inmem storage overflow"); + + pg = used_pages; + used_pages++; + INIT_BUFFERTAG(page_tag[pg], reln->smgr_rnode.node, forknum, blocknum); + } else { + elog(DEBUG1, "inmem_write() called for %u/%u/%u.%u blk %u: found at %u", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, + blocknum, + used_pages); + } + memcpy(page_body[pg], buffer, BLCKSZ); +} + +/* + * inmem_nblocks() -- Get the number of blocks stored in a relation. 
+ */ +BlockNumber +inmem_nblocks(SMgrRelation reln, ForkNumber forknum) +{ + /* + * It's not clear why a WAL redo function would call smgrnblocks(). + * During recovery, at least before reaching consistency, the size of a + * relation could be arbitrarily small, if it was truncated after the + * record being replayed, or arbitrarily large if it was extended + * afterwards. But one place where it's called is in + * XLogReadBufferExtended(): it extends the relation, if it's smaller than + * the requested page. That's a waste of time in the WAL redo + * process. Pretend that all relations are maximally sized to avoid it. + */ + return MaxBlockNumber; +} + +/* + * inmem_truncate() -- Truncate relation to specified number of blocks. + */ +void +inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) +{ +} + +/* + * inmem_immedsync() -- Immediately sync a relation to stable storage. + */ +void +inmem_immedsync(SMgrRelation reln, ForkNumber forknum) +{ +} + +static const struct f_smgr inmem_smgr = +{ + .smgr_init = inmem_init, + .smgr_shutdown = NULL, + .smgr_open = inmem_open, + .smgr_close = inmem_close, + .smgr_create = inmem_create, + .smgr_exists = inmem_exists, + .smgr_unlink = inmem_unlink, + .smgr_extend = inmem_extend, + .smgr_prefetch = inmem_prefetch, + .smgr_read = inmem_read, + .smgr_write = inmem_write, + .smgr_writeback = inmem_writeback, + .smgr_nblocks = inmem_nblocks, + .smgr_truncate = inmem_truncate, + .smgr_immedsync = inmem_immedsync, +}; + +const f_smgr * +smgr_inmem(BackendId backend, RelFileNode rnode) +{ + Assert(InRecovery); + if (backend != InvalidBackendId) + return smgr_standard(backend, rnode); + else + return &inmem_smgr; +} + +void +smgr_init_inmem() +{ + inmem_init(); +} diff --git a/contrib/neon/libpagestore.c b/contrib/neon/libpagestore.c new file mode 100644 index 00000000000..2621421532a --- /dev/null +++ b/contrib/neon/libpagestore.c @@ -0,0 +1,440 @@ 
+/*------------------------------------------------------------------------- + * + * libpagestore.c + * Handles network communications with the remote pagestore. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * contrib/neon/libpqpagestore.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "pagestore_client.h" +#include "fmgr.h" +#include "access/xlog.h" + +#include "libpq-fe.h" +#include "libpq/pqformat.h" +#include "libpq/libpq.h" + +#include "miscadmin.h" +#include "pgstat.h" +#include "utils/guc.h" + +#include "replication/walproposer.h" + +PG_MODULE_MAGIC; + +void _PG_init(void); + +#define PageStoreTrace DEBUG5 + +#define NEON_TAG "[NEON_SMGR] " +#define neon_log(tag, fmt, ...) ereport(tag, \ + (errmsg(NEON_TAG fmt, ## __VA_ARGS__), \ + errhidestmt(true), errhidecontext(true))) + +bool connected = false; +PGconn *pageserver_conn = NULL; + +char *page_server_connstring_raw; + +static ZenithResponse *pageserver_call(ZenithRequest *request); +page_server_api api = { + .request = pageserver_call +}; + +static void +pageserver_connect() +{ + char *query; + int ret; + + Assert(!connected); + + pageserver_conn = PQconnectdb(page_server_connstring); + + if (PQstatus(pageserver_conn) == CONNECTION_BAD) + { + char *msg = pchomp(PQerrorMessage(pageserver_conn)); + + PQfinish(pageserver_conn); + pageserver_conn = NULL; + ereport(ERROR, + (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), + errmsg(NEON_TAG "could not establish connection to pageserver"), + errdetail_internal("%s", msg))); + } + + query = psprintf("pagestream %s %s", zenith_tenant, zenith_timeline); + ret = PQsendQuery(pageserver_conn, query); + if (ret != 1) + { + PQfinish(pageserver_conn); + pageserver_conn = NULL; + neon_log(ERROR, "could not send pagestream command to pageserver"); + } + + 
while (PQisBusy(pageserver_conn)) + { + int wc; + + /* Sleep until there's something to do */ + wc = WaitLatchOrSocket(MyLatch, + WL_LATCH_SET | WL_SOCKET_READABLE | + WL_EXIT_ON_PM_DEATH, + PQsocket(pageserver_conn), + -1L, PG_WAIT_EXTENSION); + ResetLatch(MyLatch); + + CHECK_FOR_INTERRUPTS(); + + /* Data available in socket? */ + if (wc & WL_SOCKET_READABLE) + { + if (!PQconsumeInput(pageserver_conn)) + { + char *msg = pchomp(PQerrorMessage(pageserver_conn)); + + PQfinish(pageserver_conn); + pageserver_conn = NULL; + + neon_log(ERROR, "could not complete handshake with pageserver: %s", + msg); + } + } + } + + neon_log(LOG, "libpagestore: connected to '%s'", page_server_connstring_raw); + + connected = true; +} + +/* + * A wrapper around PQgetCopyData that checks for interrupts while sleeping. + */ +static int +call_PQgetCopyData(PGconn *conn, char **buffer) +{ + int ret; + +retry: + ret = PQgetCopyData(conn, buffer, 1 /* async */ ); + + if (ret == 0) + { + int wc; + + /* Sleep until there's something to do */ + wc = WaitLatchOrSocket(MyLatch, + WL_LATCH_SET | WL_SOCKET_READABLE | + WL_EXIT_ON_PM_DEATH, + PQsocket(conn), + -1L, PG_WAIT_EXTENSION); + ResetLatch(MyLatch); + + CHECK_FOR_INTERRUPTS(); + + /* Data available in socket? */ + if (wc & WL_SOCKET_READABLE) + { + if (!PQconsumeInput(conn)) + neon_log(ERROR, "could not get response from pageserver: %s", + PQerrorMessage(conn)); + } + + goto retry; + } + + return ret; +} + + +static ZenithResponse * +pageserver_call(ZenithRequest *request) +{ + StringInfoData req_buff; + StringInfoData resp_buff; + ZenithResponse *resp; + + PG_TRY(); + { + /* If the connection was lost for some reason, reconnect */ + if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD) + { + PQfinish(pageserver_conn); + pageserver_conn = NULL; + connected = false; + } + + if (!connected) + pageserver_connect(); + + req_buff = zm_pack_request(request); + + /* + * Send request. 
+ * + * In principle, this could block if the output buffer is full, and we + * should use async mode and check for interrupts while waiting. In + * practice, our requests are small enough to always fit in the output + * and TCP buffer. + */ + if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0 || PQflush(pageserver_conn)) + { + neon_log(ERROR, "failed to send page request: %s", + PQerrorMessage(pageserver_conn)); + } + pfree(req_buff.data); + + if (message_level_is_interesting(PageStoreTrace)) + { + char *msg = zm_to_string((ZenithMessage *) request); + + neon_log(PageStoreTrace, "sent request: %s", msg); + pfree(msg); + } + + /* read response */ + resp_buff.len = call_PQgetCopyData(pageserver_conn, &resp_buff.data); + resp_buff.cursor = 0; + + if (resp_buff.len == -1) + neon_log(ERROR, "end of COPY"); + else if (resp_buff.len == -2) + neon_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn)); + + resp = zm_unpack_response(&resp_buff); + PQfreemem(resp_buff.data); + + if (message_level_is_interesting(PageStoreTrace)) + { + char *msg = zm_to_string((ZenithMessage *) resp); + + neon_log(PageStoreTrace, "got response: %s", msg); + pfree(msg); + } + } + PG_CATCH(); + { + /* + * If anything goes wrong while we were sending a request, it's not + * clear what state the connection is in. For example, if we sent the + * request but didn't receive a response yet, we might receive the + * response some time later after we have already sent a new unrelated + * request. Close the connection to avoid getting confused. 
+ */ + if (connected) + { + neon_log(LOG, "dropping connection to page server due to error"); + PQfinish(pageserver_conn); + pageserver_conn = NULL; + connected = false; + } + PG_RE_THROW(); + } + PG_END_TRY(); + + return (ZenithResponse *) resp; +} + + +static bool +check_zenith_id(char **newval, void **extra, GucSource source) +{ + uint8 zid[16]; + + return **newval == '\0' || HexDecodeString(zid, *newval, 16); +} + +static char * +substitute_pageserver_password(const char *page_server_connstring_raw) +{ + char *host = NULL; + char *port = NULL; + char *user = NULL; + char *auth_token = NULL; + char *err = NULL; + char *page_server_connstring = NULL; + PQconninfoOption *conn_options; + PQconninfoOption *conn_option; + MemoryContext oldcontext; + + /* + * Here we substitute password in connection string with an environment + * variable. To simplify things we construct a connection string back with + * only known options. In particular: host port user and password. We do + * not currently use other options and constructing full connstring in an + * URI shape is quite messy. + */ + + if (page_server_connstring_raw == NULL || page_server_connstring_raw[0] == '\0') + return NULL; + + /* extract the auth token from the connection string */ + conn_options = PQconninfoParse(page_server_connstring_raw, &err); + if (conn_options == NULL) + { + /* The error string is malloc'd, so we must free it explicitly */ + char *errcopy = err ? pstrdup(err) : "out of memory"; + + PQfreemem(err); + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("invalid connection string syntax: %s", errcopy))); + } + + /* + * Trying to populate pageserver connection string with auth token from + * environment. We are looking for password in with placeholder value like + * $ENV_VAR_NAME, so if password field is present and starts with $ we try + * to fetch environment variable value and fail loudly if it is not set. 
+ */ + for (conn_option = conn_options; conn_option->keyword != NULL; conn_option++) + { + if (strcmp(conn_option->keyword, "host") == 0) + { + if (conn_option->val != NULL && conn_option->val[0] != '\0') + host = conn_option->val; + } + else if (strcmp(conn_option->keyword, "port") == 0) + { + if (conn_option->val != NULL && conn_option->val[0] != '\0') + port = conn_option->val; + } + else if (strcmp(conn_option->keyword, "user") == 0) + { + if (conn_option->val != NULL && conn_option->val[0] != '\0') + user = conn_option->val; + } + else if (strcmp(conn_option->keyword, "password") == 0) + { + if (conn_option->val != NULL && conn_option->val[0] != '\0') + { + /* ensure that this is a template */ + if (strncmp(conn_option->val, "$", 1) != 0) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_EXCEPTION), + errmsg("expected placeholder value in pageserver password starting from $ but found: %s", &conn_option->val[1]))); + + neon_log(LOG, "found auth token placeholder in pageserver conn string '%s'", &conn_option->val[1]); + auth_token = getenv(&conn_option->val[1]); + if (!auth_token) + { + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_EXCEPTION), + errmsg("cannot get auth token, environment variable %s is not set", &conn_option->val[1]))); + } + else + { + neon_log(LOG, "using auth token from environment passed via env"); + } + } + } + } + + /* + * allocate connection string in TopMemoryContext to make sure it is not + * freed + */ + oldcontext = CurrentMemoryContext; + MemoryContextSwitchTo(TopMemoryContext); + page_server_connstring = psprintf("postgresql://%s:%s@%s:%s", user, auth_token ? 
auth_token : "", host, port); + MemoryContextSwitchTo(oldcontext); + + PQconninfoFree(conn_options); + return page_server_connstring; +} + +/* + * Module initialization function + */ +void +_PG_init(void) +{ + DefineCustomStringVariable("neon.pageserver_connstring", + "connection string to the page server", + NULL, + &page_server_connstring_raw, + "", + PGC_POSTMASTER, + 0, /* no flags required */ + NULL, NULL, NULL); + + DefineCustomStringVariable("neon.timeline_id", + "Zenith timelineid the server is running on", + NULL, + &zenith_timeline, + "", + PGC_POSTMASTER, + 0, /* no flags required */ + check_zenith_id, NULL, NULL); + + DefineCustomStringVariable("neon.tenant_id", + "Neon tenantid the server is running on", + NULL, + &zenith_tenant, + "", + PGC_POSTMASTER, + 0, /* no flags required */ + check_zenith_id, NULL, NULL); + + DefineCustomBoolVariable("neon.wal_redo", + "start in wal-redo mode", + NULL, + &wal_redo, + false, + PGC_POSTMASTER, + 0, + NULL, NULL, NULL); + + DefineCustomIntVariable("neon.max_cluster_size", + "cluster size limit", + NULL, + &max_cluster_size, + -1, -1, INT_MAX, + PGC_SIGHUP, + GUC_UNIT_MB, + NULL, NULL, NULL); + + relsize_hash_init(); + EmitWarningsOnPlaceholders("neon"); + + if (page_server != NULL) + neon_log(ERROR, "libpagestore already loaded"); + + neon_log(PageStoreTrace, "libpagestore already loaded"); + page_server = &api; + + /* substitute password in pageserver_connstring */ + page_server_connstring = substitute_pageserver_password(page_server_connstring_raw); + + /* Is there more correct way to pass CustomGUC to postgres code? 
*/ + zenith_timeline_walproposer = zenith_timeline; + zenith_tenant_walproposer = zenith_tenant; + + /* + * Walproposer instructs safekeeper which pageserver to use for + * replication + */ + zenith_pageserver_connstring_walproposer = page_server_connstring; + + if (wal_redo) + { + neon_log(PageStoreTrace, "set inmem_smgr hook"); + smgr_hook = smgr_inmem; + smgr_init_hook = smgr_init_inmem; + } + else if (page_server_connstring && page_server_connstring[0]) + { + neon_log(PageStoreTrace, "set neon_smgr hook"); + smgr_hook = smgr_zenith; + smgr_init_hook = smgr_init_zenith; + dbsize_hook = zenith_dbsize; + } +} diff --git a/contrib/neon/neon--1.0.sql b/contrib/neon/neon--1.0.sql new file mode 100644 index 00000000000..34f1ba78d4f --- /dev/null +++ b/contrib/neon/neon--1.0.sql @@ -0,0 +1,17 @@ +\echo Use "CREATE EXTENSION neon" to load this file. \quit + +CREATE FUNCTION pg_cluster_size() +RETURNS bigint +AS 'MODULE_PATHNAME', 'pg_cluster_size' +LANGUAGE C STRICT +PARALLEL UNSAFE; + +CREATE FUNCTION backpressure_lsns( + OUT received_lsn pg_lsn, + OUT disk_consistent_lsn pg_lsn, + OUT remote_consistent_lsn pg_lsn +) +RETURNS record +AS 'MODULE_PATHNAME', 'backpressure_lsns' +LANGUAGE C STRICT +PARALLEL UNSAFE; diff --git a/contrib/neon/neon.c b/contrib/neon/neon.c new file mode 100644 index 00000000000..c7c176dba7a --- /dev/null +++ b/contrib/neon/neon.c @@ -0,0 +1,66 @@ +/*------------------------------------------------------------------------- + * + * neon.c + * Utility functions to expose neon specific information to user + * + * IDENTIFICATION + * contrib/neon/neon.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" +#include "fmgr.h" + +#include "access/xact.h" +#include "access/xlog.h" +#include "storage/buf_internals.h" +#include "storage/bufmgr.h" +#include "catalog/pg_type.h" +#include "replication/walsender.h" +#include "replication/walproposer.h" +#include "funcapi.h" +#include 
"access/htup_details.h" +#include "utils/pg_lsn.h" + +PG_FUNCTION_INFO_V1(pg_cluster_size); +PG_FUNCTION_INFO_V1(backpressure_lsns); + +Datum +pg_cluster_size(PG_FUNCTION_ARGS) +{ + int64 size; + + size = GetZenithCurrentClusterSize(); + + if (size == 0) + PG_RETURN_NULL(); + + PG_RETURN_INT64(size); +} + + +Datum +backpressure_lsns(PG_FUNCTION_ARGS) +{ + XLogRecPtr writePtr; + XLogRecPtr flushPtr; + XLogRecPtr applyPtr; + Datum values[3]; + bool nulls[3]; + TupleDesc tupdesc; + + replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr); + + tupdesc = CreateTemplateTupleDesc(3); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "received_lsn", PG_LSNOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "disk_consistent_lsn", PG_LSNOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "remote_consistent_lsn", PG_LSNOID, -1, 0); + tupdesc = BlessTupleDesc(tupdesc); + + MemSet(nulls, 0, sizeof(nulls)); + values[0] = LSNGetDatum(writePtr); + values[1] = LSNGetDatum(flushPtr); + values[2] = LSNGetDatum(applyPtr); + + PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls))); +} diff --git a/contrib/neon/neon.control b/contrib/neon/neon.control new file mode 100644 index 00000000000..84f79881c1e --- /dev/null +++ b/contrib/neon/neon.control @@ -0,0 +1,4 @@ +# neon extension +comment = 'cloud storage for PostgreSQL' +default_version = '1.0' +module_pathname = '$libdir/neon' diff --git a/contrib/neon/pagestore_client.h b/contrib/neon/pagestore_client.h new file mode 100644 index 00000000000..93ea6771eb9 --- /dev/null +++ b/contrib/neon/pagestore_client.h @@ -0,0 +1,221 @@ +/*------------------------------------------------------------------------- + * + * pagestore_client.h + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * contrib/neon/pagestore_client.h + * + 
*------------------------------------------------------------------------- + */ +#ifndef pageserver_h +#define pageserver_h + +#include "postgres.h" + +#include "access/xlogdefs.h" +#include "storage/relfilenode.h" +#include "storage/block.h" +#include "storage/smgr.h" +#include "lib/stringinfo.h" +#include "libpq/pqformat.h" +#include "utils/memutils.h" + +#include "pg_config.h" + +typedef enum +{ + /* pagestore_client -> pagestore */ + T_ZenithExistsRequest = 0, + T_ZenithNblocksRequest, + T_ZenithGetPageRequest, + T_ZenithDbSizeRequest, + + /* pagestore -> pagestore_client */ + T_ZenithExistsResponse = 100, + T_ZenithNblocksResponse, + T_ZenithGetPageResponse, + T_ZenithErrorResponse, + T_ZenithDbSizeResponse, +} ZenithMessageTag; + + + +/* base struct for c-style inheritance */ +typedef struct +{ + ZenithMessageTag tag; +} ZenithMessage; + +#define messageTag(m) (((const ZenithMessage *)(m))->tag) + +/* + * supertype of all the Zenith*Request structs below + * + * If 'latest' is true, we are requesting the latest page version, and 'lsn' + * is just a hint to the server that we know there are no versions of the page + * (or relation size, for exists/nblocks requests) later than the 'lsn'. 
+ */ +typedef struct +{ + ZenithMessageTag tag; + bool latest; /* if true, request latest page version */ + XLogRecPtr lsn; /* request page version @ this LSN */ +} ZenithRequest; + +typedef struct +{ + ZenithRequest req; + RelFileNode rnode; + ForkNumber forknum; +} ZenithExistsRequest; + +typedef struct +{ + ZenithRequest req; + RelFileNode rnode; + ForkNumber forknum; +} ZenithNblocksRequest; + + +typedef struct +{ + ZenithRequest req; + Oid dbNode; +} ZenithDbSizeRequest; + + +typedef struct +{ + ZenithRequest req; + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blkno; +} ZenithGetPageRequest; + +/* supertype of all the Zenith*Response structs below */ +typedef struct +{ + ZenithMessageTag tag; +} ZenithResponse; + +typedef struct +{ + ZenithMessageTag tag; + bool exists; +} ZenithExistsResponse; + +typedef struct +{ + ZenithMessageTag tag; + uint32 n_blocks; +} ZenithNblocksResponse; + +typedef struct +{ + ZenithMessageTag tag; + char page[FLEXIBLE_ARRAY_MEMBER]; +} ZenithGetPageResponse; + +typedef struct +{ + ZenithMessageTag tag; + int64 db_size; +} ZenithDbSizeResponse; + +typedef struct +{ + ZenithMessageTag tag; + char message[FLEXIBLE_ARRAY_MEMBER]; /* null-terminated error message */ +} ZenithErrorResponse; + +extern StringInfoData zm_pack_request(ZenithRequest *msg); +extern ZenithResponse *zm_unpack_response(StringInfo s); +extern char *zm_to_string(ZenithMessage *msg); + +/* + * API + */ + +typedef struct +{ + ZenithResponse *(*request) (ZenithRequest *request); +} page_server_api; + +extern page_server_api *page_server; + +extern char *page_server_connstring; +extern char *zenith_timeline; +extern char *zenith_tenant; +extern bool wal_redo; +extern int32 max_cluster_size; + +extern const f_smgr *smgr_zenith(BackendId backend, RelFileNode rnode); +extern void smgr_init_zenith(void); + +extern const f_smgr *smgr_inmem(BackendId backend, RelFileNode rnode); +extern void smgr_init_inmem(void); +extern void smgr_shutdown_inmem(void); + +/* 
zenith storage manager functionality */ + +extern void zenith_init(void); +extern void zenith_open(SMgrRelation reln); +extern void zenith_close(SMgrRelation reln, ForkNumber forknum); +extern void zenith_create(SMgrRelation reln, ForkNumber forknum, bool isRedo); +extern bool zenith_exists(SMgrRelation reln, ForkNumber forknum); +extern void zenith_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo); +extern void zenith_extend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +extern bool zenith_prefetch(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum); +extern void zenith_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + char *buffer); + +extern void zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, + XLogRecPtr request_lsn, bool request_latest, char *buffer); + +extern void zenith_write(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +extern void zenith_writeback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks); +extern BlockNumber zenith_nblocks(SMgrRelation reln, ForkNumber forknum); +extern int64 zenith_dbsize(Oid dbNode); +extern void zenith_truncate(SMgrRelation reln, ForkNumber forknum, + BlockNumber nblocks); +extern void zenith_immedsync(SMgrRelation reln, ForkNumber forknum); + +/* zenith wal-redo storage manager functionality */ + +extern void inmem_init(void); +extern void inmem_open(SMgrRelation reln); +extern void inmem_close(SMgrRelation reln, ForkNumber forknum); +extern void inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo); +extern bool inmem_exists(SMgrRelation reln, ForkNumber forknum); +extern void inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo); +extern void inmem_extend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +extern bool inmem_prefetch(SMgrRelation reln, 
ForkNumber forknum, + BlockNumber blocknum); +extern void inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + char *buffer); +extern void inmem_write(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +extern void inmem_writeback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks); +extern BlockNumber inmem_nblocks(SMgrRelation reln, ForkNumber forknum); +extern void inmem_truncate(SMgrRelation reln, ForkNumber forknum, + BlockNumber nblocks); +extern void inmem_immedsync(SMgrRelation reln, ForkNumber forknum); + + +/* utils for zenith relsize cache */ +extern void relsize_hash_init(void); +extern bool get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber* size); +extern void set_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size); +extern void update_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size); +extern void forget_cached_relsize(RelFileNode rnode, ForkNumber forknum); + +#endif diff --git a/contrib/neon/pagestore_smgr.c b/contrib/neon/pagestore_smgr.c new file mode 100644 index 00000000000..4f5706ed3ee --- /dev/null +++ b/contrib/neon/pagestore_smgr.c @@ -0,0 +1,1698 @@ +/*------------------------------------------------------------------------- + * + * pagestore_smgr.c + * + * + * + * Temporary and unlogged rels + * --------------------------- + * + * Temporary and unlogged tables are stored locally, by md.c. The functions + * here just pass the calls through to corresponding md.c functions. + * + * Index build operations that use the buffer cache are also handled locally, + * just like unlogged tables. Such operations must be marked by calling + * smgr_start_unlogged_build() and friends. 
+ * + * In order to know what relations are permanent and which ones are not, we + * have added a 'smgr_relpersistence' field to SmgrRelationData, and it is set + * by smgropen() callers, when they have the relcache entry at hand. However, + * sometimes we need to open an SmgrRelation for a relation without the + * relcache. That is needed when we evict a buffer; we might not have the + * SmgrRelation for that relation open yet. To deal with that, the + * 'relpersistence' can be left to zero, meaning we don't know if it's + * permanent or not. Most operations are not allowed with relpersistence==0, + * but smgrwrite() does work, which is what we need for buffer eviction. and + * smgrunlink() so that a backend doesn't need to have the relcache entry at + * transaction commit, where relations that were dropped in the transaction + * are unlinked. + * + * If smgrwrite() is called and smgr_relpersistence == 0, we check if the + * relation file exists locally or not. If it does exist, we assume it's an + * unlogged relation and write the page there. Otherwise it must be a + * permanent relation, WAL-logged and stored on the page server, and we ignore + * the write like we do for permanent relations. 
+ * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * contrib/neon/pagestore_smgr.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/xact.h" +#include "access/xlog.h" +#include "access/xlogutils.h" +#include "access/xloginsert.h" +#include "access/xlogrecovery.h" +#include "access/xlog_internal.h" +#include "catalog/pg_class.h" +#include "pagestore_client.h" +#include "pagestore_client.h" +#include "storage/smgr.h" +#include "access/xlogdefs.h" +#include "postmaster/interrupt.h" +#include "replication/walsender.h" +#include "storage/bufmgr.h" +#include "storage/md.h" +#include "fmgr.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "catalog/pg_tablespace_d.h" +#include "postmaster/autovacuum.h" + +/* + * If DEBUG_COMPARE_LOCAL is defined, we pass through all the SMGR API + * calls to md.c, and *also* do the calls to the Page Server. On every + * read, compare the versions we read from local disk and Page Server, + * and Assert that they are identical. 
+ */ +/* #define DEBUG_COMPARE_LOCAL */ + +#ifdef DEBUG_COMPARE_LOCAL +#include "access/nbtree.h" +#include "storage/bufpage.h" +#include "access/xlog_internal.h" + +static char *hexdump_page(char *page); +#endif + +#define IS_LOCAL_REL(reln) (reln->smgr_rnode.node.dbNode != 0 && reln->smgr_rnode.node.relNode > FirstNormalObjectId) + +const int SmgrTrace = DEBUG5; + +page_server_api *page_server; + +/* GUCs */ +char *page_server_connstring; // with substituted password +char *zenith_timeline; +char *zenith_tenant; +bool wal_redo = false; +int32 max_cluster_size; + +/* unlogged relation build states */ +typedef enum +{ + UNLOGGED_BUILD_NOT_IN_PROGRESS = 0, + UNLOGGED_BUILD_PHASE_1, + UNLOGGED_BUILD_PHASE_2, + UNLOGGED_BUILD_NOT_PERMANENT +} UnloggedBuildPhase; + +static SMgrRelation unlogged_build_rel = NULL; +static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; + +StringInfoData +zm_pack_request(ZenithRequest *msg) +{ + StringInfoData s; + + initStringInfo(&s); + pq_sendbyte(&s, msg->tag); + + switch (messageTag(msg)) + { + /* pagestore_client -> pagestore */ + case T_ZenithExistsRequest: + { + ZenithExistsRequest *msg_req = (ZenithExistsRequest *) msg; + + pq_sendbyte(&s, msg_req->req.latest); + pq_sendint64(&s, msg_req->req.lsn); + pq_sendint32(&s, msg_req->rnode.spcNode); + pq_sendint32(&s, msg_req->rnode.dbNode); + pq_sendint32(&s, msg_req->rnode.relNode); + pq_sendbyte(&s, msg_req->forknum); + + break; + } + case T_ZenithNblocksRequest: + { + ZenithNblocksRequest *msg_req = (ZenithNblocksRequest *) msg; + + pq_sendbyte(&s, msg_req->req.latest); + pq_sendint64(&s, msg_req->req.lsn); + pq_sendint32(&s, msg_req->rnode.spcNode); + pq_sendint32(&s, msg_req->rnode.dbNode); + pq_sendint32(&s, msg_req->rnode.relNode); + pq_sendbyte(&s, msg_req->forknum); + + break; + } + case T_ZenithDbSizeRequest: + { + ZenithDbSizeRequest *msg_req = (ZenithDbSizeRequest *) msg; + + pq_sendbyte(&s, msg_req->req.latest); + pq_sendint64(&s, 
msg_req->req.lsn); + pq_sendint32(&s, msg_req->dbNode); + + break; + } + case T_ZenithGetPageRequest: + { + ZenithGetPageRequest *msg_req = (ZenithGetPageRequest *) msg; + + pq_sendbyte(&s, msg_req->req.latest); + pq_sendint64(&s, msg_req->req.lsn); + pq_sendint32(&s, msg_req->rnode.spcNode); + pq_sendint32(&s, msg_req->rnode.dbNode); + pq_sendint32(&s, msg_req->rnode.relNode); + pq_sendbyte(&s, msg_req->forknum); + pq_sendint32(&s, msg_req->blkno); + + break; + } + + /* pagestore -> pagestore_client. We never need to create these. */ + case T_ZenithExistsResponse: + case T_ZenithNblocksResponse: + case T_ZenithGetPageResponse: + case T_ZenithErrorResponse: + case T_ZenithDbSizeResponse: + default: + elog(ERROR, "unexpected zenith message tag 0x%02x", msg->tag); + break; + } + return s; +} + +ZenithResponse * +zm_unpack_response(StringInfo s) +{ + ZenithMessageTag tag = pq_getmsgbyte(s); + ZenithResponse *resp = NULL; + + switch (tag) + { + /* pagestore -> pagestore_client */ + case T_ZenithExistsResponse: + { + ZenithExistsResponse *msg_resp = palloc0(sizeof(ZenithExistsResponse)); + + msg_resp->tag = tag; + msg_resp->exists = pq_getmsgbyte(s); + pq_getmsgend(s); + + resp = (ZenithResponse *) msg_resp; + break; + } + + case T_ZenithNblocksResponse: + { + ZenithNblocksResponse *msg_resp = palloc0(sizeof(ZenithNblocksResponse)); + + msg_resp->tag = tag; + msg_resp->n_blocks = pq_getmsgint(s, 4); + pq_getmsgend(s); + + resp = (ZenithResponse *) msg_resp; + break; + } + + case T_ZenithGetPageResponse: + { + ZenithGetPageResponse *msg_resp = palloc0(offsetof(ZenithGetPageResponse, page) + BLCKSZ); + + msg_resp->tag = tag; + /* XXX: should be varlena */ + memcpy(msg_resp->page, pq_getmsgbytes(s, BLCKSZ), BLCKSZ); + pq_getmsgend(s); + + resp = (ZenithResponse *) msg_resp; + break; + } + + case T_ZenithDbSizeResponse: + { + ZenithDbSizeResponse *msg_resp = palloc0(sizeof(ZenithDbSizeResponse)); + + msg_resp->tag = tag; + msg_resp->db_size = pq_getmsgint64(s); + 
pq_getmsgend(s); + + resp = (ZenithResponse *) msg_resp; + break; + } + + case T_ZenithErrorResponse: + { + ZenithErrorResponse *msg_resp; + size_t msglen; + const char *msgtext; + + msgtext = pq_getmsgrawstring(s); + msglen = strlen(msgtext); + + msg_resp = palloc0(sizeof(ZenithErrorResponse) + msglen + 1); + msg_resp->tag = tag; + memcpy(msg_resp->message, msgtext, msglen + 1); + pq_getmsgend(s); + + resp = (ZenithResponse *) msg_resp; + break; + } + + /* + * pagestore_client -> pagestore + * + * We create these ourselves, and don't need to decode them. + */ + case T_ZenithExistsRequest: + case T_ZenithNblocksRequest: + case T_ZenithGetPageRequest: + case T_ZenithDbSizeRequest: + default: + elog(ERROR, "unexpected zenith message tag 0x%02x", tag); + break; + } + + return resp; +} + +/* dump to json for debugging / error reporting purposes */ +char * +zm_to_string(ZenithMessage *msg) +{ + StringInfoData s; + + initStringInfo(&s); + + switch (messageTag(msg)) + { + /* pagestore_client -> pagestore */ + case T_ZenithExistsRequest: + { + ZenithExistsRequest *msg_req = (ZenithExistsRequest *) msg; + + appendStringInfoString(&s, "{\"type\": \"ZenithExistsRequest\""); + appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", + msg_req->rnode.spcNode, + msg_req->rnode.dbNode, + msg_req->rnode.relNode); + appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); + appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfoChar(&s, '}'); + break; + } + + case T_ZenithNblocksRequest: + { + ZenithNblocksRequest *msg_req = (ZenithNblocksRequest *) msg; + + appendStringInfoString(&s, "{\"type\": \"ZenithNblocksRequest\""); + appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", + msg_req->rnode.spcNode, + msg_req->rnode.dbNode, + msg_req->rnode.relNode); + appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", 
LSN_FORMAT_ARGS(msg_req->req.lsn)); + appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfoChar(&s, '}'); + break; + } + + case T_ZenithGetPageRequest: + { + ZenithGetPageRequest *msg_req = (ZenithGetPageRequest *) msg; + + appendStringInfoString(&s, "{\"type\": \"ZenithGetPageRequest\""); + appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", + msg_req->rnode.spcNode, + msg_req->rnode.dbNode, + msg_req->rnode.relNode); + appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); + appendStringInfo(&s, ", \"blkno\": %u", msg_req->blkno); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); + appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfoChar(&s, '}'); + break; + } + case T_ZenithDbSizeRequest: + { + ZenithDbSizeRequest *msg_req = (ZenithDbSizeRequest *) msg; + + appendStringInfoString(&s, "{\"type\": \"ZenithDbSizeRequest\""); + appendStringInfo(&s, ", \"dbnode\": \"%u\"", msg_req->dbNode); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); + appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfoChar(&s, '}'); + break; + } + + + /* pagestore -> pagestore_client */ + case T_ZenithExistsResponse: + { + ZenithExistsResponse *msg_resp = (ZenithExistsResponse *) msg; + + appendStringInfoString(&s, "{\"type\": \"ZenithExistsResponse\""); + appendStringInfo(&s, ", \"exists\": %d}", + msg_resp->exists + ); + appendStringInfoChar(&s, '}'); + + break; + } + case T_ZenithNblocksResponse: + { + ZenithNblocksResponse *msg_resp = (ZenithNblocksResponse *) msg; + + appendStringInfoString(&s, "{\"type\": \"ZenithNblocksResponse\""); + appendStringInfo(&s, ", \"n_blocks\": %u}", + msg_resp->n_blocks + ); + appendStringInfoChar(&s, '}'); + + break; + } + case T_ZenithGetPageResponse: + { +#if 0 + ZenithGetPageResponse *msg_resp = (ZenithGetPageResponse *) msg; +#endif + + appendStringInfoString(&s, "{\"type\": 
\"ZenithGetPageResponse\""); + appendStringInfo(&s, ", \"page\": \"XXX\"}"); + appendStringInfoChar(&s, '}'); + break; + } + case T_ZenithErrorResponse: + { + ZenithErrorResponse *msg_resp = (ZenithErrorResponse *) msg; + + /* FIXME: escape double-quotes in the message */ + appendStringInfoString(&s, "{\"type\": \"ZenithErrorResponse\""); + appendStringInfo(&s, ", \"message\": \"%s\"}", msg_resp->message); + appendStringInfoChar(&s, '}'); + break; + } + case T_ZenithDbSizeResponse: + { + ZenithDbSizeResponse *msg_resp = (ZenithDbSizeResponse *) msg; + + appendStringInfoString(&s, "{\"type\": \"ZenithDbSizeResponse\""); + appendStringInfo(&s, ", \"db_size\": %ld}", + msg_resp->db_size + ); + appendStringInfoChar(&s, '}'); + + break; + } + + default: + appendStringInfo(&s, "{\"type\": \"unknown 0x%02x\"", msg->tag); + } + return s.data; +} + +/* + * Wrapper around log_newpage() that makes a temporary copy of the block and + * WAL-logs that. This makes it safe to use while holding only a shared lock + * on the page, see XLogSaveBufferForHint. We don't use XLogSaveBufferForHint + * directly because it skips the logging if the LSN is new enough. + */ +static XLogRecPtr +log_newpage_copy(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno, + Page page, bool page_std) +{ + PGAlignedBlock copied_buffer; + + memcpy(copied_buffer.data, page, BLCKSZ); + return log_newpage(rnode, forkNum, blkno, copied_buffer.data, page_std); +} + +/* + * Is 'buffer' identical to a freshly initialized empty heap page? + */ +static bool +PageIsEmptyHeapPage(char *buffer) +{ + PGAlignedBlock empty_page; + + PageInit((Page) empty_page.data, BLCKSZ, 0); + + return memcmp(buffer, empty_page.data, BLCKSZ) == 0; +} + +static void +zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer) +{ + XLogRecPtr lsn = PageGetLSN(buffer); + + if (ShutdownRequestPending) + return; + + /* + * Whenever a VM or FSM page is evicted, WAL-log it. 
FSM and (some) VM + * changes are not WAL-logged when the changes are made, so this is our + * last chance to log them, otherwise they're lost. That's OK for + * correctness, the non-logged updates are not critical. But we want to + * have a reasonably up-to-date VM and FSM in the page server. + */ + if (forknum == FSM_FORKNUM && !RecoveryInProgress()) + { + /* FSM is never WAL-logged and we don't care. */ + XLogRecPtr recptr; + + recptr = log_newpage_copy(&reln->smgr_rnode.node, forknum, blocknum, buffer, false); + XLogFlush(recptr); + lsn = recptr; + ereport(SmgrTrace, + (errmsg("FSM page %u of relation %u/%u/%u.%u was force logged. Evicted at lsn=%X/%X", + blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, LSN_FORMAT_ARGS(lsn)))); + } + else if (forknum == VISIBILITYMAP_FORKNUM && !RecoveryInProgress()) + { + /* + * Always WAL-log vm. We should never miss clearing visibility map + * bits. + * + * TODO Is it too bad for performance? Hopefully we do not evict + * actively used vm too often. + */ + XLogRecPtr recptr; + + recptr = log_newpage_copy(&reln->smgr_rnode.node, forknum, blocknum, buffer, false); + XLogFlush(recptr); + lsn = recptr; + + ereport(SmgrTrace, + (errmsg("Visibilitymap page %u of relation %u/%u/%u.%u was force logged at lsn=%X/%X", + blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, LSN_FORMAT_ARGS(lsn)))); + } + else if (lsn == InvalidXLogRecPtr) + { + /* + * When PostgreSQL extends a relation, it calls smgrextend() with an all-zeros pages, + * and we can just ignore that in Zenith. We do need to remember the new size, + * though, so that smgrnblocks() returns the right answer after the rel has + * been extended. We rely on the relsize cache for that. + * + * A completely empty heap page doesn't need to be WAL-logged, either. The + * heapam can leave such a page behind, if e.g. 
an insert errors out after + * initializing the page, but before it has inserted the tuple and WAL-logged + * the change. When we read the page from the page server, it will come back + * as all-zeros. That's OK, the heapam will initialize an all-zeros page on + * first use. + * + * In other scenarios, evicting a dirty page with no LSN is a bad sign: it implies + * that the page was not WAL-logged, and its contents will be lost when it's + * evicted. + */ + if (PageIsNew(buffer)) + { + ereport(SmgrTrace, + (errmsg("Page %u of relation %u/%u/%u.%u is all-zeros", + blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum))); + } + else if (PageIsEmptyHeapPage(buffer)) + { + ereport(SmgrTrace, + (errmsg("Page %u of relation %u/%u/%u.%u is an empty heap page with no LSN", + blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum))); + } + else + { + ereport(PANIC, + (errmsg("Page %u of relation %u/%u/%u.%u is evicted with zero LSN", + blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum))); + } + } + else + { + ereport(SmgrTrace, + (errmsg("Page %u of relation %u/%u/%u.%u is already wal logged at lsn=%X/%X", + blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, LSN_FORMAT_ARGS(lsn)))); + } + + /* + * Remember the LSN on this page. When we read the page again, we must + * read the same or newer version of it. + */ + SetLastWrittenPageLSN(lsn); +} + + +/* + * zenith_init() -- Initialize private state + */ +void +zenith_init(void) +{ + /* noop */ +#ifdef DEBUG_COMPARE_LOCAL + mdinit(); +#endif +} + +/* + * GetXLogInsertRecPtr uses XLogBytePosToRecPtr to convert logical insert (reserved) position + * to physical position in WAL. 
It always adds SizeOfXLogShortPHD: + * seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD; + * so even if there are no records on the page, offset will be SizeOfXLogShortPHD. + * It may cause problems with XLogFlush. So return pointer backward to the origin of the page. + */ +static XLogRecPtr +zm_adjust_lsn(XLogRecPtr lsn) +{ + /* + * If lsn points to the beginning of the first record on page or segment, then + * "return" it back to the page origin + */ + if ((lsn & (XLOG_BLCKSZ - 1)) == SizeOfXLogShortPHD) + { + lsn -= SizeOfXLogShortPHD; + } + else if ((lsn & (wal_segment_size - 1)) == SizeOfXLogLongPHD) + { + lsn -= SizeOfXLogLongPHD; + } + return lsn; +} + +/* + * Return LSN for requesting pages and number of blocks from page server + */ +static XLogRecPtr +zenith_get_request_lsn(bool *latest) +{ + XLogRecPtr lsn; + + if (RecoveryInProgress()) + { + *latest = false; + lsn = GetXLogReplayRecPtr(NULL); + elog(DEBUG1, "zenith_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ", + (uint32) ((lsn) >> 32), (uint32) (lsn)); + } + else if (am_walsender) + { + *latest = true; + lsn = InvalidXLogRecPtr; + elog(DEBUG1, "am walsender zenith_get_request_lsn lsn 0 "); + } + else + { + XLogRecPtr flushlsn; + + /* + * Use the latest LSN that was evicted from the buffer cache. Any + * pages modified by later WAL records must still be in the buffer cache, + * so our request cannot concern those. + */ + *latest = true; + lsn = GetLastWrittenPageLSN(); + Assert(lsn != InvalidXLogRecPtr); + elog(DEBUG1, "zenith_get_request_lsn GetLastWrittenPageLSN lsn %X/%X ", + (uint32) ((lsn) >> 32), (uint32) (lsn)); + + lsn = zm_adjust_lsn(lsn); + + /* + * Is it possible that the last-written LSN is ahead of last flush + * LSN? Generally not, we shouldn't evict a page from the buffer cache + * before all its modifications have been safely flushed. That's the + * "WAL before data" rule. 
However, such a case does exist at index building, + * _bt_blwritepage logs the full page without flushing WAL before + * smgrextend (files are fsynced before build ends). + */ + flushlsn = GetFlushRecPtr(NULL); + if (lsn > flushlsn) + { + elog(DEBUG5, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X", + (uint32) (lsn >> 32), (uint32) lsn, + (uint32) (flushlsn >> 32), (uint32) flushlsn); + XLogFlush(lsn); + } + } + + return lsn; +} + + +/* + * zenith_exists() -- Does the physical file exist? + */ +bool +zenith_exists(SMgrRelation reln, ForkNumber forkNum) +{ + bool exists; + ZenithResponse *resp; + BlockNumber n_blocks; + bool latest; + XLogRecPtr request_lsn; + + switch (reln->smgr_relpersistence) + { + case 0: + /* + * We don't know if it's an unlogged rel stored locally, or permanent + * rel stored in the page server. First check if it exists locally. + * If it does, great. Otherwise check if it exists in the page server. + */ + if (mdexists(reln, forkNum)) + return true; + break; + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + return mdexists(reln, forkNum); + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + if (get_cached_relsize(reln->smgr_rnode.node, forkNum, &n_blocks)) + { + return true; + } + + /* + * \d+ on a view calls smgrexists with 0/0/0 relfilenode. The page server + * will error out if you check that, because the whole dbdir for tablespace + * 0, db 0 doesn't exist. We possibly should change the page server to + * accept that and return 'false', to be consistent with mdexists(). But + * we probably also should fix pg_table_size() to not call smgrexists() + * with bogus relfilenode. + * + * For now, handle that special case here. 
+ */ + if (reln->smgr_rnode.node.spcNode == 0 && + reln->smgr_rnode.node.dbNode == 0 && + reln->smgr_rnode.node.relNode == 0) + { + return false; + } + + request_lsn = zenith_get_request_lsn(&latest); + { + ZenithExistsRequest request = { + .req.tag = T_ZenithExistsRequest, + .req.latest = latest, + .req.lsn = request_lsn, + .rnode = reln->smgr_rnode.node, + .forknum = forkNum + }; + + resp = page_server->request((ZenithRequest *) &request); + } + + switch (resp->tag) + { + case T_ZenithExistsResponse: + exists = ((ZenithExistsResponse *) resp)->exists; + break; + + case T_ZenithErrorResponse: + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg("could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn), + errdetail("page server returned error: %s", + ((ZenithErrorResponse *) resp)->message))); + break; + + default: + elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); + } + pfree(resp); + return exists; +} + +/* + * zenith_create() -- Create a new relation on zenithd storage + * + * If isRedo is true, it's okay for the relation to exist already. + */ +void +zenith_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) +{ + switch (reln->smgr_relpersistence) + { + case 0: + elog(ERROR, "cannot call smgrcreate() on rel with unknown persistence"); + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdcreate(reln, forkNum, isRedo); + return; + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + elog(SmgrTrace, "Create relation %u/%u/%u.%u", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forkNum); + + /* + * Newly created relation is empty, remember that in the relsize cache. 
+ * + * FIXME: This is currently not just an optimization, but required for + * correctness. Postgres can call smgrnblocks() on the newly-created + * relation. Currently, we don't call SetLastWrittenPageLSN() when a new + * relation is created, so if we didn't remember the size in the relsize + * cache, we might call smgrnblocks() on the newly-created relation before + * the creation WAL record has been received by the page server. + */ + set_cached_relsize(reln->smgr_rnode.node, forkNum, 0); + +#ifdef DEBUG_COMPARE_LOCAL + if (IS_LOCAL_REL(reln)) + mdcreate(reln, forkNum, isRedo); +#endif +} + +/* + * zenith_unlink() -- Unlink a relation. + * + * Note that we're passed a RelFileNodeBackend --- by the time this is called, + * there won't be an SMgrRelation hashtable entry anymore. + * + * forkNum can be a fork number to delete a specific fork, or InvalidForkNumber + * to delete all forks. + * + * + * If isRedo is true, it's unsurprising for the relation to be already gone. + * Also, we should remove the file immediately instead of queuing a request + * for later, since during redo there's no possibility of creating a + * conflicting relation. + * + * Note: any failure should be reported as WARNING not ERROR, because + * we are usually not in a transaction anymore when this is called. + */ +void +zenith_unlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) +{ + /* + * Might or might not exist locally, depending on whether it's + * an unlogged or permanent relation (or if DEBUG_COMPARE_LOCAL is + * set). Try to unlink, it won't do any harm if the file doesn't + * exist. + */ + mdunlink(rnode, forkNum, isRedo); + if (!RelFileNodeBackendIsTemp(rnode)) { + forget_cached_relsize(rnode.node, forkNum); + } +} + +/* + * zenith_extend() -- Add a block to the specified relation. + * + * The semantics are nearly the same as mdwrite(): write at the + * specified position. 
However, this is to be used for the case of + * extending a relation (i.e., blocknum is at or beyond the current + * EOF). Note that we assume writing a block beyond current EOF + * causes intervening file space to become filled with zeroes. + */ +void +zenith_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, + char *buffer, bool skipFsync) +{ + XLogRecPtr lsn; + + switch (reln->smgr_relpersistence) + { + case 0: + elog(ERROR, "cannot call smgrextend() on rel with unknown persistence"); + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdextend(reln, forkNum, blkno, buffer, skipFsync); + return; + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + /* + * Check that the cluster size limit has not been exceeded. + * + * Temporary and unlogged relations are not included in the cluster size measured + * by the page server, so ignore those. Autovacuum processes are also exempt. + */ + if (max_cluster_size > 0 && + reln->smgr_relpersistence == RELPERSISTENCE_PERMANENT && + !IsAutoVacuumWorkerProcess()) + { + uint64 current_size = GetZenithCurrentClusterSize(); + + if (current_size >= ((uint64) max_cluster_size) * 1024 * 1024) + ereport(ERROR, + (errcode(ERRCODE_DISK_FULL), + errmsg("could not extend file because cluster size limit (%d MB) has been exceeded", + max_cluster_size), + errhint("This limit is defined by neon.max_cluster_size GUC"))); + } + + zenith_wallog_page(reln, forkNum, blkno, buffer); + set_cached_relsize(reln->smgr_rnode.node, forkNum, blkno + 1); + + lsn = PageGetLSN(buffer); + elog(SmgrTrace, "smgrextend called for %u/%u/%u.%u blk %u, page LSN: %X/%08X", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forkNum, blkno, + (uint32) (lsn >> 32), (uint32) lsn); + +#ifdef DEBUG_COMPARE_LOCAL + if (IS_LOCAL_REL(reln)) + mdextend(reln, forkNum, blkno, buffer, skipFsync); +#endif +} + +/* + * 
zenith_open() -- Initialize newly-opened relation. + */ +void +zenith_open(SMgrRelation reln) +{ + /* + * We don't have anything special to do here. Call mdopen() to let md.c + * initialize itself. That's only needed for temporary or unlogged + * relations, but it's dirt cheap so do it always to make sure the md + * fields are initialized, for debugging purposes if nothing else. + */ + mdopen(reln); + + /* no work */ + elog(SmgrTrace, "[ZENITH_SMGR] open noop"); +} + +/* + * zenith_close() -- Close the specified relation, if it isn't closed already. + */ +void +zenith_close(SMgrRelation reln, ForkNumber forknum) +{ + /* + * Let md.c close it, if it had it open. Doesn't hurt to do this + * even for permanent relations that have no local storage. + */ + mdclose(reln, forknum); +} + +/* + * zenith_prefetch() -- Initiate asynchronous read of the specified block of a relation + */ +bool +zenith_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) +{ + switch (reln->smgr_relpersistence) + { + case 0: + /* probably shouldn't happen, but ignore it */ + break; + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + return mdprefetch(reln, forknum, blocknum); + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + /* not implemented */ + elog(SmgrTrace, "[ZENITH_SMGR] prefetch noop"); + return true; +} + +/* + * zenith_writeback() -- Tell the kernel to write pages back to storage. + * + * This accepts a range of blocks because flushing several pages at once is + * considerably more efficient than doing so individually. 
+ */ +void +zenith_writeback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks) +{ + switch (reln->smgr_relpersistence) + { + case 0: + /* mdwriteback() does nothing if the file doesn't exist */ + mdwriteback(reln, forknum, blocknum, nblocks); + break; + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdwriteback(reln, forknum, blocknum, nblocks); + return; + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + /* not implemented */ + elog(SmgrTrace, "[ZENITH_SMGR] writeback noop"); + +#ifdef DEBUG_COMPARE_LOCAL + if (IS_LOCAL_REL(reln)) + mdwriteback(reln, forknum, blocknum, nblocks); +#endif +} + +/* + * While function is defined in the zenith extension it's used within neon_test_utils directly. + * To avoid breaking tests in the runtime please keep function signature in sync. + */ +void zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, + XLogRecPtr request_lsn, bool request_latest, char *buffer) +{ + ZenithResponse *resp; + + { + ZenithGetPageRequest request = { + .req.tag = T_ZenithGetPageRequest, + .req.latest = request_latest, + .req.lsn = request_lsn, + .rnode = rnode, + .forknum = forkNum, + .blkno = blkno + }; + + resp = page_server->request((ZenithRequest *) &request); + } + + switch (resp->tag) + { + case T_ZenithGetPageResponse: + memcpy(buffer, ((ZenithGetPageResponse *) resp)->page, BLCKSZ); + break; + + case T_ZenithErrorResponse: + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg("could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X", + blkno, + rnode.spcNode, + rnode.dbNode, + rnode.relNode, + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn), + errdetail("page server returned error: %s", + ((ZenithErrorResponse *) resp)->message))); + break; + + default: + elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); + } + + 
pfree(resp); +} + +/* + * zenith_read() -- Read the specified block from a relation. + */ +void +zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, + char *buffer) +{ + bool latest; + XLogRecPtr request_lsn; + + switch (reln->smgr_relpersistence) + { + case 0: + elog(ERROR, "cannot call smgrread() on rel with unknown persistence"); + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdread(reln, forkNum, blkno, buffer); + return; + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + request_lsn = zenith_get_request_lsn(&latest); + zenith_read_at_lsn(reln->smgr_rnode.node, forkNum, blkno, request_lsn, latest, buffer); + +#ifdef DEBUG_COMPARE_LOCAL + if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) + { + char pageserver_masked[BLCKSZ]; + char mdbuf[BLCKSZ]; + char mdbuf_masked[BLCKSZ]; + + mdread(reln, forkNum, blkno, mdbuf); + + memcpy(pageserver_masked, buffer, BLCKSZ); + memcpy(mdbuf_masked, mdbuf, BLCKSZ); + + if (PageIsNew(mdbuf)) + { + if (!PageIsNew(pageserver_masked)) + { + elog(PANIC, "page is new in MD but not in Page Server at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", + blkno, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + hexdump_page(buffer)); + } + } + else if (PageIsNew(buffer)) + { + elog(PANIC, "page is new in Page Server but not in MD at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", + blkno, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + hexdump_page(mdbuf)); + } + else if (PageGetSpecialSize(mdbuf) == 0) + { + /* assume heap */ + RmgrTable[RM_HEAP_ID].rm_mask(mdbuf_masked, blkno); + RmgrTable[RM_HEAP_ID].rm_mask(pageserver_masked, blkno); + + if (memcmp(mdbuf_masked, 
pageserver_masked, BLCKSZ) != 0) + { + elog(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", + blkno, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + hexdump_page(mdbuf_masked), + hexdump_page(pageserver_masked)); + } + } + else if (PageGetSpecialSize(mdbuf) == MAXALIGN(sizeof(BTPageOpaqueData))) + { + if (((BTPageOpaqueData *) PageGetSpecialPointer(mdbuf))->btpo_cycleid < MAX_BT_CYCLE_ID) + { + /* assume btree */ + RmgrTable[RM_BTREE_ID].rm_mask(mdbuf_masked, blkno); + RmgrTable[RM_BTREE_ID].rm_mask(pageserver_masked, blkno); + + if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0) + { + elog(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", + blkno, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + hexdump_page(mdbuf_masked), + hexdump_page(pageserver_masked)); + } + } + } + } +#endif +} + +#ifdef DEBUG_COMPARE_LOCAL +static char * +hexdump_page(char *page) +{ + StringInfoData result; + + initStringInfo(&result); + + for (int i = 0; i < BLCKSZ; i++) + { + if (i % 8 == 0) + appendStringInfo(&result, " "); + if (i % 40 == 0) + appendStringInfo(&result, "\n"); + appendStringInfo(&result, "%02x", (unsigned char) (page[i])); + } + + return result.data; +} +#endif + +/* + * zenith_write() -- Write the supplied block at the appropriate location. + * + * This is to be used only for updating already-existing blocks of a + * relation (ie, those before the current EOF). To extend a relation, + * use mdextend(). 
+ */ +void +zenith_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + char *buffer, bool skipFsync) +{ + XLogRecPtr lsn; + + switch (reln->smgr_relpersistence) + { + case 0: + /* This is a bit tricky. Check if the relation exists locally */ + if (mdexists(reln, forknum)) + { + /* It exists locally. Guess it's unlogged then. */ + mdwrite(reln, forknum, blocknum, buffer, skipFsync); + + /* + * We could set relpersistence now that we have determined + * that it's local. But we don't dare to do it, because that + * would immediately allow reads as well, which shouldn't + * happen. We could cache it with a different 'relpersistence' + * value, but this isn't performance critical. + */ + return; + } + break; + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdwrite(reln, forknum, blocknum, buffer, skipFsync); + return; + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + zenith_wallog_page(reln, forknum, blocknum, buffer); + + lsn = PageGetLSN(buffer); + elog(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, blocknum, + (uint32) (lsn >> 32), (uint32) lsn); + +#ifdef DEBUG_COMPARE_LOCAL + if (IS_LOCAL_REL(reln)) + mdwrite(reln, forknum, blocknum, buffer, skipFsync); +#endif +} + +/* + * zenith_nblocks() -- Get the number of blocks stored in a relation. 
+ */ +BlockNumber +zenith_nblocks(SMgrRelation reln, ForkNumber forknum) +{ + ZenithResponse *resp; + BlockNumber n_blocks; + bool latest; + XLogRecPtr request_lsn; + + switch (reln->smgr_relpersistence) + { + case 0: + elog(ERROR, "cannot call smgrnblocks() on rel with unknown persistence"); + break; + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + return mdnblocks(reln, forknum); + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + if (get_cached_relsize(reln->smgr_rnode.node, forknum, &n_blocks)) + { + elog(SmgrTrace, "cached nblocks for %u/%u/%u.%u: %u blocks", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, n_blocks); + return n_blocks; + } + + request_lsn = zenith_get_request_lsn(&latest); + { + ZenithNblocksRequest request = { + .req.tag = T_ZenithNblocksRequest, + .req.latest = latest, + .req.lsn = request_lsn, + .rnode = reln->smgr_rnode.node, + .forknum = forknum, + }; + + resp = page_server->request((ZenithRequest *) &request); + } + + switch (resp->tag) + { + case T_ZenithNblocksResponse: + n_blocks = ((ZenithNblocksResponse *) resp)->n_blocks; + break; + + case T_ZenithErrorResponse: + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg("could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, + (uint32) (request_lsn >> 32), (uint32) request_lsn), + errdetail("page server returned error: %s", + ((ZenithErrorResponse *) resp)->message))); + break; + + default: + elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); + } + update_cached_relsize(reln->smgr_rnode.node, forknum, n_blocks); + + elog(SmgrTrace, "zenith_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks", + reln->smgr_rnode.node.spcNode, + 
reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + n_blocks); + + pfree(resp); + return n_blocks; +} + +/* + * zenith_dbsize() -- Get the size of the database in bytes. + */ +int64 +zenith_dbsize(Oid dbNode) +{ + ZenithResponse *resp; + int64 db_size; + XLogRecPtr request_lsn; + bool latest; + + request_lsn = zenith_get_request_lsn(&latest); + { + ZenithDbSizeRequest request = { + .req.tag = T_ZenithDbSizeRequest, + .req.latest = latest, + .req.lsn = request_lsn, + .dbNode = dbNode, + }; + + resp = page_server->request((ZenithRequest *) &request); + } + + switch (resp->tag) + { + case T_ZenithDbSizeResponse: + db_size = ((ZenithDbSizeResponse *) resp)->db_size; + break; + + case T_ZenithErrorResponse: + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg("could not read db size of db %u from page server at lsn %X/%08X", + dbNode, + (uint32) (request_lsn >> 32), (uint32) request_lsn), + errdetail("page server returned error: %s", + ((ZenithErrorResponse *) resp)->message))); + break; + + default: + elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); + } + + elog(SmgrTrace, "zenith_dbsize: db %u (request LSN %X/%08X): %ld bytes", + dbNode, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + db_size); + + pfree(resp); + return db_size; +} + +/* + * zenith_truncate() -- Truncate relation to specified number of blocks.
+ */ +void +zenith_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) +{ + XLogRecPtr lsn; + + switch (reln->smgr_relpersistence) + { + case 0: + elog(ERROR, "cannot call smgrtruncate() on rel with unknown persistence"); + break; + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdtruncate(reln, forknum, nblocks); + return; + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + set_cached_relsize(reln->smgr_rnode.node, forknum, nblocks); + + /* + * Truncating a relation drops all its buffers from the buffer cache + * without calling smgrwrite() on them. But we must account for that in + * our tracking of last-written-LSN all the same: any future smgrnblocks() + * request must return the new size after the truncation. We don't know + * what the LSN of the truncation record was, so be conservative and use + * the most recently inserted WAL record's LSN. + */ + lsn = GetXLogInsertRecPtr(); + + lsn = zm_adjust_lsn(lsn); + + /* + * Flush it, too. We don't actually care about it here, but let's uphold + * the invariant that last-written LSN <= flush LSN. + */ + XLogFlush(lsn); + + SetLastWrittenPageLSN(lsn); + +#ifdef DEBUG_COMPARE_LOCAL + if (IS_LOCAL_REL(reln)) + mdtruncate(reln, forknum, nblocks); +#endif +} + +/* + * zenith_immedsync() -- Immediately sync a relation to stable storage. + * + * Note that only writes already issued are synced; this routine knows + * nothing of dirty buffers that may exist inside the buffer manager. We + * sync active and inactive segments; smgrDoPendingSyncs() relies on this. + * Consider a relation skipping WAL. Suppose a checkpoint syncs blocks of + * some segment, then mdtruncate() renders that segment inactive. If we + * crash before the next checkpoint syncs the newly-inactive segment, that + * segment may survive recovery, reintroducing unwanted data into the table. 
+ */ +void +zenith_immedsync(SMgrRelation reln, ForkNumber forknum) +{ + switch (reln->smgr_relpersistence) + { + case 0: + elog(ERROR, "cannot call smgrimmedsync() on rel with unknown persistence"); + break; + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdimmedsync(reln, forknum); + return; + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + elog(SmgrTrace, "[ZENITH_SMGR] immedsync noop"); + +#ifdef DEBUG_COMPARE_LOCAL + if (IS_LOCAL_REL(reln)) + mdimmedsync(reln, forknum); +#endif +} + +/* + * zenith_start_unlogged_build() -- Starting build operation on a rel. + * + * Some indexes are built in two phases, by first populating the table with + * regular inserts, using the shared buffer cache but skipping WAL-logging, + * and WAL-logging the whole relation after it's done. Zenith relies on the + * WAL to reconstruct pages, so we cannot use the page server in the + * first phase when the changes are not logged. + */ +static void +zenith_start_unlogged_build(SMgrRelation reln) +{ + /* + * Currently, there can be only one unlogged relation build operation in + * progress at a time. That's enough for the current usage. 
+ */ + if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS) + elog(ERROR, "unlogged relation build is already in progress"); + Assert(unlogged_build_rel == NULL); + + ereport(SmgrTrace, + (errmsg("starting unlogged build of relation %u/%u/%u", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode))); + + switch (reln->smgr_relpersistence) + { + case 0: + elog(ERROR, "cannot call smgr_start_unlogged_build() on rel with unknown persistence"); + break; + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + unlogged_build_rel = reln; + unlogged_build_phase = UNLOGGED_BUILD_NOT_PERMANENT; + return; + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + if (smgrnblocks(reln, MAIN_FORKNUM) != 0) + elog(ERROR, "cannot perform unlogged index build, index is not empty "); + + unlogged_build_rel = reln; + unlogged_build_phase = UNLOGGED_BUILD_PHASE_1; + + /* Make the relation look like it's unlogged */ + reln->smgr_relpersistence = RELPERSISTENCE_UNLOGGED; + + /* + * FIXME: should we pass isRedo true to create the tablespace dir if it + * doesn't exist? Is it needed? + */ + mdcreate(reln, MAIN_FORKNUM, false); +} + +/* + * zenith_finish_unlogged_build_phase_1() + * + * Call this after you have finished populating a relation in unlogged mode, + * before you start WAL-logging it. 
+ */ +static void +zenith_finish_unlogged_build_phase_1(SMgrRelation reln) +{ + Assert(unlogged_build_rel == reln); + + ereport(SmgrTrace, + (errmsg("finishing phase 1 of unlogged build of relation %u/%u/%u", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode))); + + if (unlogged_build_phase == UNLOGGED_BUILD_NOT_PERMANENT) + return; + + Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_1); + Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED); + + unlogged_build_phase = UNLOGGED_BUILD_PHASE_2; +} + +/* + * zenith_end_unlogged_build() -- Finish an unlogged rel build. + * + * Call this after you have finished WAL-logging a relation that was + * first populated without WAL-logging. + * + * This removes the local copy of the rel, since it's now been fully + * WAL-logged and is present in the page server. + */ +static void +zenith_end_unlogged_build(SMgrRelation reln) +{ + Assert(unlogged_build_rel == reln); + + ereport(SmgrTrace, + (errmsg("ending unlogged build of relation %u/%u/%u", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode))); + + if (unlogged_build_phase != UNLOGGED_BUILD_NOT_PERMANENT) + { + RelFileNodeBackend rnode; + + Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_2); + Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED); + + /* Make the relation look permanent again */ + reln->smgr_relpersistence = RELPERSISTENCE_PERMANENT; + + /* Remove local copy */ + rnode = reln->smgr_rnode; + for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++) + { + elog(SmgrTrace, "forgetting cached relsize for %u/%u/%u.%u", + rnode.node.spcNode, + rnode.node.dbNode, + rnode.node.relNode, + forknum); + + forget_cached_relsize(rnode.node, forknum); + mdclose(reln, forknum); + /* use isRedo == true, so that we drop it immediately */ + mdunlink(rnode, forknum, true); + } + } + + unlogged_build_rel = NULL; + unlogged_build_phase =
UNLOGGED_BUILD_NOT_IN_PROGRESS; +} + +static void +AtEOXact_zenith(XactEvent event, void *arg) +{ + switch (event) + { + case XACT_EVENT_ABORT: + case XACT_EVENT_PARALLEL_ABORT: + + /* + * Forget about any build we might have had in progress. The local + * file will be unlinked by smgrDoPendingDeletes() + */ + unlogged_build_rel = NULL; + unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; + break; + + case XACT_EVENT_COMMIT: + case XACT_EVENT_PARALLEL_COMMIT: + case XACT_EVENT_PREPARE: + case XACT_EVENT_PRE_COMMIT: + case XACT_EVENT_PARALLEL_PRE_COMMIT: + case XACT_EVENT_PRE_PREPARE: + if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS) + { + unlogged_build_rel = NULL; + unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + (errmsg("unlogged index build was not properly finished")))); + } + break; + } +} + +static const struct f_smgr zenith_smgr = +{ + .smgr_init = zenith_init, + .smgr_shutdown = NULL, + .smgr_open = zenith_open, + .smgr_close = zenith_close, + .smgr_create = zenith_create, + .smgr_exists = zenith_exists, + .smgr_unlink = zenith_unlink, + .smgr_extend = zenith_extend, + .smgr_prefetch = zenith_prefetch, + .smgr_read = zenith_read, + .smgr_write = zenith_write, + .smgr_writeback = zenith_writeback, + .smgr_nblocks = zenith_nblocks, + .smgr_truncate = zenith_truncate, + .smgr_immedsync = zenith_immedsync, + + .smgr_start_unlogged_build = zenith_start_unlogged_build, + .smgr_finish_unlogged_build_phase_1 = zenith_finish_unlogged_build_phase_1, + .smgr_end_unlogged_build = zenith_end_unlogged_build, +}; + + +const f_smgr * +smgr_zenith(BackendId backend, RelFileNode rnode) +{ + + /* Don't use page server for temp relations */ + if (backend != InvalidBackendId) + return smgr_standard(backend, rnode); + else + return &zenith_smgr; +} + +void +smgr_init_zenith(void) +{ + RegisterXactCallback(AtEOXact_zenith, NULL); + + smgr_init_standard(); + zenith_init(); +} diff --git 
a/contrib/neon/relsize_cache.c b/contrib/neon/relsize_cache.c new file mode 100644 index 00000000000..0ff221be22c --- /dev/null +++ b/contrib/neon/relsize_cache.c @@ -0,0 +1,186 @@ +/*------------------------------------------------------------------------- + * + * relsize_cache.c + * Relation size cache for better zenith performance. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * contrib/neon/relsize_cache.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "pagestore_client.h" +#include "storage/relfilenode.h" +#include "storage/smgr.h" +#include "storage/lwlock.h" +#include "storage/ipc.h" +#include "storage/shmem.h" +#include "catalog/pg_tablespace_d.h" +#include "utils/dynahash.h" +#include "utils/guc.h" +#include "miscadmin.h" + + +typedef struct +{ + RelFileNode rnode; + ForkNumber forknum; +} RelTag; + +typedef struct +{ + RelTag tag; + BlockNumber size; +} RelSizeEntry; + +static HTAB *relsize_hash; +static LWLockId relsize_lock; +static int relsize_hash_size; +static shmem_startup_hook_type prev_shmem_startup_hook = NULL; +static shmem_request_hook_type prev_shmem_request_hook = NULL; + +/* + * Size of a cache entry is 20 bytes. So this default will take about 1.2 MB, + * which seems reasonable.
+ */ +#define DEFAULT_RELSIZE_HASH_SIZE (64 * 1024) + +static void +zenith_smgr_shmem_startup(void) +{ + static HASHCTL info; + + if (prev_shmem_startup_hook) + prev_shmem_startup_hook(); + + LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); + relsize_lock = (LWLockId) GetNamedLWLockTranche("neon_relsize"); + info.keysize = sizeof(RelTag); + info.entrysize = sizeof(RelSizeEntry); + relsize_hash = ShmemInitHash("neon_relsize", + relsize_hash_size, relsize_hash_size, + &info, + HASH_ELEM | HASH_BLOBS); + LWLockRelease(AddinShmemInitLock); +} + +bool +get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber *size) +{ + bool found = false; + + if (relsize_hash_size > 0) + { + RelTag tag; + RelSizeEntry *entry; + + tag.rnode = rnode; + tag.forknum = forknum; + LWLockAcquire(relsize_lock, LW_SHARED); + entry = hash_search(relsize_hash, &tag, HASH_FIND, NULL); + if (entry != NULL) + { + *size = entry->size; + found = true; + } + LWLockRelease(relsize_lock); + } + return found; +} + +void +set_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size) +{ + if (relsize_hash_size > 0) + { + RelTag tag; + RelSizeEntry *entry; + + tag.rnode = rnode; + tag.forknum = forknum; + LWLockAcquire(relsize_lock, LW_EXCLUSIVE); + entry = hash_search(relsize_hash, &tag, HASH_ENTER, NULL); + entry->size = size; + LWLockRelease(relsize_lock); + } +} + +void +update_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size) +{ + if (relsize_hash_size > 0) + { + RelTag tag; + RelSizeEntry *entry; + bool found; + + tag.rnode = rnode; + tag.forknum = forknum; + LWLockAcquire(relsize_lock, LW_EXCLUSIVE); + entry = hash_search(relsize_hash, &tag, HASH_ENTER, &found); + if (!found || entry->size < size) + entry->size = size; + LWLockRelease(relsize_lock); + } +} + +void +forget_cached_relsize(RelFileNode rnode, ForkNumber forknum) +{ + if (relsize_hash_size > 0) + { + RelTag tag; + + tag.rnode = rnode; + tag.forknum = forknum; + LWLockAcquire(relsize_lock, 
LW_EXCLUSIVE); + hash_search(relsize_hash, &tag, HASH_REMOVE, NULL); + LWLockRelease(relsize_lock); + } +} + +static void relsize_shmem_request(void); + +void +relsize_hash_init(void) +{ + DefineCustomIntVariable("neon.relsize_hash_size", + "Sets the maximum number of cached relation sizes for neon", + NULL, + &relsize_hash_size, + DEFAULT_RELSIZE_HASH_SIZE, + 0, + INT_MAX, + PGC_POSTMASTER, + 0, + NULL, NULL, NULL); + + if (relsize_hash_size > 0) + { + shmem_request_hook = relsize_shmem_request; + + prev_shmem_startup_hook = shmem_startup_hook; + shmem_startup_hook = zenith_smgr_shmem_startup; + } +} + + + +/* + * shmem_request hook: request additional shared resources. We'll allocate or + * attach to the shared resources in pgss_shmem_startup(). + */ +static void +relsize_shmem_request(void) +{ + if (prev_shmem_request_hook) + prev_shmem_request_hook(); + + RequestAddinShmemSpace(hash_estimate_size(relsize_hash_size, sizeof(RelSizeEntry))); + RequestNamedLWLockTranche("neon_relsize", 1); +} diff --git a/contrib/neon_test_utils/Makefile b/contrib/neon_test_utils/Makefile new file mode 100644 index 00000000000..bd618e6d96e --- /dev/null +++ b/contrib/neon_test_utils/Makefile @@ -0,0 +1,25 @@ +# contrib/neon_test_utils/Makefile + + +MODULE_big = neon_test_utils +OBJS = \ + $(WIN32RES) \ + neontest.o + +EXTENSION = neon_test_utils +DATA = neon_test_utils--1.0.sql +PGFILEDESC = "neon_test_utils - helpers for neon testing and debugging" + +EXTRA_INSTALL=contrib/neon + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +PG_CPPFLAGS = -I$(top_srcdir)/contrib +subdir = contrib/neon_test_utils +top_builddir = ../.. 
+include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/contrib/neon_test_utils/neon_test_utils--1.0.sql b/contrib/neon_test_utils/neon_test_utils--1.0.sql new file mode 100644 index 00000000000..402981a9a66 --- /dev/null +++ b/contrib/neon_test_utils/neon_test_utils--1.0.sql @@ -0,0 +1,29 @@ +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION neon_test_utils" to load this file. \quit + +CREATE FUNCTION test_consume_xids(nxids int) +RETURNS VOID +AS 'MODULE_PATHNAME', 'test_consume_xids' +LANGUAGE C STRICT +PARALLEL UNSAFE; + +CREATE FUNCTION clear_buffer_cache() +RETURNS VOID +AS 'MODULE_PATHNAME', 'clear_buffer_cache' +LANGUAGE C STRICT +PARALLEL UNSAFE; + +CREATE FUNCTION get_raw_page_at_lsn(relname text, forkname text, blocknum int8, lsn pg_lsn) +RETURNS bytea +AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn' +LANGUAGE C PARALLEL UNSAFE; + +CREATE FUNCTION get_raw_page_at_lsn(tbspc oid, db oid, relfilenode oid, forknum int8, blocknum int8, lsn pg_lsn) +RETURNS bytea +AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn_ex' +LANGUAGE C PARALLEL UNSAFE; + +CREATE FUNCTION neon_xlogflush(lsn pg_lsn) +RETURNS VOID +AS 'MODULE_PATHNAME', 'neon_xlogflush' +LANGUAGE C PARALLEL UNSAFE; diff --git a/contrib/neon_test_utils/neon_test_utils.control b/contrib/neon_test_utils/neon_test_utils.control new file mode 100644 index 00000000000..94e67205039 --- /dev/null +++ b/contrib/neon_test_utils/neon_test_utils.control @@ -0,0 +1,5 @@ +# neon_test_utils extension +comment = 'helpers for neon testing and debugging' +default_version = '1.0' +module_pathname = '$libdir/neon_test_utils' +relocatable = true diff --git a/contrib/neon_test_utils/neontest.c b/contrib/neon_test_utils/neontest.c new file mode 100644 index 00000000000..a3e730efe27 --- /dev/null +++ b/contrib/neon_test_utils/neontest.c @@ -0,0 +1,304 @@ 
+/*------------------------------------------------------------------------- + * + * neontest.c + * Helpers for neon testing and debugging + * + * IDENTIFICATION + * contrib/neon_test_utils/neontest.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/relation.h" +#include "access/xact.h" +#include "access/xlog.h" +#include "catalog/namespace.h" +#include "fmgr.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "storage/buf_internals.h" +#include "storage/bufmgr.h" +#include "utils/builtins.h" +#include "utils/pg_lsn.h" +#include "utils/rel.h" +#include "utils/varlena.h" +#include "neon/pagestore_client.h" + +PG_MODULE_MAGIC; + +extern void _PG_init(void); + +PG_FUNCTION_INFO_V1(test_consume_xids); +PG_FUNCTION_INFO_V1(clear_buffer_cache); +PG_FUNCTION_INFO_V1(get_raw_page_at_lsn); +PG_FUNCTION_INFO_V1(get_raw_page_at_lsn_ex); +PG_FUNCTION_INFO_V1(neon_xlogflush); + +/* + * Linkage to functions in zenith module. + * The signature here would need to be updated whenever function parameters change in pagestore_smgr.c + */ +typedef void (*zenith_read_at_lsn_type)(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, + XLogRecPtr request_lsn, bool request_latest, char *buffer); + +static zenith_read_at_lsn_type zenith_read_at_lsn_ptr; + +/* + * Module initialize function: fetch function pointers for cross-module calls. + */ +void +_PG_init(void) +{ + /* Asserts verify that typedefs above match original declarations */ + AssertVariableIsOfType(&zenith_read_at_lsn, zenith_read_at_lsn_type); + zenith_read_at_lsn_ptr = (zenith_read_at_lsn_type) + load_external_function("$libdir/neon", "zenith_read_at_lsn", + true, NULL); +} + +#define zenith_read_at_lsn zenith_read_at_lsn_ptr + +/* + * test_consume_xids(int4), for rapidly consuming XIDs, to test wraparound. 
+ */ +Datum +test_consume_xids(PG_FUNCTION_ARGS) +{ + int32 nxids = PG_GETARG_INT32(0); + TransactionId topxid; + FullTransactionId fullxid; + TransactionId xid; + TransactionId targetxid; + + /* make sure we have a top-XID first */ + topxid = GetTopTransactionId(); + + xid = ReadNextTransactionId(); + + targetxid = xid + nxids; + while (targetxid < FirstNormalTransactionId) + targetxid++; + + while (TransactionIdPrecedes(xid, targetxid)) + { + fullxid = GetNewTransactionId(true); + xid = XidFromFullTransactionId(fullxid); + elog(DEBUG1, "topxid: %u xid: %u", topxid, xid); + } + + PG_RETURN_VOID(); +} + +/* + * Flush the buffer cache, evicting all pages that are not currently pinned. + */ +Datum +clear_buffer_cache(PG_FUNCTION_ARGS) +{ + bool save_zenith_test_evict; + + /* + * Temporarily set the zenith_test_evict GUC, so that when we pin and + * unpin a buffer, the buffer is evicted. We use that hack to evict all + * buffers, as there is no explicit "evict this buffer" function in the + * buffer manager. + */ + save_zenith_test_evict = zenith_test_evict; + zenith_test_evict = true; + PG_TRY(); + { + /* Scan through all the buffers */ + for (int i = 0; i < NBuffers; i++) + { + BufferDesc *bufHdr; + uint32 buf_state; + Buffer bufferid; + bool isvalid; + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blocknum; + + /* Peek into the buffer header to see what page it holds. */ + bufHdr = GetBufferDescriptor(i); + buf_state = LockBufHdr(bufHdr); + + if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID)) + isvalid = true; + else + isvalid = false; + bufferid = BufferDescriptorGetBuffer(bufHdr); + rnode = bufHdr->tag.rnode; + forknum = bufHdr->tag.forkNum; + blocknum = bufHdr->tag.blockNum; + + UnlockBufHdr(bufHdr, buf_state); + + /* + * Pin the buffer, and release it again. Because we have + * zenith_test_evict==true, this will evict the page from + * the buffer cache if no one else is holding a pin on it. 
+ */ + if (isvalid) + { + if (ReadRecentBuffer(rnode, forknum, blocknum, bufferid)) + ReleaseBuffer(bufferid); + } + } + } + PG_FINALLY(); + { + /* restore the GUC */ + zenith_test_evict = save_zenith_test_evict; + } + PG_END_TRY(); + + PG_RETURN_VOID(); +} + + +/* + * Reads the page from page server without buffer cache + * usage mimics get_raw_page() in pageinspect, but offers reading versions at specific LSN + * NULL read lsn will result in reading the latest version. + * + * Note: reading latest version will result in waiting for latest changes to reach the page server, + * if this is undesirable, use pageinspect' get_raw_page that uses buffered access to the latest page + */ +Datum +get_raw_page_at_lsn(PG_FUNCTION_ARGS) +{ + bytea *raw_page; + ForkNumber forknum; + RangeVar *relrv; + Relation rel; + char *raw_page_data; + text *relname; + text *forkname; + uint32 blkno; + + bool request_latest = PG_ARGISNULL(3); + uint64 read_lsn = request_latest ? GetXLogInsertRecPtr() : PG_GETARG_INT64(3); + + if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2)) + PG_RETURN_NULL(); + + relname = PG_GETARG_TEXT_PP(0); + forkname = PG_GETARG_TEXT_PP(1); + blkno = PG_GETARG_UINT32(2); + + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be superuser to use raw page functions"))); + + relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname)); + rel = relation_openrv(relrv, AccessShareLock); + + /* Check that this relation has storage */ + if (rel->rd_rel->relkind == RELKIND_VIEW) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot get raw page from view \"%s\"", + RelationGetRelationName(rel)))); + if (rel->rd_rel->relkind == RELKIND_COMPOSITE_TYPE) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot get raw page from composite type \"%s\"", + RelationGetRelationName(rel)))); + if (rel->rd_rel->relkind == RELKIND_FOREIGN_TABLE) + ereport(ERROR, + 
(errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot get raw page from foreign table \"%s\"", + RelationGetRelationName(rel)))); + if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot get raw page from partitioned table \"%s\"", + RelationGetRelationName(rel)))); + if (rel->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot get raw page from partitioned index \"%s\"", + RelationGetRelationName(rel)))); + + /* + * Reject attempts to read non-local temporary relations; we would be + * likely to get wrong data since we have no visibility into the owning + * session's local buffers. + */ + if (RELATION_IS_OTHER_TEMP(rel)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot access temporary tables of other sessions"))); + + + forknum = forkname_to_number(text_to_cstring(forkname)); + + /* Initialize buffer to copy to */ + raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ); + SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); + raw_page_data = VARDATA(raw_page); + + zenith_read_at_lsn(rel->rd_node, forknum, blkno, read_lsn, request_latest, raw_page_data); + + relation_close(rel, AccessShareLock); + + PG_RETURN_BYTEA_P(raw_page); +} + +/* + * Another option to read a relation page from page server without cache + * this version doesn't validate input and allows reading blocks of dropped relations + * + * Note: reading latest version will result in waiting for latest changes to reach the page server, + * if this is undesirable, use pageinspect' get_raw_page that uses buffered access to the latest page + */ +Datum +get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS) +{ + char *raw_page_data; + + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be superuser to use raw page functions"))); + + if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2) || + PG_ARGISNULL(3) || PG_ARGISNULL(4)) 
+ PG_RETURN_NULL(); + + { + RelFileNode rnode = { + .spcNode = PG_GETARG_OID(0), + .dbNode = PG_GETARG_OID(1), + .relNode = PG_GETARG_OID(2) + }; + + ForkNumber forknum = PG_GETARG_UINT32(3); + + uint32 blkno = PG_GETARG_UINT32(4); + bool request_latest = PG_ARGISNULL(5); + uint64 read_lsn = request_latest ? GetXLogInsertRecPtr() : PG_GETARG_INT64(5); + + + /* Initialize buffer to copy to */ + bytea *raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ); + SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); + raw_page_data = VARDATA(raw_page); + + zenith_read_at_lsn(rnode, forknum, blkno, read_lsn, request_latest, raw_page_data); + PG_RETURN_BYTEA_P(raw_page); + } +} + +/* + * Directly calls XLogFlush(lsn) to flush WAL buffers. + */ +Datum +neon_xlogflush(PG_FUNCTION_ARGS) +{ + XLogRecPtr lsn = PG_GETARG_LSN(0); + XLogFlush(lsn); + PG_RETURN_VOID(); +} diff --git a/src/Makefile b/src/Makefile index 79e274a4769..2f32e3d5137 100644 --- a/src/Makefile +++ b/src/Makefile @@ -22,6 +22,7 @@ SUBDIRS = \ include \ interfaces \ backend/replication/libpqwalreceiver \ + backend/replication/libpqwalproposer \ backend/replication/pgoutput \ fe_utils \ bin \ diff --git a/src/Makefile.global.in b/src/Makefile.global.in index 7d5e08c667d..738cb15cfb4 100644 --- a/src/Makefile.global.in +++ b/src/Makefile.global.in @@ -186,6 +186,7 @@ with_tcl = @with_tcl@ with_ssl = @with_ssl@ with_readline = @with_readline@ with_selinux = @with_selinux@ +with_libseccomp = @with_libseccomp@ with_systemd = @with_systemd@ with_gssapi = @with_gssapi@ with_krb_srvnam = @with_krb_srvnam@ diff --git a/src/backend/access/brin/brin_xlog.c b/src/backend/access/brin/brin_xlog.c index af6949882a1..9d55d783488 100644 --- a/src/backend/access/brin/brin_xlog.c +++ b/src/backend/access/brin/brin_xlog.c @@ -69,7 +69,8 @@ brin_xlog_insert_update(XLogReaderState *record, } /* need this page's blkno to store in revmap */ - regpgno = BufferGetBlockNumber(buffer); + //ZENITH XXX Don't use BufferGetBlockNumber because wal-redo doesn't 
pin buffer. + XLogRecGetBlockTag(record, 0, NULL, NULL, &regpgno); /* insert the index item into the page */ if (action == BLK_NEEDS_REDO) diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index ea1c4184fbf..37804a7852d 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -335,6 +335,8 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) elog(ERROR, "index \"%s\" already contains data", RelationGetRelationName(index)); + smgr_start_unlogged_build(index->rd_smgr); + initGinState(&buildstate.ginstate, index); buildstate.indtuples = 0; memset(&buildstate.buildStats, 0, sizeof(GinStatsData)); @@ -408,6 +410,8 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) buildstate.buildStats.nTotalPages = RelationGetNumberOfBlocks(index); ginUpdateStats(index, &buildstate.buildStats, true); + smgr_finish_unlogged_build_phase_1(index->rd_smgr); + /* * We didn't write WAL records as we built the index, so if WAL-logging is * required, write all pages to the WAL now. 
@@ -418,6 +422,9 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) 0, RelationGetNumberOfBlocks(index), true); } + SetLastWrittenPageLSN(XactLastRecEnd); + + smgr_end_unlogged_build(index->rd_smgr); /* * Return statistics diff --git a/src/backend/access/gin/ginxlog.c b/src/backend/access/gin/ginxlog.c index 87e8366642f..d240eb829aa 100644 --- a/src/backend/access/gin/ginxlog.c +++ b/src/backend/access/gin/ginxlog.c @@ -407,6 +407,7 @@ ginRedoSplit(XLogReaderState *record) rootbuf; bool isLeaf = (data->flags & GIN_INSERT_ISLEAF) != 0; bool isRoot = (data->flags & GIN_SPLIT_ROOT) != 0; + XLogRedoAction action; /* * First clear incomplete-split flag on child page if this finishes a @@ -415,21 +416,27 @@ ginRedoSplit(XLogReaderState *record) if (!isLeaf) ginRedoClearIncompleteSplit(record, 3); - if (XLogReadBufferForRedo(record, 0, &lbuffer) != BLK_RESTORED) + action = XLogReadBufferForRedo(record, 0, &lbuffer); + if (action != BLK_RESTORED && action != BLK_DONE) elog(ERROR, "GIN split record did not contain a full-page image of left page"); - if (XLogReadBufferForRedo(record, 1, &rbuffer) != BLK_RESTORED) + action = XLogReadBufferForRedo(record, 1, &rbuffer); + if (action != BLK_RESTORED && action != BLK_DONE) elog(ERROR, "GIN split record did not contain a full-page image of right page"); if (isRoot) { - if (XLogReadBufferForRedo(record, 2, &rootbuf) != BLK_RESTORED) + action = XLogReadBufferForRedo(record, 2, &rootbuf); + if (action != BLK_RESTORED && action != BLK_DONE) elog(ERROR, "GIN split record did not contain a full-page image of root page"); - UnlockReleaseBuffer(rootbuf); + if (rootbuf != InvalidBuffer) + UnlockReleaseBuffer(rootbuf); } - UnlockReleaseBuffer(rbuffer); - UnlockReleaseBuffer(lbuffer); + if (rbuffer != InvalidBuffer) + UnlockReleaseBuffer(rbuffer); + if (lbuffer != InvalidBuffer) + UnlockReleaseBuffer(lbuffer); } /* diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c index 
be9b91be859..6afc711f8fe 100644 --- a/src/backend/access/gist/gistbuild.c +++ b/src/backend/access/gist/gistbuild.c @@ -40,6 +40,7 @@ #include "access/tableam.h" #include "access/xloginsert.h" #include "catalog/index.h" +#include "catalog/storage.h" #include "miscadmin.h" #include "optimizer/optimizer.h" #include "storage/bufmgr.h" @@ -296,6 +297,8 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) Buffer buffer; Page page; + smgr_start_unlogged_build(index->rd_smgr); + /* initialize the root page */ buffer = gistNewBuffer(index); Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO); @@ -328,6 +331,8 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) gistFreeBuildBuffers(buildstate.gfbb); } + smgr_finish_unlogged_build_phase_1(index->rd_smgr); + /* * We didn't write WAL records as we built the index, so if * WAL-logging is required, write all pages to the WAL now. @@ -338,6 +343,9 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) 0, RelationGetNumberOfBlocks(index), true); } + SetLastWrittenPageLSN(XactLastRecEnd); + + smgr_end_unlogged_build(index->rd_smgr); } /* okay, all heap tuples are indexed */ @@ -462,8 +470,13 @@ gist_indexsortbuild(GISTBuildState *state) smgrwrite(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, GIST_ROOT_BLKNO, levelstate->pages[0], true); if (RelationNeedsWAL(state->indexrel)) - log_newpage(&state->indexrel->rd_node, MAIN_FORKNUM, GIST_ROOT_BLKNO, - levelstate->pages[0], true); + { + XLogRecPtr lsn; + + lsn = log_newpage(&state->indexrel->rd_node, MAIN_FORKNUM, GIST_ROOT_BLKNO, + levelstate->pages[0], true); + SetLastWrittenPageLSN(lsn); + } pfree(levelstate->pages[0]); pfree(levelstate); diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index c74fbd01049..c1ebffced89 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -2155,6 +2155,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, xlhdr.t_infomask2 = 
heaptup->t_data->t_infomask2; xlhdr.t_infomask = heaptup->t_data->t_infomask; xlhdr.t_hoff = heaptup->t_data->t_hoff; + xlhdr.t_cid = HeapTupleHeaderGetRawCommandId(heaptup->t_data); /* * note we mark xlhdr as belonging to buffer; if XLogInsert decides to @@ -2473,6 +2474,7 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, tuphdr->t_infomask2 = heaptup->t_data->t_infomask2; tuphdr->t_infomask = heaptup->t_data->t_infomask; + tuphdr->t_cid = HeapTupleHeaderGetRawCommandId(heaptup->t_data); tuphdr->t_hoff = heaptup->t_data->t_hoff; /* write bitmap [+ padding] [+ oid] + data */ @@ -2985,7 +2987,7 @@ heap_delete(Relation relation, ItemPointer tid, tp.t_data->t_infomask2); xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self); xlrec.xmax = new_xmax; - + xlrec.t_cid = HeapTupleHeaderGetRawCommandId(tp.t_data); if (old_key_tuple != NULL) { if (relation->rd_rel->relreplident == REPLICA_IDENTITY_FULL) @@ -3006,6 +3008,7 @@ heap_delete(Relation relation, ItemPointer tid, { xlhdr.t_infomask2 = old_key_tuple->t_data->t_infomask2; xlhdr.t_infomask = old_key_tuple->t_data->t_infomask; + xlhdr.t_cid = HeapTupleHeaderGetRawCommandId(old_key_tuple->t_data); xlhdr.t_hoff = old_key_tuple->t_data->t_hoff; XLogRegisterData((char *) &xlhdr, SizeOfHeapHeader); @@ -3713,6 +3716,7 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, oldtup.t_data->t_infomask2); xlrec.flags = cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0; + xlrec.t_cid = HeapTupleHeaderGetRawCommandId(oldtup.t_data); XLogRegisterData((char *) &xlrec, SizeOfHeapLock); recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK); PageSetLSN(page, recptr); @@ -4900,6 +4904,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, xlrec.infobits_set = compute_infobits(new_infomask, tuple->t_data->t_infomask2); xlrec.flags = cleared_all_frozen ? 
XLH_LOCK_ALL_FROZEN_CLEARED : 0; + xlrec.t_cid = HeapTupleHeaderGetRawCommandId(tuple->t_data); XLogRegisterData((char *) &xlrec, SizeOfHeapLock); /* we don't decode row locks atm, so no need to log the origin */ @@ -5949,6 +5954,7 @@ heap_abort_speculative(Relation relation, ItemPointer tid) tp.t_data->t_infomask2); xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self); xlrec.xmax = xid; + xlrec.t_cid = HeapTupleHeaderGetRawCommandId(tp.t_data); XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfHeapDelete); @@ -8350,7 +8356,7 @@ log_heap_update(Relation reln, Buffer oldbuf, /* Prepare WAL data for the new page */ xlrec.new_offnum = ItemPointerGetOffsetNumber(&newtup->t_self); xlrec.new_xmax = HeapTupleHeaderGetRawXmax(newtup->t_data); - + xlrec.t_cid = HeapTupleHeaderGetRawCommandId(newtup->t_data); bufflags = REGBUF_STANDARD; if (init) bufflags |= REGBUF_WILL_INIT; @@ -8387,6 +8393,7 @@ log_heap_update(Relation reln, Buffer oldbuf, xlhdr.t_infomask2 = newtup->t_data->t_infomask2; xlhdr.t_infomask = newtup->t_data->t_infomask; xlhdr.t_hoff = newtup->t_data->t_hoff; + xlhdr.t_cid = HeapTupleHeaderGetRawCommandId(newtup->t_data); Assert(SizeofHeapTupleHeader + prefixlen + suffixlen <= newtup->t_len); /* @@ -8428,6 +8435,7 @@ log_heap_update(Relation reln, Buffer oldbuf, xlhdr_idx.t_infomask2 = old_key_tuple->t_data->t_infomask2; xlhdr_idx.t_infomask = old_key_tuple->t_data->t_infomask; xlhdr_idx.t_hoff = old_key_tuple->t_data->t_hoff; + xlhdr_idx.t_cid = HeapTupleHeaderGetRawCommandId(old_key_tuple->t_data); XLogRegisterData((char *) &xlhdr_idx, SizeOfHeapHeader); @@ -9060,7 +9068,7 @@ heap_xlog_delete(XLogReaderState *record) HeapTupleHeaderSetXmax(htup, xlrec->xmax); else HeapTupleHeaderSetXmin(htup, InvalidTransactionId); - HeapTupleHeaderSetCmax(htup, FirstCommandId, false); + HeapTupleHeaderSetCmax(htup, xlrec->t_cid, false); /* Mark the page as a candidate for pruning */ PageSetPrunable(page, XLogRecGetXid(record)); @@ -9103,7 +9111,7 @@ 
heap_xlog_insert(XLogReaderState *record) XLogRecGetBlockTag(record, 0, &target_node, NULL, &blkno); ItemPointerSetBlockNumber(&target_tid, blkno); - ItemPointerSetOffsetNumber(&target_tid, xlrec->offnum); + ItemPointerSetOffsetNumber(&target_tid, (xlrec->flags & XLH_INSERT_IS_SPECULATIVE) ? SpecTokenOffsetNumber : xlrec->offnum); /* * The visibility map may need to be fixed even if the heap page is @@ -9161,7 +9169,7 @@ heap_xlog_insert(XLogReaderState *record) htup->t_infomask = xlhdr.t_infomask; htup->t_hoff = xlhdr.t_hoff; HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); - HeapTupleHeaderSetCmin(htup, FirstCommandId); + HeapTupleHeaderSetCmin(htup, xlhdr.t_cid); htup->t_ctid = target_tid; if (PageAddItem(page, (Item) htup, newlen, xlrec->offnum, @@ -9304,7 +9312,7 @@ heap_xlog_multi_insert(XLogReaderState *record) htup->t_infomask = xlhdr->t_infomask; htup->t_hoff = xlhdr->t_hoff; HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); - HeapTupleHeaderSetCmin(htup, FirstCommandId); + HeapTupleHeaderSetCmin(htup, xlhdr->t_cid); ItemPointerSetBlockNumber(&htup->t_ctid, blkno); ItemPointerSetOffsetNumber(&htup->t_ctid, offnum); @@ -9444,7 +9452,7 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) fix_infomask_from_infobits(xlrec->old_infobits_set, &htup->t_infomask, &htup->t_infomask2); HeapTupleHeaderSetXmax(htup, xlrec->old_xmax); - HeapTupleHeaderSetCmax(htup, FirstCommandId, false); + HeapTupleHeaderSetCmax(htup, xlrec->t_cid, false); /* Set forward chain link in t_ctid */ htup->t_ctid = newtid; @@ -9577,7 +9585,7 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) htup->t_hoff = xlhdr.t_hoff; HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); - HeapTupleHeaderSetCmin(htup, FirstCommandId); + HeapTupleHeaderSetCmin(htup, xlhdr.t_cid); HeapTupleHeaderSetXmax(htup, xlrec->new_xmax); /* Make sure there is no forward chain link in t_ctid */ htup->t_ctid = newtid; @@ -9718,7 +9726,7 @@ heap_xlog_lock(XLogReaderState *record) offnum); } 
HeapTupleHeaderSetXmax(htup, xlrec->locking_xid); - HeapTupleHeaderSetCmax(htup, FirstCommandId, false); + HeapTupleHeaderSetCmax(htup, xlrec->t_cid, false); PageSetLSN(page, lsn); MarkBufferDirty(buffer); } diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 21682592478..73aacf4ba9a 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -626,7 +626,7 @@ heapam_relation_copy_data(Relation rel, const RelFileNode *newrnode) { SMgrRelation dstrel; - dstrel = smgropen(*newrnode, rel->rd_backend); + dstrel = smgropen(*newrnode, rel->rd_backend, rel->rd_rel->relpersistence); /* * Since we copy the file directly without looking at the shared buffers, diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index b802ed247e7..033f2baab3f 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -1150,7 +1150,10 @@ lazy_scan_heap(LVRelState *vacrel) else if (all_visible_according_to_vm && !PageIsAllVisible(page) && VM_ALL_VISIBLE(vacrel->rel, blkno, &vmbuffer)) { - elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u", + /* ZENITH-XXX: all visible hint is not wal-logged + * FIXME: Replay visibilitymap changes in pageserver + */ + elog(DEBUG1, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u", vacrel->relname, blkno); visibilitymap_clear(vacrel->rel, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS); diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c index e09f25a684c..5332488bbe5 100644 --- a/src/backend/access/heap/visibilitymap.c +++ b/src/backend/access/heap/visibilitymap.c @@ -655,9 +655,18 @@ vm_extend(Relation rel, BlockNumber vm_nblocks) /* Now extend the file */ while (vm_nblocks_now < vm_nblocks) { - PageSetChecksumInplace((Page) pg.data, vm_nblocks_now); - - 
smgrextend(reln, VISIBILITYMAP_FORKNUM, vm_nblocks_now, pg.data, false); + /* + * ZENITH: Initialize VM pages through buffer cache to prevent loading + * them from pageserver. + */ + Buffer buffer = ReadBufferExtended(rel, VISIBILITYMAP_FORKNUM, P_NEW, + RBM_ZERO_AND_LOCK, NULL); + Page page = BufferGetPage(buffer); + + PageInit((Page) page, BLCKSZ, 0); + PageSetChecksumInplace(page, vm_nblocks_now); + MarkBufferDirty(buffer); + UnlockReleaseBuffer(buffer); vm_nblocks_now++; } diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c index bfb74049d0c..542eb5c4d0b 100644 --- a/src/backend/access/spgist/spginsert.c +++ b/src/backend/access/spgist/spginsert.c @@ -85,6 +85,8 @@ spgbuild(Relation heap, Relation index, IndexInfo *indexInfo) elog(ERROR, "index \"%s\" already contains data", RelationGetRelationName(index)); + smgr_start_unlogged_build(index->rd_smgr); + /* * Initialize the meta page and root pages */ @@ -105,7 +107,6 @@ spgbuild(Relation heap, Relation index, IndexInfo *indexInfo) SpGistInitBuffer(nullbuffer, SPGIST_LEAF | SPGIST_NULLS); MarkBufferDirty(nullbuffer); - END_CRIT_SECTION(); UnlockReleaseBuffer(metabuffer); @@ -131,6 +132,8 @@ spgbuild(Relation heap, Relation index, IndexInfo *indexInfo) SpGistUpdateMetaPage(index); + smgr_finish_unlogged_build_phase_1(index->rd_smgr); + /* * We didn't write WAL records as we built the index, so if WAL-logging is * required, write all pages to the WAL now. 
@@ -141,6 +144,9 @@ spgbuild(Relation heap, Relation index, IndexInfo *indexInfo) 0, RelationGetNumberOfBlocks(index), true); } + SetLastWrittenPageLSN(XactLastRecEnd); + + smgr_end_unlogged_build(index->rd_smgr); result = (IndexBuildResult *) palloc0(sizeof(IndexBuildResult)); result->heap_tuples = reltuples; diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 59f94b05d4a..b6a6a982087 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -136,6 +136,7 @@ int wal_retrieve_retry_interval = 5000; int max_slot_wal_keep_size_mb = -1; int wal_decode_buffer_size = 512 * 1024; bool track_wal_io_timing = false; +uint64 predefined_sysidentifier; #ifdef WAL_DEBUG bool XLOG_DEBUG = false; @@ -551,7 +552,16 @@ typedef struct XLogCtlData * XLOG_FPW_CHANGE record that instructs full_page_writes is disabled. */ XLogRecPtr lastFpwDisableRecPtr; + XLogRecPtr lastWrittenPageLSN; + /* neon: copy of startup's RedoStartLSN for walproposer's use */ + XLogRecPtr RedoStartLSN; + + /* + * size of a timeline in zenith pageserver. + * used to enforce timeline size limit. + */ + uint64 zenithCurrentClusterSize; slock_t info_lck; /* locks shared variables shown above */ } XLogCtlData; @@ -634,6 +644,15 @@ static bool holdingAllLocks = false; static MemoryContext walDebugCxt = NULL; #endif + +/* + * Variables read from 'zenith.signal' file. + */ +bool ZenithRecoveryRequested = false; +XLogRecPtr zenithLastRec = InvalidXLogRecPtr; +bool zenithWriteOk = false; + + static void CleanupAfterArchiveRecovery(TimeLineID EndOfLogTLI, XLogRecPtr EndOfLog, TimeLineID newTLI); @@ -4523,9 +4542,16 @@ BootStrapXLOG(void) * perhaps be useful sometimes. 
*/ gettimeofday(&tv, NULL); - sysidentifier = ((uint64) tv.tv_sec) << 32; - sysidentifier |= ((uint64) tv.tv_usec) << 12; - sysidentifier |= getpid() & 0xFFF; + if (predefined_sysidentifier != 0) + { + sysidentifier = predefined_sysidentifier; + } + else + { + sysidentifier = ((uint64) tv.tv_sec) << 32; + sysidentifier |= ((uint64) tv.tv_usec) << 12; + sysidentifier |= getpid() & 0xFFF; + } /* page buffer must be aligned suitably for O_DIRECT */ buffer = (char *) palloc(XLOG_BLCKSZ + XLOG_BLCKSZ); @@ -4659,6 +4685,7 @@ BootStrapXLOG(void) ReadControlFile(); } + static char * str_time(pg_time_t tnow) { @@ -4879,6 +4906,81 @@ CheckRequiredParameterValues(void) } } +static void +readZenithSignalFile(void) +{ + int fd; + + fd = BasicOpenFile(ZENITH_SIGNAL_FILE, O_RDONLY | PG_BINARY); + if (fd >= 0) + { + struct stat statbuf; + char *content; + char prev_lsn_str[20]; + + /* Slurp the file into a string */ + if (stat(ZENITH_SIGNAL_FILE, &statbuf) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", + ZENITH_SIGNAL_FILE))); + content = palloc(statbuf.st_size + 1); + if (read(fd, content, statbuf.st_size) != statbuf.st_size) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + ZENITH_SIGNAL_FILE))); + content[statbuf.st_size] = '\0'; + + /* Parse it */ + if (sscanf(content, "PREV LSN: %19s", prev_lsn_str) != 1) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("invalid data in file \"%s\"", ZENITH_SIGNAL_FILE))); + + if (strcmp(prev_lsn_str, "invalid") == 0) + { + /* No prev LSN. Forbid starting up in read-write mode */ + zenithLastRec = InvalidXLogRecPtr; + zenithWriteOk = false; + } + else if (strcmp(prev_lsn_str, "none") == 0) + { + /* + * The page server had no valid prev LSN, but assured that it's ok + * to start without it. This happens when you start the compute + * node for the first time on a new branch. 
+ */ + zenithLastRec = InvalidXLogRecPtr; + zenithWriteOk = true; + } + else + { + uint32 hi, + lo; + + if (sscanf(prev_lsn_str, "%X/%X", &hi, &lo) != 2) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("invalid data in file \"%s\"", ZENITH_SIGNAL_FILE))); + zenithLastRec = ((uint64) hi) << 32 | lo; + + /* If prev LSN is given, it better be valid */ + if (zenithLastRec == InvalidXLogRecPtr) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("invalid prev-LSN in file \"%s\"", ZENITH_SIGNAL_FILE))); + zenithWriteOk = true; + } + ZenithRecoveryRequested = true; + close(fd); + + elog(LOG, + "[ZENITH] found 'zenith.signal' file. setting prev LSN to %X/%X", + LSN_FORMAT_ARGS(zenithLastRec)); + } +} + /* * This must be called ONCE during postmaster or standalone-backend startup */ @@ -4910,10 +5012,15 @@ StartupXLOG(void) CurrentResourceOwner == AuxProcessResourceOwner); CurrentResourceOwner = AuxProcessResourceOwner; + /* + * Read zenith.signal before anything else. + */ + readZenithSignalFile(); + /* * Check that contents look valid. */ - if (!XRecOffIsValid(ControlFile->checkPoint)) + if (!XRecOffIsValid(ControlFile->checkPoint) && !ZenithRecoveryRequested) ereport(FATAL, (errmsg("control file contains invalid checkpoint location"))); @@ -5510,6 +5617,7 @@ StartupXLOG(void) XLogCtl->LogwrtRqst.Write = EndOfLog; XLogCtl->LogwrtRqst.Flush = EndOfLog; + XLogCtl->lastWrittenPageLSN = EndOfLog; /* * Preallocate additional log files, if wanted. 
@@ -5936,6 +6044,71 @@ GetInsertRecPtr(void) return recptr; } +/* + * GetLastWrittenPageLSN -- Returns maximal LSN of written page + */ +XLogRecPtr +GetLastWrittenPageLSN(void) +{ + XLogRecPtr lsn; + SpinLockAcquire(&XLogCtl->info_lck); + lsn = XLogCtl->lastWrittenPageLSN; + SpinLockRelease(&XLogCtl->info_lck); + + return lsn; +} + +/* + * SetLastWrittenPageLSN -- Set maximal LSN of written page + */ +void +SetLastWrittenPageLSN(XLogRecPtr lsn) +{ + SpinLockAcquire(&XLogCtl->info_lck); + if (lsn > XLogCtl->lastWrittenPageLSN) + XLogCtl->lastWrittenPageLSN = lsn; + SpinLockRelease(&XLogCtl->info_lck); +} + +void +SetRedoStartLsn(XLogRecPtr RedoStartLSN) +{ + XLogCtl->RedoStartLSN = RedoStartLSN; +} + +/* + * RedoStartLsn is set only once by startup process, locking is not required + * after its exit. + */ +XLogRecPtr +GetRedoStartLsn(void) +{ + return XLogCtl->RedoStartLSN; +} + + +uint64 +GetZenithCurrentClusterSize(void) +{ + uint64 size; + SpinLockAcquire(&XLogCtl->info_lck); + size = XLogCtl->zenithCurrentClusterSize; + SpinLockRelease(&XLogCtl->info_lck); + + return size; +} + + +void +SetZenithCurrentClusterSize(uint64 size) +{ + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->zenithCurrentClusterSize = size; + SpinLockRelease(&XLogCtl->info_lck); +} + + + /* * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL * position known to be fsync'd to disk. 
This should only be used on a @@ -5944,8 +6117,6 @@ GetInsertRecPtr(void) XLogRecPtr GetFlushRecPtr(TimeLineID *insertTLI) { - Assert(XLogCtl->SharedRecoveryState == RECOVERY_STATE_DONE); - SpinLockAcquire(&XLogCtl->info_lck); LogwrtResult = XLogCtl->LogwrtResult; SpinLockRelease(&XLogCtl->info_lck); @@ -7800,6 +7971,8 @@ xlog_redo(XLogReaderState *record) for (uint8 block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++) { Buffer buffer; + XLogRedoAction result; + if (!XLogRecHasBlockImage(record, block_id)) { @@ -7807,10 +7980,19 @@ elog(ERROR, "XLOG_FPI record did not contain a full-page image"); continue; } - - if (XLogReadBufferForRedo(record, block_id, &buffer) != BLK_RESTORED) + result = XLogReadBufferForRedo(record, block_id, &buffer); + if (result == BLK_DONE && !IsUnderPostmaster) + { + /* + * In the special WAL process, blocks that are being ignored + * return BLK_DONE. Accept that. + */ + } + else if (result != BLK_RESTORED) elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block"); - UnlockReleaseBuffer(buffer); + + if (buffer != InvalidBuffer) + UnlockReleaseBuffer(buffer); } } else if (info == XLOG_BACKUP_END) diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index 35cc0559f9d..fd36e0c5941 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -37,9 +37,11 @@ #include "miscadmin.h" #include "pg_trace.h" #include "replication/origin.h" +#include "replication/walsender.h" #include "storage/bufmgr.h" #include "storage/proc.h" #include "utils/memutils.h" +#include "utils/wait_event.h" /* * Guess the maximum buffer size required to store a compressed version of @@ -87,6 +89,11 @@ typedef struct char compressed_page[COMPRESS_BUFSIZE]; } registered_buffer; +/* GUCs */ +int max_replication_apply_lag; +int max_replication_flush_lag; +int 
max_replication_write_lag; + static registered_buffer *registered_buffers; static int max_registered_buffers; /* allocated size */ static int max_registered_block_id = 0; /* highest block_id + 1 currently @@ -142,6 +149,9 @@ static XLogRecData *XLogRecordAssemble(RmgrId rmid, uint8 info, static bool XLogCompressBackupBlock(char *page, uint16 hole_offset, uint16 hole_length, char *dest, uint16 *dlen); +/* Timeout in milliseconds for delaying WAL inserts to avoid WAL overflow */ +#define MB ((XLogRecPtr)1024*1024) + /* * Begin constructing a WAL record. This must be called before the * XLogRegister* functions and XLogInsert(). @@ -470,6 +480,11 @@ XLogInsert(RmgrId rmid, uint8 info) return EndPos; } + if (backpressure_lag() > 0) + { + InterruptPending = true; + } + do { XLogRecPtr RedoRecPtr; diff --git a/src/backend/access/transam/xlogprefetcher.c b/src/backend/access/transam/xlogprefetcher.c index b98b3192cf5..96404405b93 100644 --- a/src/backend/access/transam/xlogprefetcher.c +++ b/src/backend/access/transam/xlogprefetcher.c @@ -721,7 +721,8 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn) * same relation (with some scheme to handle invalidations * safely), but for now we'll call smgropen() every time. */ - reln = smgropen(block->rnode, InvalidBackendId); + //FIXME what relpersistence should we use here? 
+ reln = smgropen(block->rnode, InvalidBackendId, RELPERSISTENCE_PERMANENT); /* * If the relation file doesn't exist on disk, for example because diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c index 03fad82bc1b..9f95e3c1e1c 100644 --- a/src/backend/access/transam/xlogreader.c +++ b/src/backend/access/transam/xlogreader.c @@ -263,7 +263,7 @@ WALOpenSegmentInit(WALOpenSegment *seg, WALSegmentContext *segcxt, void XLogBeginRead(XLogReaderState *state, XLogRecPtr RecPtr) { - Assert(!XLogRecPtrIsInvalid(RecPtr)); + Assert(!XLogRecPtrIsInvalid(RecPtr) || state->skip_lsn_checks); ResetDecoder(state); @@ -287,6 +287,14 @@ XLogReleasePreviousRecord(XLogReaderState *state) if (!state->record) return InvalidXLogRecPtr; +#define SKIP_INVALID_RECORD(rec_ptr) do { \ + rec_ptr = MAXALIGN(rec_ptr + 1); \ + if (rec_ptr % XLOG_BLCKSZ <= MAXALIGN(1)) \ + goto restart; \ + else \ + goto skip_invalid; \ + } while (0); + /* * Remove it from the decoded record queue. It must be the oldest item * decoded, decode_queue_head. @@ -583,7 +591,7 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) * In this case, NextRecPtr should already be pointing to a valid * record starting position. 
*/ - Assert(XRecOffIsValid(RecPtr)); + Assert(XRecOffIsValid(RecPtr) || state->skip_lsn_checks); randAccess = true; } @@ -622,17 +630,23 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) } else if (targetRecOff < pageHeaderSize) { - report_invalid_record(state, "invalid record offset at %X/%X", + if(!state->skip_page_validation) + { + report_invalid_record(state, "invalid record offset at %X/%X", LSN_FORMAT_ARGS(RecPtr)); - goto err; + goto err; + } } if ((((XLogPageHeader) state->readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD) && targetRecOff == pageHeaderSize) { - report_invalid_record(state, "contrecord is requested by %X/%X", + if(!state->skip_page_validation) + { + report_invalid_record(state, "contrecord is requested by %X/%X", LSN_FORMAT_ARGS(RecPtr)); - goto err; + goto err; + } } /* ReadPageInternal has verified the page header */ @@ -647,6 +661,7 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) * cannot access any other fields until we've verified that we got the * whole header. 
*/ +skip_invalid: record = (XLogRecord *) (state->readBuf + RecPtr % XLOG_BLCKSZ); total_len = record->xl_tot_len; @@ -662,7 +677,13 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) { if (!ValidXLogRecordHeader(state, RecPtr, state->DecodeRecPtr, record, randAccess)) - goto err; + { + if(!state->skip_invalid_records) + goto err; + + SKIP_INVALID_RECORD(RecPtr); + } + gotheader = true; } else @@ -670,12 +691,19 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) /* XXX: more validation should be done here */ if (total_len < SizeOfXLogRecord) { - report_invalid_record(state, - "invalid record length at %X/%X: wanted %u, got %u", - LSN_FORMAT_ARGS(RecPtr), - (uint32) SizeOfXLogRecord, total_len); - goto err; + if(!state->skip_invalid_records) + { + report_invalid_record(state, + "invalid record length at %X/%X: wanted %u, got %u", + LSN_FORMAT_ARGS(RecPtr), + (uint32) SizeOfXLogRecord, total_len); + + goto err; + } + + SKIP_INVALID_RECORD(RecPtr); } + gotheader = false; } @@ -721,10 +749,16 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) if (total_len > state->readRecordBufSize && !allocate_recordbuf(state, total_len)) { - /* We treat this as a "bogus data" condition */ - report_invalid_record(state, "record length %u at %X/%X too long", - total_len, LSN_FORMAT_ARGS(RecPtr)); - goto err; + + if(!state->skip_invalid_records) + { + /* We treat this as a "bogus data" condition */ + report_invalid_record(state, "record length %u at %X/%X too long", + total_len, LSN_FORMAT_ARGS(RecPtr)); + goto err; + } + + SKIP_INVALID_RECORD(RecPtr); } /* Copy the first fragment of the record from the first page. 
*/ @@ -770,10 +804,15 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) /* Check that the continuation on next page looks valid */ if (!(pageHeader->xlp_info & XLP_FIRST_IS_CONTRECORD)) { - report_invalid_record(state, + if(!state->skip_invalid_records) + { + report_invalid_record(state, "there is no contrecord flag at %X/%X", LSN_FORMAT_ARGS(RecPtr)); - goto err; + goto err; + } + + SKIP_INVALID_RECORD(RecPtr); } /* @@ -783,12 +822,17 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) if (pageHeader->xlp_rem_len == 0 || total_len != (pageHeader->xlp_rem_len + gotlen)) { - report_invalid_record(state, + if(!state->skip_invalid_records) + { + report_invalid_record(state, "invalid contrecord length %u (expected %lld) at %X/%X", pageHeader->xlp_rem_len, ((long long) total_len) - gotlen, LSN_FORMAT_ARGS(RecPtr)); - goto err; + goto err; + } + + SKIP_INVALID_RECORD(RecPtr); } /* Append the continuation from this page to the buffer */ @@ -819,7 +863,13 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) record = (XLogRecord *) state->readRecordBuf; if (!ValidXLogRecordHeader(state, RecPtr, state->DecodeRecPtr, record, randAccess)) - goto err; + { + if(!state->skip_invalid_records) + goto err; + + SKIP_INVALID_RECORD(RecPtr); + } + gotheader = true; } } while (gotlen < total_len); @@ -828,7 +878,12 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) record = (XLogRecord *) state->readRecordBuf; if (!ValidXLogRecord(state, record, RecPtr)) - goto err; + { + if(!state->skip_invalid_records) + goto err; + + SKIP_INVALID_RECORD(RecPtr); + } pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) state->readBuf); state->DecodeRecPtr = RecPtr; @@ -847,7 +902,12 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) /* Record does not cross a page boundary */ if (!ValidXLogRecord(state, record, RecPtr)) - goto err; + { + if(!state->skip_invalid_records) + goto err; + + SKIP_INVALID_RECORD(RecPtr); + } 
state->NextRecPtr = RecPtr + MAXALIGN(total_len); @@ -1031,8 +1091,7 @@ ReadPageInternal(XLogReaderState *state, XLogRecPtr pageptr, int reqLen) /* we can be sure to have enough WAL available, we scrolled back */ Assert(readLen == XLOG_BLCKSZ); - if (!XLogReaderValidatePageHeader(state, targetSegmentPtr, - state->readBuf)) + if (!XLogReaderValidatePageHeader(state, targetSegmentPtr, state->readBuf) && !state->skip_page_validation) goto err; } @@ -1073,7 +1132,7 @@ ReadPageInternal(XLogReaderState *state, XLogRecPtr pageptr, int reqLen) /* * Now that we know we have the full header, validate it. */ - if (!XLogReaderValidatePageHeader(state, pageptr, (char *) hdr)) + if (!XLogReaderValidatePageHeader(state, pageptr, (char *) hdr) && !state->skip_page_validation) goto err; /* update read state information */ @@ -1132,7 +1191,7 @@ ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr, * We can't exactly verify the prev-link, but surely it should be less * than the record's own address. */ - if (!(record->xl_prev < RecPtr)) + if (!(record->xl_prev < RecPtr) && !state->skip_lsn_checks) { report_invalid_record(state, "record with incorrect prev-link %X/%X at %X/%X", @@ -1148,7 +1207,7 @@ ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr, * check guards against torn WAL pages where a stale but valid-looking * WAL record starts on a sector boundary. */ - if (record->xl_prev != PrevRecPtr) + if (record->xl_prev != PrevRecPtr && !state->skip_lsn_checks) { report_invalid_record(state, "record with incorrect prev-link %X/%X at %X/%X", @@ -1291,7 +1350,7 @@ XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr, * check typically fails when an old WAL segment is recycled, and hasn't * yet been overwritten with new data yet. 
*/ - if (hdr->xlp_pageaddr != recaddr) + if (hdr->xlp_pageaddr != recaddr && !state->skip_lsn_checks) { char fname[MAXFNAMELEN]; @@ -1783,6 +1842,7 @@ DecodeXLogRecord(XLogReaderState *state, } else blk->hole_length = BLCKSZ - blk->bimg_len; + datatotal += blk->bimg_len; /* @@ -1928,6 +1988,7 @@ DecodeXLogRecord(XLogReaderState *state, /* Report the actual size we used. */ decoded->size = MAXALIGN(out - (char *) decoded); + Assert(DecodeXLogRecordRequiredSpace(record->xl_tot_len) >= decoded->size); diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index 166f7b7b793..0708a9e4c68 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -562,6 +562,9 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE) ereport(LOG, (errmsg("starting point-in-time recovery to earliest consistent point"))); + else if (ZenithRecoveryRequested) + ereport(LOG, + (errmsg("starting zenith recovery"))); else ereport(LOG, (errmsg("starting archive recovery"))); @@ -702,6 +705,33 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, /* tell the caller to delete it later */ haveBackupLabel = true; } + else if (ZenithRecoveryRequested) + { + /* + * Zenith hacks to spawn compute node without WAL. Pretend that we + * just finished reading the record that started at 'zenithLastRec' + * and ended at checkpoint.redo + */ + elog(LOG, "starting with zenith basebackup at LSN %X/%X, prev %X/%X", + LSN_FORMAT_ARGS(ControlFile->checkPointCopy.redo), + LSN_FORMAT_ARGS(zenithLastRec)); + + CheckPointLoc = zenithLastRec; + CheckPointTLI = ControlFile->checkPointCopy.ThisTimeLineID; + RedoStartLSN = ControlFile->checkPointCopy.redo; + // FIXME needs review. rebase of ff41b709abea6a9c42100a4fcb0ff434b2c846c9 + // Is it still relevant? 
+ /* make basebackup LSN available for walproposer */ + SetRedoStartLsn(RedoStartLSN); + //EndRecPtr = ControlFile->checkPointCopy.redo; + + memcpy(&checkPoint, &ControlFile->checkPointCopy, sizeof(CheckPoint)); + wasShutdown = true; + + /* Initialize expectedTLEs, like ReadRecord() does */ + expectedTLEs = readTimeLineHistory(checkPoint.ThisTimeLineID); + XLogPrefetcherBeginRead(xlogprefetcher, ControlFile->checkPointCopy.redo); + } else { /* @@ -763,6 +793,7 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, CheckPointLoc = ControlFile->checkPoint; CheckPointTLI = ControlFile->checkPointCopy.ThisTimeLineID; RedoStartLSN = ControlFile->checkPointCopy.redo; + SetRedoStartLsn(RedoStartLSN); RedoStartTLI = ControlFile->checkPointCopy.ThisTimeLineID; record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc, 1, true, CheckPointTLI); @@ -852,7 +883,7 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, (errmsg("invalid next transaction ID"))); /* sanity check */ - if (checkPoint.redo > CheckPointLoc) + if (checkPoint.redo > CheckPointLoc && !ZenithRecoveryRequested) ereport(PANIC, (errmsg("invalid redo in checkpoint record"))); @@ -1440,6 +1471,8 @@ FinishWalRecovery(void) * An important side-effect of this is to load the last page into * xlogreader. The caller uses it to initialize the WAL for writing. 
*/ + + if (!InRecovery) { lastRec = CheckPointLoc; @@ -1450,8 +1483,12 @@ FinishWalRecovery(void) lastRec = XLogRecoveryCtl->lastReplayedReadRecPtr; lastRecTLI = XLogRecoveryCtl->lastReplayedTLI; } - XLogPrefetcherBeginRead(xlogprefetcher, lastRec); - (void) ReadRecord(xlogprefetcher, PANIC, false, lastRecTLI); + + if (!ZenithRecoveryRequested) + { + XLogPrefetcherBeginRead(xlogprefetcher, lastRec); + (void) ReadRecord(xlogprefetcher, PANIC, false, lastRecTLI); + } endOfLog = xlogreader->EndRecPtr; /* @@ -1489,7 +1526,45 @@ FinishWalRecovery(void) * Copy the last partial block to the caller, for initializing the WAL * buffer for appending new WAL. */ - if (endOfLog % XLOG_BLCKSZ != 0) + /* + * When starting from a zenith base backup, we don't have WAL. Initialize + * the WAL page where we will start writing new records from scratch, + * instead. + */ + if (ZenithRecoveryRequested) + { + if (!zenithWriteOk) + { + /* + * We cannot start generating new WAL if we don't have a valid prev-LSN + * to use for the first new WAL record. (Shouldn't happen.) + */ + ereport(ERROR, + (errmsg("cannot start in read-write mode from this base backup"))); + } + else + { + int len = endOfLog % XLOG_BLCKSZ; + char *page = palloc0(len); + XLogRecPtr pageBeginPtr = endOfLog - (endOfLog % XLOG_BLCKSZ); + + XLogPageHeader xlogPageHdr = (XLogPageHeader) (page); + + xlogPageHdr->xlp_pageaddr = pageBeginPtr; + xlogPageHdr->xlp_magic = XLOG_PAGE_MAGIC; + xlogPageHdr->xlp_tli = recoveryTargetTLI; + xlogPageHdr->xlp_info = XLP_FIRST_IS_CONTRECORD; // FIXME + xlogPageHdr->xlp_rem_len = (endOfLog % XLOG_BLCKSZ) - SizeOfXLogShortPHD; + readOff = XLogSegmentOffset(pageBeginPtr, wal_segment_size); + + result->lastPageBeginPtr = pageBeginPtr; + result->lastPage = page; + elog(LOG, "Continue writing WAL at %X/%X", LSN_FORMAT_ARGS(xlogreader->EndRecPtr)); + + // FIXME: should we unlink zenith.signal? 
+ } + } + else if (endOfLog % XLOG_BLCKSZ != 0) { char *page; int len; @@ -1541,7 +1616,10 @@ ShutdownWalRecovery(void) char recoveryPath[MAXPGPATH]; /* Final update of pg_stat_recovery_prefetch. */ - XLogPrefetcherComputeStats(xlogprefetcher); + if (!ZenithRecoveryRequested) + { + XLogPrefetcherComputeStats(xlogprefetcher); + } /* Shut down xlogreader */ if (readFile >= 0) @@ -1550,7 +1628,11 @@ ShutdownWalRecovery(void) readFile = -1; } XLogReaderFree(xlogreader); - XLogPrefetcherFree(xlogprefetcher); + + if (!ZenithRecoveryRequested) + { + XLogPrefetcherFree(xlogprefetcher); + } if (ArchiveRecoveryRequested) { diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index 702c8c14e12..53ac78b4f69 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -33,6 +33,8 @@ #include "utils/rel.h" +bool (*redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id); + /* GUC variable */ bool ignore_invalid_pages = false; @@ -372,6 +374,22 @@ XLogReadBufferForRedoExtended(XLogReaderState *record, block_id); } + if (redo_read_buffer_filter && redo_read_buffer_filter(record, block_id)) + { + if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) + { + //FIXME assume relpersistence permanent. Is it always true? + *buf = ReadBufferWithoutRelcache(rnode, forknum, + blkno, mode, NULL, true); + return BLK_DONE; + } + else + { + *buf = InvalidBuffer; + return BLK_DONE; + } + } + /* * Make sure that if the block is marked with WILL_INIT, the caller is * going to initialize it. And vice versa. @@ -490,7 +508,7 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, } /* Open the relation at smgr level */ - smgr = smgropen(rnode, InvalidBackendId); + smgr = smgropen(rnode, InvalidBackendId, RELPERSISTENCE_PERMANENT); /* * Create the target file if it doesn't already exist. 
This lets us cope diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c index 48ff9483af2..149b2f408aa 100644 --- a/src/backend/bootstrap/bootstrap.c +++ b/src/backend/bootstrap/bootstrap.c @@ -47,6 +47,7 @@ uint32 bootstrap_data_checksum_version = 0; /* No checksum */ +extern uint64 predefined_sysidentifier; static void CheckerModeMain(void); static void bootstrap_signals(void); @@ -221,7 +222,7 @@ BootstrapModeMain(int argc, char *argv[], bool check_only) argv++; argc--; - while ((flag = getopt(argc, argv, "B:c:d:D:Fkr:X:-:")) != -1) + while ((flag = getopt(argc, argv, "B:c:d:D:Fkr:s:X:-:")) != -1) { switch (flag) { @@ -265,6 +266,16 @@ BootstrapModeMain(int argc, char *argv[], bool check_only) PGC_S_DYNAMIC_DEFAULT); } break; + case 's': + { + char* endptr; +#ifdef HAVE_STRTOULL + predefined_sysidentifier = strtoull(optarg, &endptr, 10); +#else + predefined_sysidentifier = strtoul(optarg, &endptr, 10); +#endif + break; + } case 'c': case '-': { diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c index c06e414a38f..d73854cdd44 100644 --- a/src/backend/catalog/storage.c +++ b/src/backend/catalog/storage.c @@ -145,7 +145,7 @@ RelationCreateStorage(RelFileNode rnode, char relpersistence, return NULL; /* placate compiler */ } - srel = smgropen(rnode, backend); + srel = smgropen(rnode, backend, relpersistence); smgrcreate(srel, MAIN_FORKNUM, false); if (needs_wal) @@ -677,7 +677,7 @@ smgrDoPendingDeletes(bool isCommit) { SMgrRelation srel; - srel = smgropen(pending->relnode, pending->backend); + srel = smgropen(pending->relnode, pending->backend, 0); /* allocate the initial array, or extend it, if needed */ if (maxrels == 0) @@ -758,7 +758,7 @@ smgrDoPendingSyncs(bool isCommit, bool isParallelWorker) BlockNumber total_blocks = 0; SMgrRelation srel; - srel = smgropen(pendingsync->rnode, InvalidBackendId); + srel = smgropen(pendingsync->rnode, InvalidBackendId, 0); /* * We emit newpage WAL records for smaller 
relations. @@ -967,7 +967,7 @@ smgr_redo(XLogReaderState *record) xl_smgr_create *xlrec = (xl_smgr_create *) XLogRecGetData(record); SMgrRelation reln; - reln = smgropen(xlrec->rnode, InvalidBackendId); + reln = smgropen(xlrec->rnode, InvalidBackendId, RELPERSISTENCE_PERMANENT); smgrcreate(reln, xlrec->forkNum, true); } else if (info == XLOG_SMGR_TRUNCATE) @@ -980,7 +980,7 @@ smgr_redo(XLogReaderState *record) int nforks = 0; bool need_fsm_vacuum = false; - reln = smgropen(xlrec->rnode, InvalidBackendId); + reln = smgropen(xlrec->rnode, InvalidBackendId, RELPERSISTENCE_PERMANENT); /* * Forcibly create relation if it doesn't exist (which suggests that diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c index acaf660c68e..88e0e3a270a 100644 --- a/src/backend/commands/sequence.c +++ b/src/backend/commands/sequence.c @@ -54,7 +54,9 @@ * so we pre-log a few fetches in advance. In the event of * crash we can lose (skip over) as many values as we pre-logged. */ -#define SEQ_LOG_VALS 32 +/* Zenith XXX: to ensure sequence order of sequence in Zenith we need to WAL log each sequence update. */ +/* #define SEQ_LOG_VALS 32 */ +#define SEQ_LOG_VALS 0 /* * The "special area" of a sequence's buffer page looks like this. 
@@ -355,7 +357,7 @@ fill_seq_with_data(Relation rel, HeapTuple tuple) { SMgrRelation srel; - srel = smgropen(rel->rd_node, InvalidBackendId); + srel = smgropen(rel->rd_node, InvalidBackendId, rel->rd_rel->relpersistence); smgrcreate(srel, INIT_FORKNUM, false); log_smgrcreate(&rel->rd_node, INIT_FORKNUM); fill_seq_fork_with_data(rel, tuple, INIT_FORKNUM); diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 88b253dbcd6..6f306a5cdce 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -14760,7 +14760,7 @@ index_copy_data(Relation rel, RelFileNode newrnode) { SMgrRelation dstrel; - dstrel = smgropen(newrnode, rel->rd_backend); + dstrel = smgropen(newrnode, rel->rd_backend, rel->rd_rel->relpersistence); /* * Since we copy the file directly without looking at the shared buffers, diff --git a/src/backend/main/main.c b/src/backend/main/main.c index f8f7ebbd445..a94f7614c00 100644 --- a/src/backend/main/main.c +++ b/src/backend/main/main.c @@ -39,6 +39,7 @@ #include "common/username.h" #include "port/atomics.h" #include "postmaster/postmaster.h" +#include "replication/walproposer.h" #include "storage/spin.h" #include "tcop/tcopprot.h" #include "utils/help_config.h" @@ -198,6 +199,12 @@ main(int argc, char *argv[]) else if (argc > 1 && strcmp(argv[1], "--single") == 0) PostgresSingleUserMain(argc, argv, strdup(get_user_name_or_exit(progname))); + else if (argc > 1 && strcmp(argv[1], "--wal-redo") == 0) + WalRedoMain(argc, argv, + NULL, /* no dbname */ + strdup(get_user_name_or_exit(progname))); /* does not return */ + else if (argc > 1 && strcmp(argv[1], "--sync-safekeepers") == 0) + WalProposerSync(argc, argv); else PostmasterMain(argc, argv); /* the functions above should not return */ diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c index ea287c733df..43a13e1d652 100644 --- a/src/backend/port/sysv_shmem.c +++ b/src/backend/port/sysv_shmem.c @@ -156,6 +156,21 @@ 
InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size) } #endif + /* + * NEON: do not create shared memory segments for single user wal redo + * postgres. Many spawned instances of wal redo may exhaust kernel.shmmni + */ + if (am_wal_redo_postgres) + { + void *ptr = malloc(size); + + if (ptr == NULL) + { + ereport(FATAL, + (errmsg("could not create shared memory segment with size %zu for WAL redo process", size))); + } + return ptr; + } shmid = shmget(memKey, size, IPC_CREAT | IPC_EXCL | IPCProtection); if (shmid < 0) diff --git a/src/backend/postmaster/Makefile b/src/backend/postmaster/Makefile index 3a794e54d60..a4f49350690 100644 --- a/src/backend/postmaster/Makefile +++ b/src/backend/postmaster/Makefile @@ -27,4 +27,9 @@ OBJS = \ syslogger.o \ walwriter.o +ifeq ($(with_libseccomp),yes) +OBJS += \ + seccomp.o +endif + include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/postmaster/bgworker.c b/src/backend/postmaster/bgworker.c index 8dd7d64630c..6a644a27534 100644 --- a/src/backend/postmaster/bgworker.c +++ b/src/backend/postmaster/bgworker.c @@ -22,6 +22,7 @@ #include "postmaster/postmaster.h" #include "replication/logicallauncher.h" #include "replication/logicalworker.h" +#include "replication/walproposer.h" #include "storage/dsm.h" #include "storage/ipc.h" #include "storage/latch.h" @@ -128,6 +129,9 @@ static const struct }, { "ApplyWorkerMain", ApplyWorkerMain + }, + { + "WalProposerMain", WalProposerMain } }; diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 892d42c63ee..b00ee2184a5 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -119,6 +119,7 @@ #include "postmaster/syslogger.h" #include "replication/logicallauncher.h" #include "replication/walsender.h" +#include "replication/walproposer.h" #include "storage/fd.h" #include "storage/ipc.h" #include "storage/pg_shmem.h" @@ -1020,6 +1021,11 @@ PostmasterMain(int argc, char *argv[]) */ 
ApplyLauncherRegister(); + /* + * Start WAL proposer bgworker if wal acceptors list is not empty + */ + WalProposerRegister(); + /* * process any libraries that should be preloaded at postmaster start */ diff --git a/src/backend/postmaster/seccomp.c b/src/backend/postmaster/seccomp.c new file mode 100644 index 00000000000..3ac21b02983 --- /dev/null +++ b/src/backend/postmaster/seccomp.c @@ -0,0 +1,249 @@ +/*------------------------------------------------------------------------- + * + * seccomp.c + * Secure Computing BPF API wrapper. + * + * Pageserver delegates complex WAL decoding duties to postgres, + * which means that the latter might fall victim to carefully designed + * malicious WAL records and start doing harmful things to the system. + * To prevent this, it has been decided to limit possible interactions + * with the outside world using the Secure Computing BPF mode. + * + * We use this mode to disable all syscalls not in the allowlist. This + * approach has its pros & cons: + * + * - We have to carefully handpick and maintain the set of syscalls + * required for the WAL redo process. Core dumps help with that. + * The method of trial and error seems to work reasonably well, + * but it would be nice to find a proper way to "prove" that + * the set in question is both necessary and sufficient. + * + * - Once we enter the seccomp bpf mode, it's impossible to lift those + * restrictions (otherwise, what kind of "protection" would that be?). + * Thus, we have to either enable extra syscalls for the clean shutdown, + * or exit the process immediately via _exit() instead of proc_exit(). + * + * - Should we simply use SCMP_ACT_KILL_PROCESS, or implement a custom + * facility to deal with the forbidden syscalls? If we'd like to embed + * a startup security test, we should go with the latter; In that + * case, which one of the following options is preferable? + * + * * Catch the denied syscalls with a signal handler using SCMP_ACT_TRAP. 
+ * Provide a common signal handler with a static switch to override + * its behavior for the test case. This would undermine the whole + * purpose of such protection, so we'd have to go further and remap + * the memory backing the switch as readonly, then ban mprotect(). + * Ugly and fragile, to say the least. + * + * * Yet again, catch the denied syscalls using SCMP_ACT_TRAP. + * Provide 2 different signal handlers: one for a test case, + * another for the main processing loop. Install the first one, + * enable seccomp, perform the test, switch to the second one, + * finally ban sigaction(), presto! + * + * * Spoof the result of a syscall using SECCOMP_RET_ERRNO for the + * test, then ban it altogether with another filter. The downside + * of this solution is that we don't actually check that + * SCMP_ACT_KILL_PROCESS/SCMP_ACT_TRAP works. + * + * Either approach seems to require two eBPF filter programs, + * which is unfortunate: the man page tells this is uncommon. + * Maybe I (@funbringer) am missing something, though; I encourage + * any reader to get familiar with it and scrutinize my conclusions. + * + * TODOs and ideas in no particular order: + * + * - Do something about mmap() in musl's malloc(). + * Definitely not a priority if we don't care about musl. + * + * - See if we can untangle PG's shutdown sequence (involving unlink()): + * + * * Simplify (or rather get rid of) shmem setup in PG's WAL redo mode. + * * Investigate chroot() or mount namespaces for better FS isolation. + * * (Per Heikki) Simply call _exit(), no big deal. + * * Come up with a better idea? + * + * - Make use of seccomp's argument inspection (for what?). + * Unfortunately, it views all syscall arguments as scalars, + * so it won't work for e.g. string comparison in unlink(). + * + * - Benchmark with bpf jit on/off, try seccomp_syscall_priority(). + * + * - Test against various linux distros & glibc versions. 
+ * I suspect that certain libc functions might involve slightly + * different syscalls, e.g. select/pselect6/pselect6_time64/whatever. + * + * - Test on any arch other than amd64 to see if it works there. + * + * + * IDENTIFICATION + * src/backend/postmaster/seccomp.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" +#include "miscadmin.h" +#include "postmaster/seccomp.h" + +#include <fcntl.h> +#include <unistd.h> + +static void die(int code, const char *str); + +static bool seccomp_test_sighandler_done = false; +static void seccomp_test_sighandler(int signum, siginfo_t *info, void *cxt); +static void seccomp_deny_sighandler(int signum, siginfo_t *info, void *cxt); + +static int do_seccomp_load_rules(PgSeccompRule *rules, int count, uint32 def_action); + +void seccomp_load_rules(PgSeccompRule *rules, int count) +{ + struct sigaction action = { .sa_flags = SA_SIGINFO }; + PgSeccompRule rule; + long fd; + + /* + * Install a test signal handler. + * XXX: pqsignal() is too restrictive for our purposes, + * since we'd like to examine the contents of siginfo_t. + */ + action.sa_sigaction = seccomp_test_sighandler; + if (sigaction(SIGSYS, &action, NULL) != 0) + ereport(FATAL, + (errcode(ERRCODE_SYSTEM_ERROR), + errmsg("seccomp: could not install test SIGSYS handler"))); + + /* + * First, check that open of a well-known file works. + * XXX: We use raw syscall() to call the very open(). 
+ */ + fd = syscall(SCMP_SYS(open), "/dev/null", O_RDONLY, 0); + if (seccomp_test_sighandler_done) + ereport(FATAL, + (errcode(ERRCODE_SYSTEM_ERROR), + errmsg("seccomp: signal handler test flag was set unexpectedly"))); + if (fd < 0) + ereport(FATAL, + (errcode(ERRCODE_SYSTEM_ERROR), + errmsg("seccomp: could not open /dev/null for seccomp testing: %m"))); + close((int) fd); + + /* Set a trap on open() to test seccomp bpf */ + rule = PG_SCMP(open, SCMP_ACT_TRAP); + if (do_seccomp_load_rules(&rule, 1, SCMP_ACT_ALLOW) != 0) + ereport(FATAL, + (errcode(ERRCODE_SYSTEM_ERROR), + errmsg("seccomp: could not load test trap"))); + + /* Finally, check that open() now raises SIGSYS */ + (void) syscall(SCMP_SYS(open), "/dev/null", O_RDONLY, 0); + if (!seccomp_test_sighandler_done) + ereport(FATAL, + (errcode(ERRCODE_SYSTEM_ERROR), + errmsg("seccomp: SIGSYS handler doesn't seem to work"))); + + /* Now that everything seems to work, install a proper handler */ + action.sa_sigaction = seccomp_deny_sighandler; + if (sigaction(SIGSYS, &action, NULL) != 0) + ereport(FATAL, + (errcode(ERRCODE_SYSTEM_ERROR), + errmsg("seccomp: could not install SIGSYS handler"))); + + /* If this succeeds, any syscall not in the list will crash the process */ + if (do_seccomp_load_rules(rules, count, SCMP_ACT_TRAP) != 0) + ereport(FATAL, + (errcode(ERRCODE_SYSTEM_ERROR), + errmsg("seccomp: could not enter seccomp mode"))); +} + +/* + * Enter seccomp mode with a BPF filter that will only allow + * certain syscalls to proceed. 
+ */ +static int +do_seccomp_load_rules(PgSeccompRule *rules, int count, uint32 def_action) +{ + scmp_filter_ctx ctx; + int rc = -1; + + /* Create a context with a default action for syscalls not in the list */ + if ((ctx = seccomp_init(def_action)) == NULL) + goto cleanup; + + for (int i = 0; i < count; i++) + { + PgSeccompRule *rule = &rules[i]; + if ((rc = seccomp_rule_add(ctx, rule->psr_action, rule->psr_syscall, 0)) != 0) + goto cleanup; + } + + /* Try building & loading the program into the kernel */ + if ((rc = seccomp_load(ctx)) != 0) + goto cleanup; + +cleanup: + /* + * We don't need the context anymore regardless of the result, + * since either we failed or the eBPF program has already been + * loaded into the linux kernel. + */ + seccomp_release(ctx); + return rc; +} + +static void +die(int code, const char *str) +{ + /* work around gcc ignoring that it shouldn't warn on (void) result being unused */ + ssize_t _unused pg_attribute_unused(); + /* Best effort write to stderr */ + _unused = write(fileno(stderr), str, strlen(str)); + + /* XXX: we don't want to run any atexit callbacks */ + _exit(code); +} + +static void +seccomp_test_sighandler(int signum, siginfo_t *info, void *cxt pg_attribute_unused()) +{ +#define DIE_PREFIX "seccomp test signal handler: " + + /* Check that this signal handler is used only for a single test case */ + if (seccomp_test_sighandler_done) + die(1, DIE_PREFIX "test handler should only be used for 1 test\n"); + seccomp_test_sighandler_done = true; + + if (signum != SIGSYS) + die(1, DIE_PREFIX "bad signal number\n"); + + /* TODO: maybe somehow extract the hardcoded syscall number */ + if (info->si_syscall != SCMP_SYS(open)) + die(1, DIE_PREFIX "bad syscall number\n"); + +#undef DIE_PREFIX +} + +static void +seccomp_deny_sighandler(int signum, siginfo_t *info, void *cxt pg_attribute_unused()) +{ + /* + * Unfortunately, we can't use seccomp_syscall_resolve_num_arch() + * to resolve the syscall's name, since it calls strdup() + * 
under the hood (wtf!). + */ + char buffer[128]; + (void)snprintf(buffer, lengthof(buffer), + "---------------------------------------\n" + "seccomp: bad syscall %d\n" + "---------------------------------------\n", + info->si_syscall); + + /* + * Instead of silently crashing the process with + * a fake SIGSYS caused by SCMP_ACT_KILL_PROCESS, + * we'd like to receive a real SIGSYS to print the + * message and *then* immediately exit. + */ + die(1, buffer); +} diff --git a/src/backend/replication/Makefile b/src/backend/replication/Makefile index 2bffac58c0d..d35cb2114ca 100644 --- a/src/backend/replication/Makefile +++ b/src/backend/replication/Makefile @@ -22,7 +22,9 @@ OBJS = \ syncrep_gram.o \ walreceiver.o \ walreceiverfuncs.o \ - walsender.o + walsender.o \ + walproposer.o \ + walproposer_utils.o SUBDIRS = logical diff --git a/src/backend/replication/libpqwalproposer/Makefile b/src/backend/replication/libpqwalproposer/Makefile new file mode 100644 index 00000000000..c570160536f --- /dev/null +++ b/src/backend/replication/libpqwalproposer/Makefile @@ -0,0 +1,37 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for src/backend/replication/libpqwalproposer +# +# IDENTIFICATION +# src/backend/replication/libpqwalproposer/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/replication/libpqwalproposer +top_builddir = ../../../.. 
+include $(top_builddir)/src/Makefile.global + +override CPPFLAGS := -I$(srcdir) -I$(libpq_srcdir) $(CPPFLAGS) + +OBJS = \ + $(WIN32RES) \ + libpqwalproposer.o +SHLIB_LINK_INTERNAL = $(libpq) +SHLIB_LINK = $(filter -lintl, $(LIBS)) +SHLIB_PREREQS = submake-libpq +PGFILEDESC = "libpqwalproposer - libpq interface for WAL proposer" +NAME = libpqwalproposer + +all: all-shared-lib + +include $(top_srcdir)/src/Makefile.shlib + +install: all installdirs install-lib + +installdirs: installdirs-lib + +uninstall: uninstall-lib + +clean distclean maintainer-clean: clean-lib + rm -f $(OBJS) diff --git a/src/backend/replication/libpqwalproposer/libpqwalproposer.c b/src/backend/replication/libpqwalproposer/libpqwalproposer.c new file mode 100644 index 00000000000..a12a2ee04bc --- /dev/null +++ b/src/backend/replication/libpqwalproposer/libpqwalproposer.c @@ -0,0 +1,416 @@ +#include "postgres.h" + +#include "replication/walproposer.h" +#include "libpq-fe.h" + +/* Required for anything that's dynamically loaded */ +PG_MODULE_MAGIC; +void _PG_init(void); + +/* Header in walproposer.h -- Wrapper struct to abstract away the libpq connection */ +struct WalProposerConn +{ + PGconn* pg_conn; + bool is_nonblocking; /* whether the connection is non-blocking */ + char *recvbuf; /* last received data from libpqprop_async_read */ +}; + +/* Prototypes for exported functions */ +static char* libpqprop_error_message(WalProposerConn* conn); +static WalProposerConnStatusType libpqprop_status(WalProposerConn* conn); +static WalProposerConn* libpqprop_connect_start(char* conninfo); +static WalProposerConnectPollStatusType libpqprop_connect_poll(WalProposerConn* conn); +static bool libpqprop_send_query(WalProposerConn* conn, char* query); +static WalProposerExecStatusType libpqprop_get_query_result(WalProposerConn* conn); +static pgsocket libpqprop_socket(WalProposerConn* conn); +static int libpqprop_flush(WalProposerConn* conn); +static void libpqprop_finish(WalProposerConn* conn); +static 
PGAsyncReadResult libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount); +static PGAsyncWriteResult libpqprop_async_write(WalProposerConn* conn, void const* buf, size_t size); +static bool libpqprop_blocking_write(WalProposerConn* conn, void const* buf, size_t size); + +static WalProposerFunctionsType PQWalProposerFunctions = { + libpqprop_error_message, + libpqprop_status, + libpqprop_connect_start, + libpqprop_connect_poll, + libpqprop_send_query, + libpqprop_get_query_result, + libpqprop_socket, + libpqprop_flush, + libpqprop_finish, + libpqprop_async_read, + libpqprop_async_write, + libpqprop_blocking_write, +}; + +/* Module initialization */ +void +_PG_init(void) +{ + if (WalProposerFunctions != NULL) + elog(ERROR, "libpqwalproposer already loaded"); + WalProposerFunctions = &PQWalProposerFunctions; +} + +/* Helper function */ +static bool +ensure_nonblocking_status(WalProposerConn* conn, bool is_nonblocking) +{ + /* If we're already correctly blocking or nonblocking, all good */ + if (is_nonblocking == conn->is_nonblocking) + return true; + + /* Otherwise, set it appropriately */ + if (PQsetnonblocking(conn->pg_conn, is_nonblocking) == -1) + return false; + + conn->is_nonblocking = is_nonblocking; + return true; +} + +/* Exported function definitions */ +static char* +libpqprop_error_message(WalProposerConn* conn) +{ + return PQerrorMessage(conn->pg_conn); +} + +static WalProposerConnStatusType +libpqprop_status(WalProposerConn* conn) +{ + switch (PQstatus(conn->pg_conn)) + { + case CONNECTION_OK: + return WP_CONNECTION_OK; + case CONNECTION_BAD: + return WP_CONNECTION_BAD; + default: + return WP_CONNECTION_IN_PROGRESS; + } +} + +static WalProposerConn* +libpqprop_connect_start(char* conninfo) +{ + WalProposerConn* conn; + PGconn* pg_conn; + + pg_conn = PQconnectStart(conninfo); + /* + * Allocation of a PQconn can fail, and will return NULL. We want to fully replicate the + * behavior of PQconnectStart here. 
+ */ + if (!pg_conn) + return NULL; + + /* + * And in theory this allocation can fail as well, but it's incredibly unlikely if we just + * successfully allocated a PGconn. + * + * palloc will exit on failure though, so there's not much we could do if it *did* fail. + */ + conn = palloc(sizeof(WalProposerConn)); + conn->pg_conn = pg_conn; + conn->is_nonblocking = false; /* connections always start in blocking mode */ + conn->recvbuf = NULL; + return conn; +} + +static WalProposerConnectPollStatusType +libpqprop_connect_poll(WalProposerConn* conn) +{ + WalProposerConnectPollStatusType return_val; + + switch (PQconnectPoll(conn->pg_conn)) + { + case PGRES_POLLING_FAILED: + return_val = WP_CONN_POLLING_FAILED; + break; + case PGRES_POLLING_READING: + return_val = WP_CONN_POLLING_READING; + break; + case PGRES_POLLING_WRITING: + return_val = WP_CONN_POLLING_WRITING; + break; + case PGRES_POLLING_OK: + return_val = WP_CONN_POLLING_OK; + break; + + /* There's a comment at its source about this constant being unused. We'll expect it's never + * returned. 
*/ + case PGRES_POLLING_ACTIVE: + elog(FATAL, "Unexpected PGRES_POLLING_ACTIVE returned from PQconnectPoll"); + /* This return is never actually reached, but it's here to make the compiler happy */ + return WP_CONN_POLLING_FAILED; + + default: + Assert(false); + return_val = WP_CONN_POLLING_FAILED; /* keep the compiler quiet */ + } + + return return_val; +} + +static bool +libpqprop_send_query(WalProposerConn* conn, char* query) +{ + /* We need to be in blocking mode for sending the query to run without + * requiring a call to PQflush */ + if (!ensure_nonblocking_status(conn, false)) + return false; + + /* PQsendQuery returns 1 on success, 0 on failure */ + if (!PQsendQuery(conn->pg_conn, query)) + return false; + + return true; +} + +static WalProposerExecStatusType +libpqprop_get_query_result(WalProposerConn* conn) +{ + PGresult* result; + WalProposerExecStatusType return_val; + + /* Marker variable if we need to log an unexpected success result */ + char* unexpected_success = NULL; + + /* Consume any input that we might be missing */ + if (!PQconsumeInput(conn->pg_conn)) + return WP_EXEC_FAILED; + + if (PQisBusy(conn->pg_conn)) + return WP_EXEC_NEEDS_INPUT; + + + result = PQgetResult(conn->pg_conn); + /* PQgetResult returns NULL only if getting the result was successful & there's no more of the + * result to get. 
*/ + if (!result) + { + elog(WARNING, "[libpqwalproposer] Unexpected successful end of command results"); + return WP_EXEC_UNEXPECTED_SUCCESS; + } + + /* Helper macro to reduce boilerplate */ + #define UNEXPECTED_SUCCESS(msg) \ + return_val = WP_EXEC_UNEXPECTED_SUCCESS; \ + unexpected_success = msg; \ + break; + + + switch (PQresultStatus(result)) + { + /* "true" success case */ + case PGRES_COPY_BOTH: + return_val = WP_EXEC_SUCCESS_COPYBOTH; + break; + + /* Unexpected success case */ + case PGRES_EMPTY_QUERY: + UNEXPECTED_SUCCESS("empty query return"); + case PGRES_COMMAND_OK: + UNEXPECTED_SUCCESS("data-less command end"); + case PGRES_TUPLES_OK: + UNEXPECTED_SUCCESS("tuples return"); + case PGRES_COPY_OUT: + UNEXPECTED_SUCCESS("'Copy Out' response"); + case PGRES_COPY_IN: + UNEXPECTED_SUCCESS("'Copy In' response"); + case PGRES_SINGLE_TUPLE: + UNEXPECTED_SUCCESS("single tuple return"); + case PGRES_PIPELINE_SYNC: + UNEXPECTED_SUCCESS("pipeline sync point"); + + /* Failure cases */ + case PGRES_BAD_RESPONSE: + case PGRES_NONFATAL_ERROR: + case PGRES_FATAL_ERROR: + case PGRES_PIPELINE_ABORTED: + return_val = WP_EXEC_FAILED; + break; + + default: + Assert(false); + return_val = WP_EXEC_FAILED; /* keep the compiler quiet */ + } + + if (unexpected_success) + elog(WARNING, "[libpqwalproposer] Unexpected successful %s", unexpected_success); + + return return_val; +} + +static pgsocket +libpqprop_socket(WalProposerConn* conn) +{ + return PQsocket(conn->pg_conn); +} + +static int +libpqprop_flush(WalProposerConn* conn) +{ + return (PQflush(conn->pg_conn)); +} + +static void +libpqprop_finish(WalProposerConn* conn) +{ + if (conn->recvbuf != NULL) + PQfreemem(conn->recvbuf); + PQfinish(conn->pg_conn); + pfree(conn); +} + +/* + * Receive a message from the safekeeper. + * + * On success, the data is placed in *buf. It is valid until the next call + * to this function. 
+ */ +static PGAsyncReadResult +libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount) +{ + int result; + + if (conn->recvbuf != NULL) + { + PQfreemem(conn->recvbuf); + conn->recvbuf = NULL; + } + + /* Call PQconsumeInput so that we have the data we need */ + if (!PQconsumeInput(conn->pg_conn)) + { + *amount = 0; + *buf = NULL; + return PG_ASYNC_READ_FAIL; + } + + /* The docs for PQgetCopyData list the return values as: + * 0 if the copy is still in progress, but no "complete row" is + * available + * -1 if the copy is done + * -2 if an error occurred + * (> 0) if it was successful; that value is the amount transferred. + * + * The protocol we use between walproposer and safekeeper means that we + * *usually* wouldn't expect to see that the copy is done, but this can + * sometimes be triggered by the server returning an ErrorResponse (which + * also happens to have the effect that the copy is done). + */ + switch (result = PQgetCopyData(conn->pg_conn, &conn->recvbuf, true)) + { + case 0: + *amount = 0; + *buf = NULL; + return PG_ASYNC_READ_TRY_AGAIN; + case -1: + { + /* + * If we get -1, it's probably because of a server error; the + * safekeeper won't normally send a CopyDone message. 
+ * + * We can check PQgetResult to make sure that the server failed; + * it'll always result in PGRES_FATAL_ERROR + */ + ExecStatusType status = PQresultStatus(PQgetResult(conn->pg_conn)); + + if (status != PGRES_FATAL_ERROR) + elog(FATAL, "unexpected result status %d after failed PQgetCopyData", status); + + /* If there was actually an error, it'll be properly reported by + * calls to PQerrorMessage -- we don't have to do anything else */ + *amount = 0; + *buf = NULL; + return PG_ASYNC_READ_FAIL; + } + case -2: + *amount = 0; + *buf = NULL; + return PG_ASYNC_READ_FAIL; + default: + /* Positive values indicate the size of the returned result */ + *amount = result; + *buf = conn->recvbuf; + return PG_ASYNC_READ_SUCCESS; + } +} + +static PGAsyncWriteResult +libpqprop_async_write(WalProposerConn* conn, void const* buf, size_t size) +{ + int result; + + /* If we aren't in non-blocking mode, switch to it. */ + if (!ensure_nonblocking_status(conn, true)) + return PG_ASYNC_WRITE_FAIL; + + /* The docs for PQputCopyData list the return values as: + * 1 if the data was queued, + * 0 if it was not queued because of full buffers, or + * -1 if an error occurred + */ + result = PQputCopyData(conn->pg_conn, buf, size); + + /* We won't get a result of zero because walproposer always empties the + * connection's buffers before sending more */ + Assert(result != 0); + + switch (result) + { + case 1: + /* good -- continue */ + break; + case -1: + return PG_ASYNC_WRITE_FAIL; + default: + elog(FATAL, "invalid return %d from PQputCopyData", result); + } + + /* After queueing the data, we still need to flush to get it to send. + * This might take multiple tries, but we don't want to wait around + * until it's done. 
+ * + * PQflush has the following returns (directly quoting the docs): + * 0 if successful, + * 1 if it was unable to send all the data in the send queue yet + * -1 if it failed for some reason + */ + switch (result = PQflush(conn->pg_conn)) { + case 0: + return PG_ASYNC_WRITE_SUCCESS; + case 1: + return PG_ASYNC_WRITE_TRY_FLUSH; + case -1: + return PG_ASYNC_WRITE_FAIL; + default: + elog(FATAL, "invalid return %d from PQflush", result); + } +} + +static bool +libpqprop_blocking_write(WalProposerConn* conn, void const* buf, size_t size) +{ + int result; + + /* If we are in non-blocking mode, switch out of it. */ + if (!ensure_nonblocking_status(conn, false)) + return false; + + /* This function is very similar to libpqprop_async_write. For more + * information, refer to the comments there */ + if ((result = PQputCopyData(conn->pg_conn, buf, size)) == -1) + return false; + + Assert(result == 1); + + /* Because the connection is blocking, flushing returns 0 or -1 */ + + if ((result = PQflush(conn->pg_conn)) == -1) + return false; + + Assert(result == 0); + return true; +} diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c new file mode 100644 index 00000000000..a72f63509bf --- /dev/null +++ b/src/backend/replication/walproposer.c @@ -0,0 +1,2357 @@ +/*------------------------------------------------------------------------- + * + * walproposer.c + * + * Proposer/leader part of the total order broadcast protocol between postgres + * and WAL safekeepers. + * + * We have two ways of launching WalProposer: + * + * 1. As a background worker which will run physical WalSender with + * am_wal_proposer flag set to true. WalSender in turn would handle WAL + * reading part and call WalProposer when ready to scatter WAL. + * + * 2. As a standalone utility by running `postgres --sync-safekeepers`. That + * is needed to create LSN from which it is safe to start postgres. 
More + * specifically it addresses following problems: + * + * a) Chicken-or-the-egg problem: compute postgres needs data directory + * with non-rel files that are downloaded from pageserver by calling + * basebackup@LSN. This LSN is not arbitrary, it must include all + * previously committed transactions and defined through consensus + * voting, which happens... in walproposer, a part of compute node. + * + * b) Just warranting such LSN is not enough, we must also actually commit + * it and make sure there is a safekeeper who knows this LSN is + * committed so WAL before it can be streamed to pageserver -- otherwise + * basebackup will hang waiting for WAL. Advancing commit_lsn without + * playing consensus game is impossible, so speculative 'let's just poll + * safekeepers, learn start LSN of future epoch and run basebackup' + * won't work. + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include +#include +#include +#include "access/xlogdefs.h" +#include "access/xlogutils.h" +#include "access/xlogrecovery.h" +#include "replication/walproposer.h" +#include "storage/latch.h" +#include "storage/fd.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "access/xlog.h" +#include "libpq/pqformat.h" +#include "replication/slot.h" +#include "replication/walreceiver.h" +#include "postmaster/bgworker.h" +#include "postmaster/interrupt.h" +#include "postmaster/postmaster.h" +#include "storage/pmsignal.h" +#include "storage/proc.h" +#include "tcop/tcopprot.h" +#include "utils/builtins.h" +#include "utils/memutils.h" +#include "utils/timestamp.h" + + +char *wal_acceptors_list; +int wal_acceptor_reconnect_timeout; +int wal_acceptor_connect_timeout; +bool am_wal_proposer; + +char *zenith_timeline_walproposer = NULL; +char *zenith_tenant_walproposer = NULL; +char *zenith_pageserver_connstring_walproposer = NULL; + +/* Declared in walproposer.h, defined here, initialized in libpqwalproposer.c */ 
+WalProposerFunctionsType *WalProposerFunctions = NULL; + +#define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot" + +static int n_safekeepers = 0; +static int quorum = 0; +static Safekeeper safekeeper[MAX_SAFEKEEPERS]; +static XLogRecPtr availableLsn; /* WAL has been generated up to this point */ +static XLogRecPtr lastSentCommitLsn; /* last commitLsn broadcast to safekeepers */ +static ProposerGreeting greetRequest; +static VoteRequest voteRequest; /* Vote request for safekeeper */ +static WaitEventSet *waitEvents; +static AppendResponse quorumFeedback; +/* + * Minimal LSN which may be needed for recovery of some safekeeper, + * record-aligned (first record which might not yet received by someone). + */ +static XLogRecPtr truncateLsn; +/* + * Term of the proposer. We want our term to be highest and unique, + * so we collect terms from safekeepers quorum, choose max and +1. + * After that our term is fixed and must not change. If we observe + * that some safekeeper has higher term, it means that we have another + * running compute, so we must stop immediately. 
+ */ +static term_t propTerm; +static TermHistory propTermHistory; /* term history of the proposer */ +static XLogRecPtr propEpochStartLsn; /* epoch start lsn of the proposer */ +static term_t donorEpoch; /* Most advanced acceptor epoch */ +static int donor; /* Most advanced acceptor */ +static XLogRecPtr timelineStartLsn; /* timeline globally starts at this LSN */ +static int n_votes = 0; +static int n_connected = 0; +static TimestampTz last_reconnect_attempt; + +/* Set to true only in standalone run of `postgres --sync-safekeepers` (see comment on top) */ +static bool syncSafekeepers; + +static WalproposerShmemState *walprop_shared; + +/* Prototypes for private functions */ +static void WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId, TimeLineID tli); +static void WalProposerStart(void); +static void WalProposerLoop(void); +static void InitEventSet(void); +static void UpdateEventSet(Safekeeper *sk, uint32 events); +static void HackyRemoveWalProposerEvent(Safekeeper *to_remove); +static void ShutdownConnection(Safekeeper *sk); +static void ResetConnection(Safekeeper *sk); +static long TimeToReconnect(TimestampTz now); +static void ReconnectSafekeepers(void); +static void AdvancePollState(Safekeeper *sk, uint32 events); +static void HandleConnectionEvent(Safekeeper *sk); +static void SendStartWALPush(Safekeeper *sk); +static void RecvStartWALPushResult(Safekeeper *sk); +static void SendProposerGreeting(Safekeeper *sk); +static void RecvAcceptorGreeting(Safekeeper *sk); +static void SendVoteRequest(Safekeeper *sk); +static void RecvVoteResponse(Safekeeper *sk); +static void HandleElectedProposer(void); +static term_t GetHighestTerm(TermHistory *th); +static term_t GetEpoch(Safekeeper *sk); +static void DetermineEpochStartLsn(void); +static bool WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos); +static void SendProposerElected(Safekeeper *sk); +static void WalProposerStartStreaming(XLogRecPtr startpos); +static void 
StartStreaming(Safekeeper *sk); +static void SendMessageToNode(Safekeeper *sk); +static void BroadcastAppendRequest(void); +static void HandleActiveState(Safekeeper *sk, uint32 events); +static bool SendAppendRequests(Safekeeper *sk); +static bool RecvAppendResponses(Safekeeper *sk); +static void CombineHotStanbyFeedbacks(HotStandbyFeedback * hs); +static XLogRecPtr CalculateMinFlushLsn(void); +static XLogRecPtr GetAcknowledgedByQuorumWALPosition(void); +static void HandleSafekeeperResponse(void); +static bool AsyncRead(Safekeeper *sk, char **buf, int *buf_size); +static bool AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg); +static bool BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState success_state); +static bool AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_state); +static bool AsyncFlush(Safekeeper *sk); + + +/* + * WAL proposer bgworker entry point. + */ +void +WalProposerMain(Datum main_arg) +{ + TimeLineID tli; + + /* Establish signal handlers. */ + pqsignal(SIGUSR1, procsignal_sigusr1_handler); + pqsignal(SIGHUP, SignalHandlerForConfigReload); + pqsignal(SIGTERM, die); + + BackgroundWorkerUnblockSignals(); + + GetXLogReplayRecPtr(&tli); + + WalProposerInit(GetFlushRecPtr(NULL), GetSystemIdentifier(), tli); + + last_reconnect_attempt = GetCurrentTimestamp(); + + application_name = (char *) "walproposer"; /* for + * synchronous_standby_names */ + am_wal_proposer = true; + am_walsender = true; + InitWalSender(); + InitProcessPhase2(); + + /* Create replication slot for WAL proposer if not exists */ + if (SearchNamedReplicationSlot(WAL_PROPOSER_SLOT_NAME, false) == NULL) + { + ReplicationSlotCreate(WAL_PROPOSER_SLOT_NAME, false, RS_PERSISTENT, false); + ReplicationSlotReserveWal(); + /* Write this slot to disk */ + ReplicationSlotMarkDirty(); + ReplicationSlotSave(); + ReplicationSlotRelease(); + } + + WalProposerStart(); +} + +/* + * Entry point for `postgres --sync-safekeepers`. 
+ */ +void +WalProposerSync(int argc, char *argv[]) +{ + struct stat stat_buf; + // FIXME Write a comment, why this hardcoded value is safe + TimeLineID tli = 1; + + syncSafekeepers = true; + + InitStandaloneProcess(argv[0]); + + SetProcessingMode(InitProcessing); + + /* + * Set default values for command-line options. + */ + InitializeGUCOptions(); + + /* Acquire configuration parameters */ + if (!SelectConfigFiles(NULL, progname)) + exit(1); + + /* + * Imitate we are early in bootstrap loading shared_preload_libraries; + * zenith extension sets PGC_POSTMASTER gucs requiring this. + */ + process_shared_preload_libraries_in_progress = true; + + /* + * Initialize postmaster_alive_fds as WaitEventSet checks them. + * + * Copied from InitPostmasterDeathWatchHandle() + */ + if (pipe(postmaster_alive_fds) < 0) + ereport(FATAL, + (errcode_for_file_access(), + errmsg_internal("could not create pipe to monitor postmaster death: %m"))); + if (fcntl(postmaster_alive_fds[POSTMASTER_FD_WATCH], F_SETFL, O_NONBLOCK) == -1) + ereport(FATAL, + (errcode_for_socket_access(), + errmsg_internal("could not set postmaster death monitoring pipe to nonblocking mode: %m"))); + + ChangeToDataDir(); + + /* Create pg_wal directory, if it doesn't exist */ + if (stat(XLOGDIR, &stat_buf) != 0) + { + ereport(LOG, (errmsg("creating missing WAL directory \"%s\"", XLOGDIR))); + if (MakePGDirectory(XLOGDIR) < 0) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create directory \"%s\": %m", + XLOGDIR))); + exit(1); + } + } + + WalProposerInit(0, 0, tli); + + process_shared_preload_libraries_in_progress = false; + + BackgroundWorkerUnblockSignals(); + + WalProposerStart(); +} + +/* + * Create new AppendRequest message and start sending it. This function is + * called from walsender every time the new WAL is available. 
+ */ +void +WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos) +{ + Assert(startpos == availableLsn && endpos >= availableLsn); + availableLsn = endpos; + BroadcastAppendRequest(); +} + +/* + * Advance the WAL proposer state machine, waiting each time for events to occur. + * Will exit only when latch is set, i.e. new WAL should be pushed from walsender + * to walproposer. + */ +void +WalProposerPoll(void) +{ + while (true) + { + Safekeeper *sk; + int rc; + WaitEvent event; + TimestampTz now = GetCurrentTimestamp(); + + rc = WaitEventSetWait(waitEvents, TimeToReconnect(now), + &event, 1, WAIT_EVENT_WAL_SENDER_MAIN); + sk = (Safekeeper *) event.user_data; + + /* + * If the event contains something that one of our safekeeper states + * was waiting for, we'll advance its state. + */ + if (rc != 0 && (event.events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))) + AdvancePollState(sk, event.events); + + /* + * If the timeout expired, attempt to reconnect to any safekeepers that + * we dropped + */ + ReconnectSafekeepers(); + + /* + * If wait is terminated by latch set (walsenders' latch is set on + * each wal flush), then exit loop. (no need for pm death check due to + * WL_EXIT_ON_PM_DEATH) + */ + if (rc != 0 && (event.events & WL_LATCH_SET)) + { + ResetLatch(MyLatch); + break; + } + if (rc == 0) /* timeout expired: poll state */ + { + TimestampTz now; + + /* + * If no WAL was generated during timeout (and we have already + * collected the quorum), then send pool message + */ + if (availableLsn != InvalidXLogRecPtr) + { + BroadcastAppendRequest(); + } + + /* + * Abandon connection attempts which take too long. 
+ */ + now = GetCurrentTimestamp(); + for (int i = 0; i < n_safekeepers; i++) + { + Safekeeper *sk = &safekeeper[i]; + + if ((sk->state == SS_CONNECTING_WRITE || + sk->state == SS_CONNECTING_READ) && + TimestampDifferenceExceeds(sk->startedConnAt, now, + wal_acceptor_connect_timeout)) + { + elog(WARNING, "failed to connect to node '%s:%s': exceeded connection timeout %dms", + sk->host, sk->port, wal_acceptor_connect_timeout); + ShutdownConnection(sk); + } + } + } + } +} + +/* + * Register a background worker proposing WAL to wal acceptors. + */ +void +WalProposerRegister(void) +{ + BackgroundWorker bgw; + + if (*wal_acceptors_list == '\0') + return; + + memset(&bgw, 0, sizeof(bgw)); + bgw.bgw_flags = BGWORKER_SHMEM_ACCESS; + bgw.bgw_start_time = BgWorkerStart_RecoveryFinished; + snprintf(bgw.bgw_library_name, BGW_MAXLEN, "postgres"); + snprintf(bgw.bgw_function_name, BGW_MAXLEN, "WalProposerMain"); + snprintf(bgw.bgw_name, BGW_MAXLEN, "WAL proposer"); + snprintf(bgw.bgw_type, BGW_MAXLEN, "WAL proposer"); + bgw.bgw_restart_time = 5; + bgw.bgw_notify_pid = 0; + bgw.bgw_main_arg = (Datum) 0; + + RegisterBackgroundWorker(&bgw); +} + +static void +WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId, TimeLineID tli) +{ + char *host; + char *sep; + char *port; + + /* Load the libpq-specific functions */ + load_file("libpqwalproposer", false); + if (WalProposerFunctions == NULL) + elog(ERROR, "libpqwalproposer didn't initialize correctly"); + + load_file("libpqwalreceiver", false); + if (WalReceiverFunctions == NULL) + elog(ERROR, "libpqwalreceiver didn't initialize correctly"); + load_file("neon", false); + + for (host = wal_acceptors_list; host != NULL && *host != '\0'; host = sep) + { + port = strchr(host, ':'); + if (port == NULL) + { + elog(FATAL, "port is not specified"); + } + *port++ = '\0'; + sep = strchr(port, ','); + if (sep != NULL) + *sep++ = '\0'; + if (n_safekeepers + 1 >= MAX_SAFEKEEPERS) + { + elog(FATAL, "Too many safekeepers"); + } + 
safekeeper[n_safekeepers].host = host; + safekeeper[n_safekeepers].port = port; + safekeeper[n_safekeepers].state = SS_OFFLINE; + safekeeper[n_safekeepers].conn = NULL; + + /* + * Set conninfo to empty. We'll fill it out once later, in + * `ResetConnection` as needed + */ + safekeeper[n_safekeepers].conninfo[0] = '\0'; + initStringInfo(&safekeeper[n_safekeepers].outbuf); + safekeeper[n_safekeepers].xlogreader = XLogReaderAllocate(wal_segment_size, NULL, XL_ROUTINE(.segment_open = wal_segment_open, .segment_close = wal_segment_close), NULL); + if (safekeeper[n_safekeepers].xlogreader == NULL) + elog(FATAL, "Failed to allocate xlog reader"); + safekeeper[n_safekeepers].flushWrite = false; + safekeeper[n_safekeepers].startStreamingAt = InvalidXLogRecPtr; + safekeeper[n_safekeepers].streamingAt = InvalidXLogRecPtr; + n_safekeepers += 1; + } + if (n_safekeepers < 1) + { + elog(FATAL, "Safekeepers addresses are not specified"); + } + quorum = n_safekeepers / 2 + 1; + + /* Fill the greeting package */ + greetRequest.tag = 'g'; + greetRequest.protocolVersion = SK_PROTOCOL_VERSION; + greetRequest.pgVersion = PG_VERSION_NUM; + pg_strong_random(&greetRequest.proposerId, sizeof(greetRequest.proposerId)); + greetRequest.systemId = systemId; + if (!zenith_timeline_walproposer) + elog(FATAL, "neon.timeline_id is not provided"); + if (*zenith_timeline_walproposer != '\0' && + !HexDecodeString(greetRequest.ztimelineid, zenith_timeline_walproposer, 16)) + elog(FATAL, "Could not parse neon.timeline_id, %s", zenith_timeline_walproposer); + if (!zenith_tenant_walproposer) + elog(FATAL, "neon.tenant_id is not provided"); + if (*zenith_tenant_walproposer != '\0' && + !HexDecodeString(greetRequest.ztenantid, zenith_tenant_walproposer, 16)) + elog(FATAL, "Could not parse neon.tenant_id, %s", zenith_tenant_walproposer); + + greetRequest.timeline = tli; + greetRequest.walSegSize = wal_segment_size; + + InitEventSet(); +} + +static void +WalProposerStart(void) +{ + + /* Initiate connections 
to all safekeeper nodes */ + for (int i = 0; i < n_safekeepers; i++) + { + ResetConnection(&safekeeper[i]); + } + + WalProposerLoop(); +} + +static void +WalProposerLoop(void) +{ + while (true) + WalProposerPoll(); +} + +/* Initializes the internal event set, provided that it is currently null */ +static void +InitEventSet(void) +{ + if (waitEvents) + elog(FATAL, "double-initialization of event set"); + + waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + n_safekeepers); + AddWaitEventToSet(waitEvents, WL_LATCH_SET, PGINVALID_SOCKET, + MyLatch, NULL); + AddWaitEventToSet(waitEvents, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, + NULL, NULL); +} + +/* + * Updates the events we're already waiting on for the safekeeper, setting it to + * the provided `events` + * + * This function is called any time the safekeeper's state switches to one where + * it has to wait to continue. This includes the full body of AdvancePollState + * and calls to IO helper functions. + */ +static void +UpdateEventSet(Safekeeper *sk, uint32 events) +{ + /* eventPos = -1 when we don't have an event */ + Assert(sk->eventPos != -1); + + ModifyWaitEvent(waitEvents, sk->eventPos, events, NULL); +} + +/* Hack: provides a way to remove the event corresponding to an individual walproposer from the set. + * + * Note: Internally, this completely reconstructs the event set. It should be avoided if possible. + */ +static void +HackyRemoveWalProposerEvent(Safekeeper *to_remove) +{ + /* Remove the existing event set */ + if (waitEvents) + { + FreeWaitEventSet(waitEvents); + waitEvents = NULL; + } + /* Re-initialize it without adding any safekeeper events */ + InitEventSet(); + + /* + * loop through the existing safekeepers. If they aren't the one we're + * removing, and if they have a socket we can use, re-add the applicable + * events. 
+ */ + for (int i = 0; i < n_safekeepers; i++) + { + uint32 desired_events = WL_NO_EVENTS; + Safekeeper *sk = &safekeeper[i]; + + sk->eventPos = -1; + + if (sk == to_remove) + continue; + + /* If this safekeeper isn't offline, add an event for it! */ + if (sk->conn != NULL) + { + desired_events = SafekeeperStateDesiredEvents(sk->state); + sk->eventPos = AddWaitEventToSet(waitEvents, desired_events, walprop_socket(sk->conn), NULL, sk); + } + } +} + +/* Shuts down and cleans up the connection for a safekeeper. Sets its state to SS_OFFLINE */ +static void +ShutdownConnection(Safekeeper *sk) +{ + if (sk->conn) + walprop_finish(sk->conn); + sk->conn = NULL; + sk->state = SS_OFFLINE; + sk->flushWrite = false; + sk->streamingAt = InvalidXLogRecPtr; + + if (sk->voteResponse.termHistory.entries) + pfree(sk->voteResponse.termHistory.entries); + sk->voteResponse.termHistory.entries = NULL; + + HackyRemoveWalProposerEvent(sk); +} + +/* + * This function is called to establish new connection or to reestablish + * connection in case of connection failure. + * + * On success, sets the state to SS_CONNECTING_WRITE. + */ +static void +ResetConnection(Safekeeper *sk) +{ + pgsocket sock; /* socket of the new connection */ + + if (sk->state != SS_OFFLINE) + { + ShutdownConnection(sk); + } + + /* + * Try to establish new connection + * + * If the connection information hasn't been filled out, we need to do + * that here. 
+ */ + if (sk->conninfo[0] == '\0') + { + int written = 0; + written = snprintf((char *) &sk->conninfo, MAXCONNINFO, + "host=%s port=%s dbname=replication options='-c ztimelineid=%s ztenantid=%s'", + sk->host, sk->port, zenith_timeline_walproposer, zenith_tenant_walproposer); + // currently connection string is not that long, but once we pass something like jwt we might overflow the buffer, + // so it is better to be defensive and check that everything aligns well + if (written > MAXCONNINFO || written < 0) + elog(FATAL, "could not create connection string for safekeeper %s:%s", sk->host, sk->port); + } + + sk->conn = walprop_connect_start((char *) &sk->conninfo); + + /* + * "If the result is null, then libpq has been unable to allocate a new + * PGconn structure" + */ + if (!sk->conn) + elog(FATAL, "failed to allocate new PGconn object"); + + /* + * PQconnectStart won't actually start connecting until we run + * PQconnectPoll. Before we do that though, we need to check that it + * didn't immediately fail. + */ + if (walprop_status(sk->conn) == WP_CONNECTION_BAD) + { + /*--- + * According to libpq docs: + * "If the result is CONNECTION_BAD, the connection attempt has already failed, + * typically because of invalid connection parameters." + * We should report this failure. + * + * https://www.postgresql.org/docs/devel/libpq-connect.html#LIBPQ-PQCONNECTSTARTPARAMS + */ + elog(WARNING, "Immediate failure to connect with node:\n\t%s\n\terror: %s", + sk->conninfo, walprop_error_message(sk->conn)); + + /* + * Even though the connection failed, we still need to clean up the + * object + */ + walprop_finish(sk->conn); + sk->conn = NULL; + return; + } + + /* + * The documentation for PQconnectStart states that we should call + * PQconnectPoll in a loop until it returns PGRES_POLLING_OK or + * PGRES_POLLING_FAILED. The other two possible returns indicate whether + * we should wait for reading or writing on the socket. 
For the first
	 * iteration of the loop, we're expected to wait until the socket becomes
	 * writable.
	 *
	 * The wording of the documentation is a little ambiguous; thankfully
	 * there's an example in the postgres source itself showing this behavior.
	 * (see libpqrcv_connect, defined in
	 * src/backend/replication/libpqwalreceiver/libpqwalreceiver.c)
	 */
	elog(LOG, "connecting with node %s:%s", sk->host, sk->port);

	sk->state = SS_CONNECTING_WRITE;
	/* remember when the attempt started (NOTE(review): presumably consulted
	 * elsewhere for connection-timeout handling — confirm) */
	sk->startedConnAt = GetCurrentTimestamp();

	sock = walprop_socket(sk->conn);
	sk->eventPos = AddWaitEventToSet(waitEvents, WL_SOCKET_WRITEABLE, sock, NULL, sk);
	return;
}

/*
 * How many milliseconds are left until we should attempt reconnection to
 * safekeepers? Returns 0 if it is already high time, -1 if we never reconnect
 * (do we actually need this?).
 */
static long
TimeToReconnect(TimestampTz now)
{
	TimestampTz passed;
	TimestampTz till_reconnect;

	/* A non-positive timeout disables automatic reconnection. */
	if (wal_acceptor_reconnect_timeout <= 0)
		return -1;

	/*
	 * Timestamps are in microseconds while the timeout GUC is in
	 * milliseconds, hence the *1000 / /1000 conversions.
	 */
	passed = now - last_reconnect_attempt;
	till_reconnect = wal_acceptor_reconnect_timeout * 1000 - passed;
	if (till_reconnect <= 0)
		return 0;
	return (long) (till_reconnect / 1000);
}

/* If the timeout has expired, attempt to reconnect to all offline safekeepers */
static void
ReconnectSafekeepers(void)
{
	TimestampTz now = GetCurrentTimestamp();

	if (TimeToReconnect(now) == 0)
	{
		last_reconnect_attempt = now;
		for (int i = 0; i < n_safekeepers; i++)
		{
			if (safekeeper[i].state == SS_OFFLINE)
				ResetConnection(&safekeeper[i]);
		}
	}
}

/*
 * Performs the logic for advancing the state machine of the specified safekeeper,
 * given that a certain set of events has occurred.
 */
static void
AdvancePollState(Safekeeper *sk, uint32 events)
{
	/*
	 * Sanity check. We assume further down that the operations don't
	 * block because the socket is ready.
	 */
	AssertEventsOkForState(events, sk);

	/* Execute the code corresponding to the current state */
	switch (sk->state)
	{
			/*
			 * safekeepers are only taken out of SS_OFFLINE by calls to
			 * ResetConnection
			 */
		case SS_OFFLINE:
			elog(FATAL, "Unexpected safekeeper %s:%s state advancement: is offline",
				 sk->host, sk->port);
			break;				/* actually unreachable, but prevents
								 * -Wimplicit-fallthrough */

			/*
			 * Both connecting states run the same logic. The only
			 * difference is the events they're expecting
			 */
		case SS_CONNECTING_READ:
		case SS_CONNECTING_WRITE:
			HandleConnectionEvent(sk);
			break;

			/*
			 * Waiting for a successful CopyBoth response.
			 */
		case SS_WAIT_EXEC_RESULT:
			RecvStartWALPushResult(sk);
			break;

			/*
			 * Finish handshake comms: receive information about the safekeeper.
			 */
		case SS_HANDSHAKE_RECV:
			RecvAcceptorGreeting(sk);
			break;

			/*
			 * Voting is an idle state - we don't expect any events to trigger.
			 * Refer to the execution of SS_HANDSHAKE_RECV to see how nodes are
			 * transferred from SS_VOTING to sending actual vote requests.
			 * An event here therefore means the peer closed the connection.
			 */
		case SS_VOTING:
			elog(WARNING, "EOF from node %s:%s in %s state", sk->host,
				 sk->port, FormatSafekeeperState(sk->state));
			ResetConnection(sk);
			return;

			/* Read the safekeeper response for our candidate */
		case SS_WAIT_VERDICT:
			RecvVoteResponse(sk);
			break;

			/* Flush proposer announcement message */
		case SS_SEND_ELECTED_FLUSH:

			/*
			 * AsyncFlush ensures we only move on to SS_ACTIVE once the flush
			 * completes. If we still have more to do, we'll wait until the next
			 * poll comes along.
			 */
			if (!AsyncFlush(sk))
				return;

			/* flush is done, event set and state will be updated later */
			StartStreaming(sk);
			break;

			/*
			 * Idle state for waiting votes from quorum.
			 * As with SS_VOTING, an event means the connection dropped.
			 */
		case SS_IDLE:
			elog(WARNING, "EOF from node %s:%s in %s state", sk->host,
				 sk->port, FormatSafekeeperState(sk->state));
			ResetConnection(sk);
			return;

			/*
			 * Active state is used for streaming WAL and receiving feedback.
			 */
		case SS_ACTIVE:
			HandleActiveState(sk, events);
			break;
	}
}

/*
 * Advance a safekeeper that is in one of the two SS_CONNECTING_* states by
 * polling the non-blocking connection attempt and re-arming the event set.
 */
static void
HandleConnectionEvent(Safekeeper *sk)
{
	WalProposerConnectPollStatusType result = walprop_connect_poll(sk->conn);

	/* The new set of events we'll wait on, after updating */
	uint32		new_events = WL_NO_EVENTS;

	switch (result)
	{
		case WP_CONN_POLLING_OK:
			elog(LOG, "connected with node %s:%s", sk->host,
				 sk->port);

			/*
			 * We have to pick some event to update event set.
			 * We'll eventually need the socket to be readable,
			 * so we go with that.
			 */
			new_events = WL_SOCKET_READABLE;
			break;

			/*
			 * If we need to poll to finish connecting,
			 * continue doing that
			 */
		case WP_CONN_POLLING_READING:
			sk->state = SS_CONNECTING_READ;
			new_events = WL_SOCKET_READABLE;
			break;
		case WP_CONN_POLLING_WRITING:
			sk->state = SS_CONNECTING_WRITE;
			new_events = WL_SOCKET_WRITEABLE;
			break;

		case WP_CONN_POLLING_FAILED:
			elog(WARNING, "failed to connect to node '%s:%s': %s",
				 sk->host, sk->port, walprop_error_message(sk->conn));

			/*
			 * If connecting failed, we don't want to restart
			 * the connection because that might run us into a
			 * loop. Instead, shut it down -- it'll naturally
			 * restart at a slower interval on calls to
			 * ReconnectSafekeepers.
			 */
			ShutdownConnection(sk);
			return;
	}

	/*
	 * Because PQconnectPoll can change the socket, we have to
	 * un-register the old event and re-register an event on
	 * the new socket.
	 */
	HackyRemoveWalProposerEvent(sk);
	sk->eventPos = AddWaitEventToSet(waitEvents, new_events, walprop_socket(sk->conn), NULL, sk);

	/* If we successfully connected, send START_WAL_PUSH query */
	if (result == WP_CONN_POLLING_OK)
		SendStartWALPush(sk);
}

/*
 * Send "START_WAL_PUSH" message as an empty query to the safekeeper. Performs
 * a blocking send, then immediately moves to SS_WAIT_EXEC_RESULT. If something
 * goes wrong, change state to SS_OFFLINE and shutdown the connection.
 *
 * When a pageserver connection string is configured it is passed along so the
 * safekeeper can forward WAL to the pageserver.
 */
static void
SendStartWALPush(Safekeeper *sk)
{
	char	   *query = NULL;

	if (zenith_pageserver_connstring_walproposer != NULL) {
		query = psprintf("START_WAL_PUSH %s", zenith_pageserver_connstring_walproposer);
	} else {
		query = psprintf("START_WAL_PUSH");
	}
	if (!walprop_send_query(sk->conn, query))
	{
		pfree(query);
		elog(WARNING, "Failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s",
			 sk->host, sk->port, walprop_error_message(sk->conn));
		ShutdownConnection(sk);
		return;
	}
	pfree(query);
	sk->state = SS_WAIT_EXEC_RESULT;
	UpdateEventSet(sk, WL_SOCKET_READABLE);
}

/*
 * Process the result of the START_WAL_PUSH query; on a successful CopyBoth
 * response, begin the proposer/acceptor handshake.
 */
static void
RecvStartWALPushResult(Safekeeper *sk)
{
	switch (walprop_get_query_result(sk->conn))
	{
			/*
			 * Successful result, move on to starting the
			 * handshake
			 */
		case WP_EXEC_SUCCESS_COPYBOTH:

			SendProposerGreeting(sk);
			break;

			/*
			 * Needs repeated calls to finish.
Wait until the
			 * socket is readable
			 */
		case WP_EXEC_NEEDS_INPUT:

			/*
			 * SS_WAIT_EXEC_RESULT is always reached through an
			 * event, so we don't need to update the event set
			 */
			break;

		case WP_EXEC_FAILED:
			elog(WARNING, "Failed to send query to safekeeper %s:%s: %s",
				 sk->host, sk->port, walprop_error_message(sk->conn));
			ShutdownConnection(sk);
			return;

			/*
			 * Unexpected result -- fundamentally an error, but we
			 * want to produce a custom message, rather than a
			 * generic "something went wrong"
			 */
		case WP_EXEC_UNEXPECTED_SUCCESS:
			elog(WARNING, "Received bad response from safekeeper %s:%s query execution",
				 sk->host, sk->port);
			ShutdownConnection(sk);
			return;
	}
}

/*
 * Start handshake: first of all send information about the
 * safekeeper. After sending, we wait on SS_HANDSHAKE_RECV for
 * a response to finish the handshake.
 */
static void
SendProposerGreeting(Safekeeper *sk)
{
	/*
	 * On failure, logging & resetting the connection is handled.
	 * We just need to handle the control flow.
	 */
	BlockingWrite(sk, &greetRequest, sizeof(greetRequest), SS_HANDSHAKE_RECV);
}

/*
 * Receive the safekeeper's greeting, collect its term into the term election,
 * and once a quorum of safekeepers has connected, start the vote round.
 */
static void
RecvAcceptorGreeting(Safekeeper *sk)
{
	/*
	 * If our reading doesn't immediately succeed, any necessary
	 * error handling or state setting is taken care of. We can
	 * leave any other work until later.
	 */
	sk->greetResponse.apm.tag = 'g';
	if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->greetResponse))
		return;

	/* Protocol is all good, move to voting. */
	sk->state = SS_VOTING;

	++n_connected;
	if (n_connected <= quorum)
	{
		/* We're still collecting terms from the majority. */
		propTerm = Max(sk->greetResponse.term, propTerm);

		/* Quorum is acquired, prepare the vote request.
	 */
		if (n_connected == quorum)
		{
			/* Run the election in a term higher than any we have seen. */
			propTerm++;
			elog(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, quorum, propTerm);

			voteRequest = (VoteRequest)
			{
				.tag = 'v',
					.term = propTerm
			};
			memcpy(voteRequest.proposerId.data, greetRequest.proposerId.data, UUID_LEN);
		}
	}
	else if (sk->greetResponse.term > propTerm)
	{
		/* Another compute with higher term is running. */
		elog(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
			 sk->host, sk->port,
			 sk->greetResponse.term, propTerm);
	}

	/*
	 * Check if we have quorum. If there aren't enough safekeepers,
	 * wait and do nothing. We'll eventually get a task when the
	 * election starts.
	 *
	 * If we do have quorum, we can start an election.
	 */
	if (n_connected < quorum)
	{
		/*
		 * SS_VOTING is an idle state; read-ready indicates the
		 * connection closed.
		 */
		UpdateEventSet(sk, WL_SOCKET_READABLE);
	}
	else
	{
		/*
		 * Now send voting request to the cohort and wait
		 * responses
		 */
		for (int j = 0; j < n_safekeepers; j++)
		{
			/*
			 * Remember: SS_VOTING indicates that the safekeeper is
			 * participating in voting, but hasn't sent anything
			 * yet.
			 */
			if (safekeeper[j].state == SS_VOTING)
				SendVoteRequest(&safekeeper[j]);
		}
	}
}

/* Send our prepared vote request to one safekeeper. */
static void
SendVoteRequest(Safekeeper *sk)
{
	/* We have quorum for voting, send our vote request */
	elog(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, sk->host, sk->port, voteRequest.term);
	/* On failure, logging & resetting is handled */
	if (!BlockingWrite(sk, &voteRequest, sizeof(voteRequest), SS_WAIT_VERDICT))
		return;

	/* If successful, wait for read-ready with SS_WAIT_VERDICT */
}

/*
 * Receive a safekeeper's vote; once we gather a quorum of votes, the proposer
 * is elected and HandleElectedProposer takes over.
 */
static void
RecvVoteResponse(Safekeeper *sk)
{
	sk->voteResponse.apm.tag = 'v';
	if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->voteResponse))
		return;

	elog(LOG,
		 "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X",
		 sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory),
		 LSN_FORMAT_ARGS(sk->voteResponse.flushLsn),
		 LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn),
		 LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn));

	/*
	 * In case of acceptor rejecting our vote, bail out, but only
	 * if either it already lives in strictly higher term
	 * (concurrent compute spotted) or we are not elected yet and
	 * thus need the vote.
	 */
	if ((!sk->voteResponse.voteGiven) &&
		(sk->voteResponse.term > propTerm || n_votes < quorum))
	{
		elog(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
			 sk->host, sk->port,
			 sk->voteResponse.term, propTerm);
	}
	Assert(sk->voteResponse.term == propTerm);

	/* Handshake completed, do we have quorum?
 */
	n_votes++;
	if (n_votes < quorum)
	{
		sk->state = SS_IDLE;	/* can't do much yet, no quorum */
	}
	else if (n_votes > quorum)
	{
		/* recovery already performed, just start streaming */
		SendProposerElected(sk);
	}
	else
	{
		/* Exactly the quorum-th vote: this safekeeper completes the election. */
		sk->state = SS_IDLE;
		UpdateEventSet(sk, WL_SOCKET_READABLE); /* Idle states wait for
												 * read-ready */

		HandleElectedProposer();
	}
}

/*
 * Called once a majority of acceptors have voted for us and current proposer
 * has been elected.
 *
 * Sends ProposerElected message to all acceptors in SS_IDLE state and starts
 * replication from walsender.
 */
static void
HandleElectedProposer(void)
{
	DetermineEpochStartLsn();

	/*
	 * Check if not all safekeepers are up-to-date, we need to
	 * download WAL needed to synchronize them
	 */
	if (truncateLsn < propEpochStartLsn)
	{
		elog(LOG,
			 "start recovery because truncateLsn=%X/%X is not "
			 "equal to epochStartLsn=%X/%X",
			 LSN_FORMAT_ARGS(truncateLsn),
			 LSN_FORMAT_ARGS(propEpochStartLsn));
		/* Perform recovery */
		if (!WalProposerRecovery(donor, greetRequest.timeline, truncateLsn, propEpochStartLsn))
			elog(FATAL, "Failed to recover state");
	}
	else if (syncSafekeepers)
	{
		/* Sync is not needed: just exit, reporting the LSN on stdout */
		fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(propEpochStartLsn));
		exit(0);
	}

	for (int i = 0; i < n_safekeepers; i++)
	{
		if (safekeeper[i].state == SS_IDLE)
			SendProposerElected(&safekeeper[i]);
	}

	/*
	 * The proposer has been elected, and there will be no quorum waiting
	 * after this point. There will be no safekeeper with state SS_IDLE
	 * also, because that state is used only for quorum waiting.
	 */

	if (syncSafekeepers)
	{
		/*
		 * Send empty message to enforce receiving feedback
		 * even from nodes who are fully recovered; this is
		 * required to learn they switched epoch which finishes
		 * sync-safekeepers which don't generate any real new
		 * records. Will go away once we switch to async acks.
		 */
		BroadcastAppendRequest();

		/* keep polling until all safekeepers are synced */
		return;
	}

	WalProposerStartStreaming(propEpochStartLsn);
	/* Should not return here */
}

/* latest term in TermHistory, or 0 if there are no entries */
static term_t
GetHighestTerm(TermHistory *th)
{
	return th->n_entries > 0 ? th->entries[th->n_entries - 1].term : 0;
}

/* safekeeper's epoch is the term of the highest entry in the log */
static term_t
GetEpoch(Safekeeper *sk)
{
	return GetHighestTerm(&sk->voteResponse.termHistory);
}

/* If LSN points to the page header, skip it */
static XLogRecPtr
SkipXLogPageHeader(XLogRecPtr lsn)
{
	if (XLogSegmentOffset(lsn, wal_segment_size) == 0)
	{
		/* segment boundary: long page header */
		lsn += SizeOfXLogLongPHD;
	}
	else if (lsn % XLOG_BLCKSZ == 0)
	{
		/* ordinary page boundary: short page header */
		lsn += SizeOfXLogShortPHD;
	}
	return lsn;
}

/*
 * Called after majority of acceptors gave votes, it calculates the most
 * advanced safekeeper (who will be the donor) and epochStartLsn -- LSN since
 * which we'll write WAL in our term.
 *
 * Sets truncateLsn along the way (though it is not of much use at this point --
 * only for skipping recovery).
+ */ +static void +DetermineEpochStartLsn(void) +{ + TermHistory *dth; + + propEpochStartLsn = InvalidXLogRecPtr; + donorEpoch = 0; + truncateLsn = InvalidXLogRecPtr; + timelineStartLsn = InvalidXLogRecPtr; + + for (int i = 0; i < n_safekeepers; i++) + { + if (safekeeper[i].state == SS_IDLE) + { + if (GetEpoch(&safekeeper[i]) > donorEpoch || + (GetEpoch(&safekeeper[i]) == donorEpoch && + safekeeper[i].voteResponse.flushLsn > propEpochStartLsn)) + { + donorEpoch = GetEpoch(&safekeeper[i]); + propEpochStartLsn = safekeeper[i].voteResponse.flushLsn; + donor = i; + } + truncateLsn = Max(safekeeper[i].voteResponse.truncateLsn, truncateLsn); + + if (safekeeper[i].voteResponse.timelineStartLsn != InvalidXLogRecPtr) + { + /* timelineStartLsn should be the same everywhere or unknown */ + if (timelineStartLsn != InvalidXLogRecPtr && + timelineStartLsn != safekeeper[i].voteResponse.timelineStartLsn) + { + elog(WARNING, + "inconsistent timelineStartLsn: current %X/%X, received %X/%X", + LSN_FORMAT_ARGS(timelineStartLsn), + LSN_FORMAT_ARGS(safekeeper[i].voteResponse.timelineStartLsn)); + } + timelineStartLsn = safekeeper[i].voteResponse.timelineStartLsn; + } + } + } + + /* + * If propEpochStartLsn is 0 everywhere, we are bootstrapping -- nothing was + * committed yet. Start streaming then from the basebackup LSN. + */ + if (propEpochStartLsn == InvalidXLogRecPtr && !syncSafekeepers) + { + propEpochStartLsn = truncateLsn = GetRedoStartLsn(); + if (timelineStartLsn == InvalidXLogRecPtr) + { + timelineStartLsn = GetRedoStartLsn(); + } + elog(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(propEpochStartLsn)); + } + + /* + * If propEpochStartLsn is not 0, at least one msg with WAL was sent to + * some connected safekeeper; it must have carried truncateLsn pointing to + * the first record. 
+ */ + Assert((truncateLsn != InvalidXLogRecPtr) || + (syncSafekeepers && truncateLsn == propEpochStartLsn)); + + /* + * We will be generating WAL since propEpochStartLsn, so we should set + * availableLsn to mark this LSN as the latest available position. + */ + availableLsn = propEpochStartLsn; + + /* + * Proposer's term history is the donor's + its own entry. + */ + dth = &safekeeper[donor].voteResponse.termHistory; + propTermHistory.n_entries = dth->n_entries + 1; + propTermHistory.entries = palloc(sizeof(TermSwitchEntry) * propTermHistory.n_entries); + memcpy(propTermHistory.entries, dth->entries, sizeof(TermSwitchEntry) * dth->n_entries); + propTermHistory.entries[propTermHistory.n_entries - 1].term = propTerm; + propTermHistory.entries[propTermHistory.n_entries - 1].lsn = propEpochStartLsn; + + elog(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X", + quorum, + propTerm, + LSN_FORMAT_ARGS(propEpochStartLsn), + safekeeper[donor].host, safekeeper[donor].port, + LSN_FORMAT_ARGS(truncateLsn) + ); + + /* + * Ensure the basebackup we are running (at RedoStartLsn) matches LSN since + * which we are going to write according to the consensus. If not, we must + * bail out, as clog and other non rel data is inconsistent. + */ + if (!syncSafekeepers) + { + /* + * Basebackup LSN always points to the beginning of the record (not the + * page), as StartupXLOG most probably wants it this way. Safekeepers + * don't skip header as they need continious stream of data, so + * correct LSN for comparison. + */ + if (SkipXLogPageHeader(propEpochStartLsn) != GetRedoStartLsn()) + { + /* + * However, allow to proceed if previously elected leader was me; plain + * restart of walproposer not intervened by concurrent compute (who could + * generate WAL) is ok. 
+ */ + if (!((dth->n_entries >= 1) && (dth->entries[dth->n_entries - 1].term == + walprop_shared->mineLastElectedTerm))) + { + elog(PANIC, + "collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X", + LSN_FORMAT_ARGS(propEpochStartLsn), + LSN_FORMAT_ARGS(GetRedoStartLsn())); + } + } + walprop_shared->mineLastElectedTerm = propTerm; + } +} + +/* + * Receive WAL from most advanced safekeeper + */ +static bool +WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos) +{ + char conninfo[MAXCONNINFO]; + char *err; + WalReceiverConn *wrconn; + WalRcvStreamOptions options; + + sprintf(conninfo, "host=%s port=%s dbname=replication options='-c ztimelineid=%s ztenantid=%s'", + safekeeper[donor].host, safekeeper[donor].port, zenith_timeline_walproposer, zenith_tenant_walproposer); + wrconn = walrcv_connect(conninfo, false, "wal_proposer_recovery", &err); + if (!wrconn) + { + ereport(WARNING, + (errmsg("could not connect to WAL acceptor %s:%s: %s", + safekeeper[donor].host, safekeeper[donor].port, + err))); + return false; + } + elog(LOG, + "start recovery from %s:%s starting from %X/%08X till %X/%08X timeline " + "%d", + safekeeper[donor].host, safekeeper[donor].port, (uint32) (startpos >> 32), + (uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline); + + options.logical = false; + options.startpoint = startpos; + options.slotname = NULL; + options.proto.physical.startpointTLI = timeline; + + if (walrcv_startstreaming(wrconn, &options)) + { + XLogRecPtr rec_start_lsn; + XLogRecPtr rec_end_lsn = 0; + int len; + char *buf; + pgsocket wait_fd = PGINVALID_SOCKET; + + while ((len = walrcv_receive(wrconn, &buf, &wait_fd)) >= 0) + { + if (len == 0) + { + (void) WaitLatchOrSocket( + MyLatch, WL_EXIT_ON_PM_DEATH | WL_SOCKET_READABLE, wait_fd, + -1, WAIT_EVENT_WAL_RECEIVER_MAIN); + } + else + { + Assert(buf[0] == 'w' || buf[0] == 'k'); + if (buf[0] == 'k') + continue; /* keepalive */ + memcpy(&rec_start_lsn, 
&buf[XLOG_HDR_START_POS], + sizeof rec_start_lsn); + rec_start_lsn = pg_ntoh64(rec_start_lsn); + rec_end_lsn = rec_start_lsn + len - XLOG_HDR_SIZE; + + /* write WAL to disk */ + XLogWalPropWrite(&buf[XLOG_HDR_SIZE], len - XLOG_HDR_SIZE, rec_start_lsn); + + ereport(DEBUG1, + (errmsg("Recover message %X/%X length %d", + LSN_FORMAT_ARGS(rec_start_lsn), len))); + if (rec_end_lsn >= endpos) + break; + } + } + ereport(LOG, + (errmsg("end of replication stream at %X/%X: %m", + LSN_FORMAT_ARGS(rec_end_lsn)))); + walrcv_disconnect(wrconn); + + /* failed to receive all WAL till endpos */ + if (rec_end_lsn < endpos) + return false; + } + else + { + ereport(LOG, + (errmsg("primary server contains no more WAL on requested timeline %u LSN %X/%08X", + timeline, (uint32) (startpos >> 32), (uint32) startpos))); + return false; + } + + return true; +} + +/* + * Determine for sk the starting streaming point and send it message + * 1) Announcing we are elected proposer (which immediately advances epoch if + * safekeeper is synced, being important for sync-safekeepers) + * 2) Communicating starting streaming point -- safekeeper must truncate its WAL + * beyond it -- and history of term switching. + * + * Sets sk->startStreamingAt. + */ +static void +SendProposerElected(Safekeeper *sk) +{ + ProposerElected msg; + TermHistory *th; + term_t lastCommonTerm; + int i; + + /* + * Determine start LSN by comparing safekeeper's log term switch history and + * proposer's, searching for the divergence point. + * + * Note: there is a vanishingly small chance of no common point even if + * there is some WAL on safekeeper, if immediately after bootstrap compute + * wrote some WAL on single sk and died; we stream since the beginning then. + */ + th = &sk->voteResponse.termHistory; + /* + * If any WAL is present on the sk, it must be authorized by some term. + * OTOH, without any WAL there are no term swiches in the log. 
 */
	Assert((th->n_entries == 0) ==
		   (sk->voteResponse.flushLsn == InvalidXLogRecPtr));
	/* We must start somewhere. */
	Assert(propTermHistory.n_entries >= 1);

	/* Walk both histories in lockstep until the terms diverge. */
	for (i = 0; i < Min(propTermHistory.n_entries, th->n_entries); i++)
	{
		if (propTermHistory.entries[i].term != th->entries[i].term)
			break;
		/* term must begin everywhere at the same point */
		Assert(propTermHistory.entries[i].lsn == th->entries[i].lsn);
	}
	i--;						/* step back to the last common term */
	if (i < 0)
	{
		/* safekeeper is empty or no common point, start from the beginning */
		sk->startStreamingAt = propTermHistory.entries[0].lsn;

		if (sk->startStreamingAt < truncateLsn)
		{
			/*
			 * There's a gap between the WAL starting point and a truncateLsn,
			 * which can't appear in a normal working cluster. That gap means
			 * that all safekeepers reported that they have persisted WAL up
			 * to the truncateLsn before, but now current safekeeper tells
			 * otherwise.
			 *
			 * Also we have a special condition here, which is empty safekeeper
			 * with no history. In combination with a gap, that can happen when
			 * we introduce a new safekeeper to the cluster. This is a rare case,
			 * which is triggered manually for now, and should be treated with
			 * care.
			 */

			/*
			 * truncateLsn will not change without ack from current safekeeper,
			 * and it's aligned to the WAL record, so we can safely start
			 * streaming from this point.
			 */
			sk->startStreamingAt = truncateLsn;

			elog(WARNING, "empty safekeeper joined cluster as %s:%s, historyStart=%X/%X, sk->startStreamingAt=%X/%X",
				 sk->host, sk->port, LSN_FORMAT_ARGS(propTermHistory.entries[0].lsn),
				 LSN_FORMAT_ARGS(sk->startStreamingAt));
		}
	}
	else
	{
		/*
		 * End of (common) term is the start of the next except it is the last
		 * one; there it is flush_lsn in case of safekeeper or, in case of
		 * proposer, LSN it is currently writing, but then we just pick
		 * safekeeper pos as it obviously can't be higher.
		 */
		if (propTermHistory.entries[i].term == propTerm)
		{
			sk->startStreamingAt = sk->voteResponse.flushLsn;
		}
		else
		{
			XLogRecPtr propEndLsn = propTermHistory.entries[i + 1].lsn;
			XLogRecPtr skEndLsn = (i + 1 < th->n_entries ? th->entries[i + 1].lsn :
								   sk->voteResponse.flushLsn);
			sk->startStreamingAt = Min(propEndLsn, skEndLsn);
		}
	}

	Assert(sk->startStreamingAt >= truncateLsn && sk->startStreamingAt <= availableLsn);

	msg.tag = 'e';
	msg.term = propTerm;
	msg.startStreamingAt = sk->startStreamingAt;
	msg.termHistory = &propTermHistory;
	msg.timelineStartLsn = timelineStartLsn;

	lastCommonTerm = i >= 0 ? propTermHistory.entries[i].term : 0;
	elog(LOG,
		 "sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X",
		 sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn));

	/* Serialize the message in little-endian wire format. */
	resetStringInfo(&sk->outbuf);
	pq_sendint64_le(&sk->outbuf, msg.tag);
	pq_sendint64_le(&sk->outbuf, msg.term);
	pq_sendint64_le(&sk->outbuf, msg.startStreamingAt);
	pq_sendint32_le(&sk->outbuf, msg.termHistory->n_entries);
	for (int i = 0; i < msg.termHistory->n_entries; i++)
	{
		pq_sendint64_le(&sk->outbuf, msg.termHistory->entries[i].term);
		pq_sendint64_le(&sk->outbuf, msg.termHistory->entries[i].lsn);
	}
	pq_sendint64_le(&sk->outbuf, msg.timelineStartLsn);

	if (!AsyncWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_SEND_ELECTED_FLUSH))
		return;

	StartStreaming(sk);
}

/*
 * Start walsender streaming replication
 */
static void
WalProposerStartStreaming(XLogRecPtr startpos)
{
	StartReplicationCmd cmd;

	elog(LOG, "WAL proposer starts streaming at %X/%X",
		 LSN_FORMAT_ARGS(startpos));
	cmd.slotname = WAL_PROPOSER_SLOT_NAME;
	cmd.timeline = greetRequest.timeline;
	cmd.startpoint = startpos;
	StartReplication(&cmd);
}

/*
 * Start streaming to safekeeper sk, always updates state to SS_ACTIVE and sets
 * correct event set.
 */
static void
StartStreaming(Safekeeper *sk)
{
	/*
	 * This is the only entrypoint to state SS_ACTIVE. It's executed
	 * exactly once for a connection.
	 */
	sk->state = SS_ACTIVE;
	sk->streamingAt = sk->startStreamingAt;

	/* event set will be updated inside SendMessageToNode */
	SendMessageToNode(sk);
}

/*
 * Try to send message to the particular node. Always updates event set. Will
 * send at least one message, if socket is ready.
 *
 * Can be used only for safekeepers in SS_ACTIVE state. State can be changed
 * in case of errors.
 */
static void
SendMessageToNode(Safekeeper *sk)
{
	Assert(sk->state == SS_ACTIVE);

	/* Note: we always send everything to the safekeeper until WOULDBLOCK or nothing left to send */
	HandleActiveState(sk, WL_SOCKET_WRITEABLE);
}

/*
 * Broadcast new message to all caught-up safekeepers
 */
static void
BroadcastAppendRequest()
{
	for (int i = 0; i < n_safekeepers; i++)
		if (safekeeper[i].state == SS_ACTIVE)
			SendMessageToNode(&safekeeper[i]);
}

/*
 * Fill an AppendRequest header for the WAL chunk [beginLsn, endLsn),
 * attaching the current commit and truncate positions.
 */
static void
PrepareAppendRequest(AppendRequestHeader *req, XLogRecPtr beginLsn, XLogRecPtr endLsn)
{
	Assert(endLsn >= beginLsn);
	req->tag = 'a';
	req->term = propTerm;
	req->epochStartLsn = propEpochStartLsn;
	req->beginLsn = beginLsn;
	req->endLsn = endLsn;
	req->commitLsn = GetAcknowledgedByQuorumWALPosition();
	req->truncateLsn = truncateLsn;
	req->proposerId = greetRequest.proposerId;
}

/*
 * Process all events happened in SS_ACTIVE state, update event set after that.
 */
static void
HandleActiveState(Safekeeper *sk, uint32 events)
{
	uint32		newEvents = WL_SOCKET_READABLE;

	if (events & WL_SOCKET_WRITEABLE)
		if (!SendAppendRequests(sk))
			return;

	if (events & WL_SOCKET_READABLE)
		if (!RecvAppendResponses(sk))
			return;

	/*
	 * We should wait for WL_SOCKET_WRITEABLE event if we have unflushed data
	 * in the buffer.
	 *
	 * LSN comparison checks if we have pending unsent messages. This check isn't
	 * necessary now, because we always send append messages immediately after
	 * arrival. But it's good to have it here in case we change this behavior
	 * in the future.
	 */
	if (sk->streamingAt != availableLsn || sk->flushWrite)
		newEvents |= WL_SOCKET_WRITEABLE;

	UpdateEventSet(sk, newEvents);
}

/*
 * Send WAL messages starting from sk->streamingAt until the end or non-writable
 * socket, whichever comes first. Caller should take care of updating event set.
 * Even if no unsent WAL is available, at least one empty message will be sent
 * as a heartbeat, if socket is ready.
 *
 * Can change state if Async* functions encounter errors and reset connection.
 * Returns false in this case, true otherwise.
 */
static bool
SendAppendRequests(Safekeeper *sk)
{
	XLogRecPtr	endLsn;
	AppendRequestHeader *req;
	PGAsyncWriteResult writeResult;
	WALReadError errinfo;
	bool		sentAnything = false;
	/* FIXME Is it ok to use hardcoded value here? */
	TimeLineID	tli = 1;

	if (sk->flushWrite)
	{
		if (!AsyncFlush(sk))
			/*
			 * AsyncFlush failed, that could happen if the socket is closed or
			 * we have nothing to write and should wait for writeable socket.
			 */
			return sk->state == SS_ACTIVE;

		/* Event set will be updated in the end of HandleActiveState */
		sk->flushWrite = false;
	}

	while (sk->streamingAt != availableLsn || !sentAnything)
	{
		sentAnything = true;

		endLsn = sk->streamingAt;
		endLsn += MAX_SEND_SIZE;

		/* if we went beyond available WAL, back off */
		if (endLsn > availableLsn) {
			endLsn = availableLsn;
		}

		req = &sk->appendRequest;
		PrepareAppendRequest(&sk->appendRequest, sk->streamingAt, endLsn);

		ereport(DEBUG2,
				(errmsg("sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s",
						req->endLsn - req->beginLsn,
						LSN_FORMAT_ARGS(req->beginLsn),
						LSN_FORMAT_ARGS(req->endLsn),
						LSN_FORMAT_ARGS(req->commitLsn),
						LSN_FORMAT_ARGS(truncateLsn), sk->host, sk->port)));

		resetStringInfo(&sk->outbuf);

		/* write AppendRequest header */
		appendBinaryStringInfo(&sk->outbuf, (char*) req, sizeof(AppendRequestHeader));

		/* write the WAL itself */
		enlargeStringInfo(&sk->outbuf, req->endLsn - req->beginLsn);
		if (!WALRead(sk->xlogreader,
					 &sk->outbuf.data[sk->outbuf.len],
					 req->beginLsn,
					 req->endLsn - req->beginLsn,
					 tli,
					 &errinfo))
		{
			WALReadRaiseError(&errinfo);
		}
		sk->outbuf.len += req->endLsn - req->beginLsn;

		writeResult = walprop_async_write(sk->conn, sk->outbuf.data, sk->outbuf.len);

		/* Mark current message as sent, whatever the result is */
		sk->streamingAt = endLsn;

		switch (writeResult)
		{
			case PG_ASYNC_WRITE_SUCCESS:
				/* Continue writing the next message */
				break;

			case PG_ASYNC_WRITE_TRY_FLUSH:

				/*
				 * We still need to call PQflush some more to finish the job.
				 * Caller function will handle this by setting right event set.
				 */
				sk->flushWrite = true;
				return true;

			case PG_ASYNC_WRITE_FAIL:
				elog(WARNING, "Failed to send to node %s:%s in %s state: %s",
					 sk->host, sk->port, FormatSafekeeperState(sk->state),
					 walprop_error_message(sk->conn));
				ShutdownConnection(sk);
				return false;
			default:
				Assert(false);
				return false;
		}
	}

	return true;
}

/*
 * Receive and process all available feedback.
 *
 * Can change state if Async* functions encounter errors and reset connection.
 * Returns false in this case, true otherwise.
 *
 * NB: This function can call SendMessageToNode and produce new messages.
 */
static bool
RecvAppendResponses(Safekeeper *sk)
{
	XLogRecPtr	minQuorumLsn;
	bool		readAnything = false;

	while (true)
	{
		/*
		 * If our reading doesn't immediately succeed, any
		 * necessary error handling or state setting is taken care
		 * of. We can leave any other work until later.
		 */
		sk->appendResponse.apm.tag = 'a';
		if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->appendResponse))
			break;

		ereport(DEBUG2,
				(errmsg("received message term=" INT64_FORMAT " flushLsn=%X/%X commitLsn=%X/%X from %s:%s",
						sk->appendResponse.term,
						LSN_FORMAT_ARGS(sk->appendResponse.flushLsn),
						LSN_FORMAT_ARGS(sk->appendResponse.commitLsn),
						sk->host, sk->port)));

		if (sk->appendResponse.term > propTerm)
		{
			/* Another compute with higher term is running. */
			elog(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "",
				 sk->host, sk->port,
				 sk->appendResponse.term, propTerm);
		}

		readAnything = true;
	}

	if (!readAnything)
		return sk->state == SS_ACTIVE;

	HandleSafekeeperResponse();

	/*
	 * Also send the new commit lsn to all the safekeepers.
+ */ + minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); + if (minQuorumLsn > lastSentCommitLsn) + { + BroadcastAppendRequest(); + lastSentCommitLsn = minQuorumLsn; + } + + return sk->state == SS_ACTIVE; +} + +/* Parse a ReplicationFeedback message, or the ReplicationFeedback part of an AppendResponse */ +void +ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback *rf) +{ + uint8 nkeys; + int i; + int32 len; + + /* get number of custom keys */ + nkeys = pq_getmsgbyte(reply_message); + + for (i = 0; i < nkeys; i++) + { + const char *key = pq_getmsgstring(reply_message); + if (strcmp(key, "current_timeline_size") == 0) + { + pq_getmsgint(reply_message, sizeof(int32)); // read value length + rf->currentClusterSize = pq_getmsgint64(reply_message); + elog(DEBUG2, "ParseReplicationFeedbackMessage: current_timeline_size %lu", + rf->currentClusterSize); + } + else if (strcmp(key, "ps_writelsn") == 0) + { + pq_getmsgint(reply_message, sizeof(int32)); // read value length + rf->ps_writelsn = pq_getmsgint64(reply_message); + elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_writelsn %X/%X", + LSN_FORMAT_ARGS(rf->ps_writelsn)); + } + else if (strcmp(key, "ps_flushlsn") == 0) + { + pq_getmsgint(reply_message, sizeof(int32)); // read value length + rf->ps_flushlsn = pq_getmsgint64(reply_message); + elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_flushlsn %X/%X", + LSN_FORMAT_ARGS(rf->ps_flushlsn)); + } + else if (strcmp(key, "ps_applylsn") == 0) + { + pq_getmsgint(reply_message, sizeof(int32)); // read value length + rf->ps_applylsn = pq_getmsgint64(reply_message); + elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_applylsn %X/%X", + LSN_FORMAT_ARGS(rf->ps_applylsn)); + } + else if (strcmp(key, "ps_replytime") == 0) + { + pq_getmsgint(reply_message, sizeof(int32)); // read value length + rf->ps_replytime = pq_getmsgint64(reply_message); + { + char *replyTimeStr; + + /* Copy because timestamptz_to_str returns a static buffer */ + replyTimeStr = 
pstrdup(timestamptz_to_str(rf->ps_replytime)); + elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_replytime %lu reply_time: %s", + rf->ps_replytime, replyTimeStr); + + pfree(replyTimeStr); + } + } + else + { + len = pq_getmsgint(reply_message, sizeof(int32)); // read value length + // Skip unknown keys to support backward compatibile protocol changes + elog(LOG, "ParseReplicationFeedbackMessage: unknown key: %s len %d", key, len); + pq_getmsgbytes(reply_message, len); + }; + } +} + +/* + * Combine hot standby feedbacks from all safekeepers. + */ +static void +CombineHotStanbyFeedbacks(HotStandbyFeedback * hs) +{ + hs->ts = 0; + hs->xmin.value = ~0; /* largest unsigned value */ + hs->catalog_xmin.value = ~0; /* largest unsigned value */ + + for (int i = 0; i < n_safekeepers; i++) + { + if (safekeeper[i].appendResponse.hs.ts != 0) + { + if (FullTransactionIdPrecedes(safekeeper[i].appendResponse.hs.xmin, hs->xmin)) + { + hs->xmin = safekeeper[i].appendResponse.hs.xmin; + hs->ts = safekeeper[i].appendResponse.hs.ts; + } + if (FullTransactionIdPrecedes(safekeeper[i].appendResponse.hs.catalog_xmin, hs->catalog_xmin)) + { + hs->catalog_xmin = safekeeper[i].appendResponse.hs.catalog_xmin; + hs->ts = safekeeper[i].appendResponse.hs.ts; + } + } + } +} + + +/* + * Get minimum of flushed LSNs of all safekeepers, which is the LSN of the + * last WAL record that can be safely discarded. 
+ */ +static XLogRecPtr +CalculateMinFlushLsn(void) +{ + XLogRecPtr lsn = UnknownXLogRecPtr; + for (int i = 0; i < n_safekeepers; i++) + { + if (safekeeper[i].appendResponse.flushLsn < lsn) + lsn = safekeeper[i].appendResponse.flushLsn; + } + return lsn; +} + +/* + * Calculate WAL position acknowledged by quorum + */ +static XLogRecPtr +GetAcknowledgedByQuorumWALPosition(void) +{ + XLogRecPtr responses[MAX_SAFEKEEPERS]; + + /* + * Sort acknowledged LSNs + */ + for (int i = 0; i < n_safekeepers; i++) + { + /* + * Like in Raft, we aren't allowed to commit entries from previous + * terms, so ignore reported LSN until it gets to epochStartLsn. + */ + responses[i] = safekeeper[i].appendResponse.flushLsn >= propEpochStartLsn ? + safekeeper[i].appendResponse.flushLsn : 0; + } + qsort(responses, n_safekeepers, sizeof(XLogRecPtr), CompareLsn); + + /* + * Get the smallest LSN committed by quorum + */ + return responses[n_safekeepers - quorum]; +} + +/* + * ReplicationFeedbackShmemSize --- report amount of shared memory space needed + */ +Size +WalproposerShmemSize(void) +{ + return sizeof(WalproposerShmemState); +} + +bool +WalproposerShmemInit(void) +{ + bool found; + + LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); + walprop_shared = ShmemInitStruct("Walproposer shared state", + sizeof(WalproposerShmemState), + &found); + + if (!found) + { + memset(walprop_shared, 0, WalproposerShmemSize()); + SpinLockInit(&walprop_shared->mutex); + } + LWLockRelease(AddinShmemInitLock); + + return found; +} + +void +replication_feedback_set(ReplicationFeedback *rf) +{ + SpinLockAcquire(&walprop_shared->mutex); + memcpy(&walprop_shared->feedback, rf, sizeof(ReplicationFeedback)); + SpinLockRelease(&walprop_shared->mutex); +} + + +void +replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn) +{ + SpinLockAcquire(&walprop_shared->mutex); + *writeLsn = walprop_shared->feedback.ps_writelsn; + *flushLsn = walprop_shared->feedback.ps_flushlsn; + 
*applyLsn = walprop_shared->feedback.ps_applylsn; + SpinLockRelease(&walprop_shared->mutex); +} + + +/* + * Get ReplicationFeedback fields from the most advanced safekeeper + */ +static void +GetLatestZentihFeedback(ReplicationFeedback *rf) +{ + int latest_safekeeper = 0; + XLogRecPtr ps_writelsn = InvalidXLogRecPtr; + for (int i = 0; i < n_safekeepers; i++) + { + if (safekeeper[i].appendResponse.rf.ps_writelsn > ps_writelsn) + { + latest_safekeeper = i; + ps_writelsn = safekeeper[i].appendResponse.rf.ps_writelsn; + } + } + + rf->currentClusterSize = safekeeper[latest_safekeeper].appendResponse.rf.currentClusterSize; + rf->ps_writelsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_writelsn; + rf->ps_flushlsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_flushlsn; + rf->ps_applylsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_applylsn; + rf->ps_replytime = safekeeper[latest_safekeeper].appendResponse.rf.ps_replytime; + + elog(DEBUG2, "GetLatestZentihFeedback: currentClusterSize %lu," + " ps_writelsn %X/%X, ps_flushlsn %X/%X, ps_applylsn %X/%X, ps_replytime %lu", + rf->currentClusterSize, + LSN_FORMAT_ARGS(rf->ps_writelsn), + LSN_FORMAT_ARGS(rf->ps_flushlsn), + LSN_FORMAT_ARGS(rf->ps_applylsn), + rf->ps_replytime); + + replication_feedback_set(rf); +} + +static void +HandleSafekeeperResponse(void) +{ + HotStandbyFeedback hsFeedback; + XLogRecPtr minQuorumLsn; + XLogRecPtr diskConsistentLsn; + XLogRecPtr minFlushLsn; + + + minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); + diskConsistentLsn = quorumFeedback.rf.ps_flushlsn; + + if (!syncSafekeepers) + { + // Get ReplicationFeedback fields from the most advanced safekeeper + GetLatestZentihFeedback(&quorumFeedback.rf); + SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize); + } + + if (minQuorumLsn > quorumFeedback.flushLsn || diskConsistentLsn != quorumFeedback.rf.ps_flushlsn) + { + + if (minQuorumLsn > quorumFeedback.flushLsn) + quorumFeedback.flushLsn = minQuorumLsn; + + /* 
advance the replication slot */ + if (!syncSafekeepers) + ProcessStandbyReply( + // write_lsn - This is what durably stored in WAL service. + quorumFeedback.flushLsn, + //flush_lsn - This is what durably stored in WAL service. + quorumFeedback.flushLsn, + //apply_lsn - This is what processed and durably saved at pageserver. + quorumFeedback.rf.ps_flushlsn, + GetCurrentTimestamp(), false); + } + + CombineHotStanbyFeedbacks(&hsFeedback); + if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &quorumFeedback.hs, sizeof hsFeedback) != 0) + { + quorumFeedback.hs = hsFeedback; + if (!syncSafekeepers) + ProcessStandbyHSFeedback(hsFeedback.ts, + XidFromFullTransactionId(hsFeedback.xmin), + EpochFromFullTransactionId(hsFeedback.xmin), + XidFromFullTransactionId(hsFeedback.catalog_xmin), + EpochFromFullTransactionId(hsFeedback.catalog_xmin)); + } + + /* + * Try to advance truncateLsn to minFlushLsn, which is the last record + * flushed to all safekeepers. We must always start streaming from the + * beginning of the record, which simplifies decoding on the far end. + * + * Advanced truncateLsn should be not further than nearest commitLsn. + * This prevents surprising violation of truncateLsn <= commitLsn + * invariant which might occur because 1) truncateLsn can be advanced + * immediately once chunk is broadcast to all safekeepers, and + * commitLsn generally can't be advanced based on feedback from + * safekeeper who is still in the previous epoch (similar to 'leader + * can't commit entries from previous term' in Raft); 2) chunks we + * read from WAL and send are plain sheets of bytes, but safekeepers + * ack only on record boundaries. + */ + minFlushLsn = CalculateMinFlushLsn(); + if (minFlushLsn > truncateLsn) + { + truncateLsn = minFlushLsn; + + /* + * Advance the replication slot to free up old WAL files. Note + * that slot doesn't exist if we are in syncSafekeepers mode. 
+ */ + if (MyReplicationSlot) + PhysicalConfirmReceivedLocation(truncateLsn); + } + + /* + * Generally sync is done when majority switched the epoch so we committed + * epochStartLsn and made the majority aware of it, ensuring they are + * ready to give all WAL to pageserver. It would mean whichever majority + * is alive, there will be at least one safekeeper who is able to stream + * WAL to pageserver to make basebackup possible. However, since at the + * moment we don't have any good mechanism of defining the healthy and + * most advanced safekeeper who should push the wal into pageserver and + * basically the random one gets connected, to prevent hanging basebackup + * (due to pageserver connecting to not-synced-safekeeper) we currently + * wait for all seemingly alive safekeepers to get synced. + */ + if (syncSafekeepers) + { + int n_synced; + + n_synced = 0; + for (int i = 0; i < n_safekeepers; i++) + { + Safekeeper *sk = &safekeeper[i]; + bool synced = sk->appendResponse.commitLsn >= propEpochStartLsn; + + /* alive safekeeper which is not synced yet; wait for it */ + if (sk->state != SS_OFFLINE && !synced) + return; + if (synced) + n_synced++; + } + if (n_synced >= quorum) + { + /* All safekeepers synced! */ + fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(propEpochStartLsn)); + exit(0); + } + } +} + +/* + * Try to read CopyData message from i'th safekeeper, resetting connection on + * failure. 
+ */ +static bool +AsyncRead(Safekeeper *sk, char **buf, int *buf_size) +{ + switch (walprop_async_read(sk->conn, buf, buf_size)) + { + case PG_ASYNC_READ_SUCCESS: + return true; + + case PG_ASYNC_READ_TRY_AGAIN: + /* WL_SOCKET_READABLE is always set during copyboth */ + return false; + + case PG_ASYNC_READ_FAIL: + elog(WARNING, "Failed to read from node %s:%s in %s state: %s", sk->host, + sk->port, FormatSafekeeperState(sk->state), + walprop_error_message(sk->conn)); + ShutdownConnection(sk); + return false; + } + Assert(false); + return false; +} + +/* + * Read next message with known type into provided struct, by reading a CopyData + * block from the safekeeper's postgres connection, returning whether the read + * was successful. + * + * If the read needs more polling, we return 'false' and keep the state + * unmodified, waiting until it becomes read-ready to try again. If it fully + * failed, a warning is emitted and the connection is reset. + */ +static bool +AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg) +{ + char *buf; + int buf_size; + uint64 tag; + StringInfoData s; + + if (!(AsyncRead(sk, &buf, &buf_size))) + return false; + + /* parse it */ + s.data = buf; + s.len = buf_size; + s.cursor = 0; + + tag = pq_getmsgint64_le(&s); + if (tag != anymsg->tag) + { + elog(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host, + sk->port, FormatSafekeeperState(sk->state)); + ResetConnection(sk); + return false; + } + + switch (tag) + { + case 'g': + { + AcceptorGreeting *msg = (AcceptorGreeting *) anymsg; + msg->term = pq_getmsgint64_le(&s); + msg->nodeId = pq_getmsgint64_le(&s); + pq_getmsgend(&s); + return true; + } + + case 'v': + { + VoteResponse *msg = (VoteResponse *) anymsg; + + msg->term = pq_getmsgint64_le(&s); + msg->voteGiven = pq_getmsgint64_le(&s); + msg->flushLsn = pq_getmsgint64_le(&s); + msg->truncateLsn = pq_getmsgint64_le(&s); + msg->termHistory.n_entries = pq_getmsgint32_le(&s); + 
msg->termHistory.entries = palloc(sizeof(TermSwitchEntry) * msg->termHistory.n_entries); + for (int i = 0; i < msg->termHistory.n_entries; i++) + { + msg->termHistory.entries[i].term = pq_getmsgint64_le(&s); + msg->termHistory.entries[i].lsn = pq_getmsgint64_le(&s); + } + msg->timelineStartLsn = pq_getmsgint64_le(&s); + pq_getmsgend(&s); + return true; + } + + case 'a': + { + AppendResponse *msg = (AppendResponse *) anymsg; + msg->term = pq_getmsgint64_le(&s); + msg->flushLsn = pq_getmsgint64_le(&s); + msg->commitLsn = pq_getmsgint64_le(&s); + msg->hs.ts = pq_getmsgint64_le(&s); + msg->hs.xmin.value = pq_getmsgint64_le(&s); + msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s); + if (buf_size > APPENDRESPONSE_FIXEDPART_SIZE) + ParseReplicationFeedbackMessage(&s, &msg->rf); + pq_getmsgend(&s); + return true; + } + + default: + { + Assert(false); + return false; + } + } +} + +/* + * Blocking equivalent to AsyncWrite. + * + * We use this everywhere messages are small enough that they should fit in a + * single packet. + */ +static bool +BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState success_state) +{ + uint32 events; + + if (!walprop_blocking_write(sk->conn, msg, msg_size)) + { + elog(WARNING, "Failed to send to node %s:%s in %s state: %s", + sk->host, sk->port, FormatSafekeeperState(sk->state), + walprop_error_message(sk->conn)); + ShutdownConnection(sk); + return false; + } + + sk->state = success_state; + + /* + * If the new state will be waiting for events to happen, update the event + * set to wait for those + */ + events = SafekeeperStateDesiredEvents(success_state); + if (events) + UpdateEventSet(sk, events); + + return true; +} + +/* + * Starts a write into the 'i'th safekeeper's postgres connection, moving to + * flush_state (adjusting eventset) if write still needs flushing. + * + * Returns false if sending is unfinished (requires flushing or conn failed). + * Upon failure, a warning is emitted and the connection is reset. 
+ */ +static bool +AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_state) +{ + switch (walprop_async_write(sk->conn, msg, msg_size)) + { + case PG_ASYNC_WRITE_SUCCESS: + return true; + case PG_ASYNC_WRITE_TRY_FLUSH: + + /* + * We still need to call PQflush some more to finish the job; go + * to the appropriate state. Update the event set at the bottom of + * this function + */ + sk->state = flush_state; + UpdateEventSet(sk, WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE); + return false; + case PG_ASYNC_WRITE_FAIL: + elog(WARNING, "Failed to send to node %s:%s in %s state: %s", + sk->host, sk->port, FormatSafekeeperState(sk->state), + walprop_error_message(sk->conn)); + ShutdownConnection(sk); + return false; + default: + Assert(false); + return false; + } +} + +/* + * Flushes a previous call to AsyncWrite. This only needs to be called when the + * socket becomes read or write ready *after* calling AsyncWrite. + * + * If flushing successfully completes returns true, otherwise false. Event set + * is updated only if connection fails, otherwise caller should manually unset + * WL_SOCKET_WRITEABLE. 
+ */ +static bool +AsyncFlush(Safekeeper *sk) +{ + /*--- + * PQflush returns: + * 0 if successful [we're good to move on] + * 1 if unable to send everything yet [call PQflush again] + * -1 if it failed [emit an error] + */ + switch (walprop_flush(sk->conn)) + { + case 0: + /* flush is done */ + return true; + case 1: + /* Nothing to do; try again when the socket's ready */ + return false; + case -1: + elog(WARNING, "Failed to flush write to node %s:%s in %s state: %s", + sk->host, sk->port, FormatSafekeeperState(sk->state), + walprop_error_message(sk->conn)); + ResetConnection(sk); + return false; + default: + Assert(false); + return false; + } +} diff --git a/src/backend/replication/walproposer_utils.c b/src/backend/replication/walproposer_utils.c new file mode 100644 index 00000000000..3ad21feff10 --- /dev/null +++ b/src/backend/replication/walproposer_utils.c @@ -0,0 +1,404 @@ +#include "postgres.h" + +#include "replication/walproposer.h" +#include "libpq/pqformat.h" +#include "common/logging.h" +#include "common/ip.h" +#include "../interfaces/libpq/libpq-fe.h" +#include +#include + +/* + * These variables are used similarly to openLogFile/SegNo, + * but for walproposer to write the XLOG during recovery. walpropFileTLI is the TimeLineID + * corresponding the filename of walpropFile. + */ +static int walpropFile = -1; +static TimeLineID walpropFileTLI = 0; +static XLogSegNo walpropSegNo = 0; + +int +CompareLsn(const void *a, const void *b) +{ + XLogRecPtr lsn1 = *((const XLogRecPtr *) a); + XLogRecPtr lsn2 = *((const XLogRecPtr *) b); + + if (lsn1 < lsn2) + return -1; + else if (lsn1 == lsn2) + return 0; + else + return 1; +} + +/* Returns a human-readable string corresonding to the SafekeeperState + * + * The string should not be freed. 
+ * + * The strings are intended to be used as a prefix to "state", e.g.: + * + * elog(LOG, "currently in %s state", FormatSafekeeperState(sk->state)); + * + * If this sort of phrasing doesn't fit the message, instead use something like: + * + * elog(LOG, "currently in state [%s]", FormatSafekeeperState(sk->state)); + */ +char* +FormatSafekeeperState(SafekeeperState state) +{ + char* return_val = NULL; + + switch (state) + { + case SS_OFFLINE: + return_val = "offline"; + break; + case SS_CONNECTING_READ: + case SS_CONNECTING_WRITE: + return_val = "connecting"; + break; + case SS_WAIT_EXEC_RESULT: + return_val = "receiving query result"; + break; + case SS_HANDSHAKE_RECV: + return_val = "handshake (receiving)"; + break; + case SS_VOTING: + return_val = "voting"; + break; + case SS_WAIT_VERDICT: + return_val = "wait-for-verdict"; + break; + case SS_SEND_ELECTED_FLUSH: + return_val = "send-announcement-flush"; + break; + case SS_IDLE: + return_val = "idle"; + break; + case SS_ACTIVE: + return_val = "active"; + break; + } + + Assert(return_val != NULL); + + return return_val; +} + +/* Asserts that the provided events are expected for given safekeeper's state */ +void +AssertEventsOkForState(uint32 events, Safekeeper* sk) +{ + uint32 expected = SafekeeperStateDesiredEvents(sk->state); + + /* The events are in-line with what we're expecting, under two conditions: + * (a) if we aren't expecting anything, `events` has no read- or + * write-ready component. + * (b) if we are expecting something, there's overlap + * (i.e. `events & expected != 0`) + */ + bool events_ok_for_state; /* long name so the `Assert` is more clear later */ + + if (expected == WL_NO_EVENTS) + events_ok_for_state = ((events & (WL_SOCKET_READABLE|WL_SOCKET_WRITEABLE)) == 0); + else + events_ok_for_state = ((events & expected) != 0); + + if (!events_ok_for_state) + { + /* To give a descriptive message in the case of failure, we use elog and + * then an assertion that's guaranteed to fail. 
*/ + elog(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]", + FormatEvents(events), sk->host, sk->port, FormatSafekeeperState(sk->state)); + Assert(events_ok_for_state); + } +} + +/* Returns the set of events a safekeeper in this state should be waiting on + * + * This will return WL_NO_EVENTS (= 0) for some events. */ +uint32 +SafekeeperStateDesiredEvents(SafekeeperState state) +{ + uint32 result = WL_NO_EVENTS; + + /* If the state doesn't have a modifier, we can check the base state */ + switch (state) + { + /* Connecting states say what they want in the name */ + case SS_CONNECTING_READ: + result = WL_SOCKET_READABLE; + break; + case SS_CONNECTING_WRITE: + result = WL_SOCKET_WRITEABLE; + break; + + /* Reading states need the socket to be read-ready to continue */ + case SS_WAIT_EXEC_RESULT: + case SS_HANDSHAKE_RECV: + case SS_WAIT_VERDICT: + result = WL_SOCKET_READABLE; + break; + + /* Idle states use read-readiness as a sign that the connection has been + * disconnected. */ + case SS_VOTING: + case SS_IDLE: + result = WL_SOCKET_READABLE; + break; + + /* + * Flush states require write-ready for flushing. + * Active state does both reading and writing. + * + * TODO: SS_ACTIVE sometimes doesn't need to be write-ready. We should + * check sk->flushWrite here to set WL_SOCKET_WRITEABLE. + */ + case SS_SEND_ELECTED_FLUSH: + case SS_ACTIVE: + result = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE; + break; + + /* The offline state expects no events. */ + case SS_OFFLINE: + result = WL_NO_EVENTS; + break; + + default: + Assert(false); + break; + } + + return result; +} + +/* Returns a human-readable string corresponding to the event set + * + * If the events do not correspond to something set as the `events` field of a `WaitEvent`, the + * returned string may be meaingless. + * + * The string should not be freed. It should also not be expected to remain the same between + * function calls. 
*/
+char*
+FormatEvents(uint32 events)
+{
+	static char return_str[9];
+
+	/* Helper variable to check if there's extra bits */
+	uint32 all_flags = WL_LATCH_SET
+		| WL_SOCKET_READABLE
+		| WL_SOCKET_WRITEABLE
+		| WL_TIMEOUT
+		| WL_POSTMASTER_DEATH
+		| WL_EXIT_ON_PM_DEATH
+		| WL_SOCKET_CONNECTED;
+
+	/* The formatting here isn't supposed to be *particularly* useful -- it's just to give a
+	 * sense of what events have been triggered without needing to remember your powers of two. */
+
+	return_str[0] = (events & WL_LATCH_SET       ) ? 'L' : '_';
+	return_str[1] = (events & WL_SOCKET_READABLE ) ? 'R' : '_';
+	return_str[2] = (events & WL_SOCKET_WRITEABLE) ? 'W' : '_';
+	return_str[3] = (events & WL_TIMEOUT         ) ? 'T' : '_';
+	return_str[4] = (events & WL_POSTMASTER_DEATH) ? 'D' : '_';
+	return_str[5] = (events & WL_EXIT_ON_PM_DEATH) ? 'E' : '_';
+	return_str[6] = (events & WL_SOCKET_CONNECTED) ? 'C' : '_';
+
+	if (events & (~all_flags))
+	{
+		elog(WARNING, "Event formatting found unexpected component %u",
+				events & (~all_flags));
+		return_str[7] = '*';
+		return_str[8] = '\0';
+	}
+	else
+		return_str[7] = '\0';
+
+	return (char *) &return_str;
+}
+
+/*
+ * Convert a character which represents a hexadecimal digit to an integer.
+ *
+ * Returns -1 if the character is not a hexadecimal digit.
+ */
+static int
+HexDecodeChar(char c)
+{
+	if (c >= '0' && c <= '9')
+		return c - '0';
+	if (c >= 'a' && c <= 'f')
+		return c - 'a' + 10;
+	if (c >= 'A' && c <= 'F')
+		return c - 'A' + 10;
+
+	return -1;
+}
+
+/*
+ * Decode a hex string into a byte string, 2 hex chars per byte.
+ *
+ * Returns false if invalid characters are encountered; otherwise true.
+ */ +bool +HexDecodeString(uint8 *result, char *input, int nbytes) +{ + int i; + + for (i = 0; i < nbytes; ++i) + { + int n1 = HexDecodeChar(input[i * 2]); + int n2 = HexDecodeChar(input[i * 2 + 1]); + + if (n1 < 0 || n2 < 0) + return false; + result[i] = n1 * 16 + n2; + } + + return true; +} + +/* -------------------------------- + * pq_getmsgint32_le - get a binary 4-byte int from a message buffer in native (LE) order + * -------------------------------- + */ +uint32 +pq_getmsgint32_le(StringInfo msg) +{ + uint32 n32; + + pq_copymsgbytes(msg, (char *) &n32, sizeof(n32)); + + return n32; +} + +/* -------------------------------- + * pq_getmsgint64 - get a binary 8-byte int from a message buffer in native (LE) order + * -------------------------------- + */ +uint64 +pq_getmsgint64_le(StringInfo msg) +{ + uint64 n64; + + pq_copymsgbytes(msg, (char *) &n64, sizeof(n64)); + + return n64; +} + +/* append a binary [u]int32 to a StringInfo buffer in native (LE) order */ +void +pq_sendint32_le(StringInfo buf, uint32 i) +{ + enlargeStringInfo(buf, sizeof(uint32)); + memcpy(buf->data + buf->len, &i, sizeof(uint32)); + buf->len += sizeof(uint32); +} + +/* append a binary [u]int64 to a StringInfo buffer in native (LE) order */ +void +pq_sendint64_le(StringInfo buf, uint64 i) +{ + enlargeStringInfo(buf, sizeof(uint64)); + memcpy(buf->data + buf->len, &i, sizeof(uint64)); + buf->len += sizeof(uint64); +} + +/* + * Write XLOG data to disk. + */ +void +XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr) +{ + int startoff; + int byteswritten; + + while (nbytes > 0) + { + int segbytes; + + /* Close the current segment if it's completed */ + if (walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size)) + XLogWalPropClose(recptr); + + if (walpropFile < 0) + { + // FIXME Is it ok to use hardcoded value here? 
+ TimeLineID tli = 1; + /* Create/use new log file */ + XLByteToSeg(recptr, walpropSegNo, wal_segment_size); + // FIXME Is it ok to call XLogFileInit here? + // In neon we want to open existing file. + walpropFile = XLogFileInit(walpropSegNo, tli); + walpropFileTLI = tli; + } + + /* Calculate the start offset of the received logs */ + startoff = XLogSegmentOffset(recptr, wal_segment_size); + + if (startoff + nbytes > wal_segment_size) + segbytes = wal_segment_size - startoff; + else + segbytes = nbytes; + + /* OK to write the logs */ + errno = 0; + + byteswritten = pg_pwrite(walpropFile, buf, segbytes, (off_t) startoff); + if (byteswritten <= 0) + { + char xlogfname[MAXFNAMELEN]; + int save_errno; + + /* if write didn't set errno, assume no disk space */ + if (errno == 0) + errno = ENOSPC; + + save_errno = errno; + XLogFileName(xlogfname, walpropFileTLI, walpropSegNo, wal_segment_size); + errno = save_errno; + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not write to log segment %s " + "at offset %u, length %lu: %m", + xlogfname, startoff, (unsigned long) segbytes))); + } + + /* Update state for write */ + recptr += byteswritten; + + nbytes -= byteswritten; + buf += byteswritten; + } + + /* + * Close the current segment if it's fully written up in the last cycle of + * the loop. + */ + if (walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size)) + { + XLogWalPropClose(recptr); + } +} + +/* + * Close the current segment. 
+ */ +void +XLogWalPropClose(XLogRecPtr recptr) +{ + Assert(walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size)); + + if (close(walpropFile) != 0) + { + char xlogfname[MAXFNAMELEN]; + XLogFileName(xlogfname, walpropFileTLI, walpropSegNo, wal_segment_size); + + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not close log segment %s: %m", + xlogfname))); + } + + walpropFile = -1; +} diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c index 9452932d590..5c53a3c1086 100644 --- a/src/backend/replication/walreceiver.c +++ b/src/backend/replication/walreceiver.c @@ -497,6 +497,13 @@ WalReceiverMain(void) if (endofwal) break; + /* + * Update WAL statistics, which are produced inside + * issue_xlog_fsync function. This is useful for counting + * WAL flushes, by querying pg_stat_wal. + */ + pgstat_report_wal(true); + /* * Ideally we would reuse a WaitEventSet object repeatedly * here to avoid the overheads of WaitLatchOrSocket on epoll diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 269914bce28..dd115365a13 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -54,6 +54,7 @@ #include "access/transam.h" #include "access/xact.h" #include "access/xlog_internal.h" +#include "access/xloginsert.h" #include "access/xlogreader.h" #include "access/xlogrecovery.h" #include "access/xlogutils.h" @@ -74,6 +75,7 @@ #include "replication/slot.h" #include "replication/snapbuild.h" #include "replication/syncrep.h" +#include "replication/walproposer.h" #include "replication/walreceiver.h" #include "replication/walsender.h" #include "replication/walsender_private.h" @@ -236,10 +238,11 @@ static void IdentifySystem(void); static void ReadReplicationSlot(ReadReplicationSlotCmd *cmd); static void CreateReplicationSlot(CreateReplicationSlotCmd *cmd); static void DropReplicationSlot(DropReplicationSlotCmd *cmd); -static void 
StartReplication(StartReplicationCmd *cmd); +void StartReplication(StartReplicationCmd *cmd); static void StartLogicalReplication(StartReplicationCmd *cmd); static void ProcessStandbyMessage(void); static void ProcessStandbyReplyMessage(void); +static void ProcessReplicationFeedbackMessage(void); static void ProcessStandbyHSFeedbackMessage(void); static void ProcessRepliesIfAny(void); static void ProcessPendingWrites(void); @@ -681,7 +684,7 @@ SendTimeLineHistory(TimeLineHistoryCmd *cmd) * At the moment, this never returns, but an ereport(ERROR) will take us back * to the main loop. */ -static void +void StartReplication(StartReplicationCmd *cmd) { StringInfoData buf; @@ -816,11 +819,14 @@ StartReplication(StartReplicationCmd *cmd) WalSndSetState(WALSNDSTATE_CATCHUP); /* Send a CopyBothResponse message, and start streaming */ - pq_beginmessage(&buf, 'W'); - pq_sendbyte(&buf, 0); - pq_sendint16(&buf, 0); - pq_endmessage(&buf); - pq_flush(); + if (!am_wal_proposer) + { + pq_beginmessage(&buf, 'W'); + pq_sendbyte(&buf, 0); + pq_sendint16(&buf, 0); + pq_endmessage(&buf); + pq_flush(); + } /* * Don't allow a request to stream from a future point in WAL that @@ -1460,7 +1466,7 @@ ProcessPendingWrites(void) } /* Try to flush pending output to the client */ - if (pq_flush_if_writable() != 0) + if (!am_wal_proposer && pq_flush_if_writable() != 0) WalSndShutdown(); } @@ -1903,6 +1909,9 @@ ProcessRepliesIfAny(void) int r; bool received = false; + if (am_wal_proposer) + return; + last_processing = GetCurrentTimestamp(); /* @@ -2028,6 +2037,10 @@ ProcessStandbyMessage(void) ProcessStandbyHSFeedbackMessage(); break; + case 'z': + ProcessReplicationFeedbackMessage(); + break; + default: ereport(COMMERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), @@ -2039,7 +2052,7 @@ ProcessStandbyMessage(void) /* * Remember that a walreceiver just confirmed receipt of lsn `lsn`. 
*/ -static void +void PhysicalConfirmReceivedLocation(XLogRecPtr lsn) { bool changed = false; @@ -2078,21 +2091,63 @@ ProcessStandbyReplyMessage(void) flushPtr, applyPtr; bool replyRequested; - TimeOffset writeLag, - flushLag, - applyLag; - bool clearLagTimes; - TimestampTz now; TimestampTz replyTime; - static bool fullyAppliedLastTime = false; - /* the caller already consumed the msgtype byte */ writePtr = pq_getmsgint64(&reply_message); flushPtr = pq_getmsgint64(&reply_message); applyPtr = pq_getmsgint64(&reply_message); replyTime = pq_getmsgint64(&reply_message); replyRequested = pq_getmsgbyte(&reply_message); + ProcessStandbyReply(writePtr, + flushPtr, + applyPtr, + replyTime, + replyRequested); + + elog(LOG, "ProcessStandbyReplyMessage: writelsn %X/%X", + LSN_FORMAT_ARGS(writePtr)); + elog(LOG, "ProcessStandbyReplyMessage: flushlsn %X/%X", + LSN_FORMAT_ARGS(flushPtr)); + elog(LOG, "ProcessStandbyReplyMessage: applylsn %X/%X", + LSN_FORMAT_ARGS(applyPtr)); +} + +// This message is a neon extension of postgres replication protocol +static void +ProcessReplicationFeedbackMessage(void) +{ + ReplicationFeedback rf; + + // consume message length + pq_getmsgint64(&reply_message); + + ParseReplicationFeedbackMessage(&reply_message, &rf); + + replication_feedback_set(&rf); + + SetZenithCurrentClusterSize(rf.currentClusterSize); + + ProcessStandbyReply(rf.ps_writelsn, + rf.ps_flushlsn, + rf.ps_applylsn, + rf.ps_replytime, + false); +} + +void +ProcessStandbyReply(XLogRecPtr writePtr, + XLogRecPtr flushPtr, + XLogRecPtr applyPtr, + TimestampTz replyTime, + bool replyRequested) +{ + TimeOffset writeLag, + flushLag, + applyLag; + bool clearLagTimes; + TimestampTz now; + static bool fullyAppliedLastTime = false; if (message_level_is_interesting(DEBUG2)) { @@ -2163,6 +2218,13 @@ ProcessStandbyReplyMessage(void) if (!am_cascading_walsender) SyncRepReleaseWaiters(); + /* + * walproposer use trunclateLsn instead of flushPtr for confirmed + * received location, so we shouldn't 
update restart_lsn here. + */ + if (am_wal_proposer) + return; + /* * Advance our local xmin horizon when the client confirmed a flush. */ @@ -2275,7 +2337,16 @@ ProcessStandbyHSFeedbackMessage(void) feedbackEpoch = pq_getmsgint(&reply_message, 4); feedbackCatalogXmin = pq_getmsgint(&reply_message, 4); feedbackCatalogEpoch = pq_getmsgint(&reply_message, 4); + ProcessStandbyHSFeedback(replyTime, feedbackXmin, feedbackEpoch, feedbackCatalogXmin, feedbackCatalogEpoch); +} +void +ProcessStandbyHSFeedback(TimestampTz replyTime, + TransactionId feedbackXmin, + uint32 feedbackEpoch, + TransactionId feedbackCatalogXmin, + uint32 feedbackCatalogEpoch) +{ if (message_level_is_interesting(DEBUG2)) { char *replyTimeStr; @@ -2483,6 +2554,19 @@ WalSndLoop(WalSndSendDataCallback send_data) /* Check for input from the client */ ProcessRepliesIfAny(); + if (am_wal_proposer) + { + send_data(); + if (WalSndCaughtUp) + { + if (MyWalSnd->state == WALSNDSTATE_CATCHUP) + WalSndSetState(WALSNDSTATE_STREAMING); + WalProposerPoll(); + WalSndCaughtUp = false; + } + continue; + } + /* * If we have received CopyDone from the client, sent CopyDone * ourselves, and the output buffer is empty, it's time to exit @@ -2846,7 +2930,7 @@ XLogSendPhysical(void) * * In theory we could make XLogFlush() record a time in shmem whenever WAL * is flushed and we could get that time as well as the LSN when we call - * GetFlushRecPtr() above (and likewise for the cascading standby + * GetFlushRecPtr(NULL) above (and likewise for the cascading standby * equivalent), but rather than putting any new code into the hot WAL path * it seems good enough to capture the time here. We should reach this * after XLogFlush() runs WalSndWakeupProcessRequests(), and although that @@ -2940,74 +3024,83 @@ XLogSendPhysical(void) nbytes = endptr - startptr; Assert(nbytes <= MAX_SEND_SIZE); - /* - * OK to read and send the slice. 
- */ - resetStringInfo(&output_message); - pq_sendbyte(&output_message, 'w'); - - pq_sendint64(&output_message, startptr); /* dataStart */ - pq_sendint64(&output_message, SendRqstPtr); /* walEnd */ - pq_sendint64(&output_message, 0); /* sendtime, filled in last */ - - /* - * Read the log directly into the output buffer to avoid extra memcpy - * calls. - */ - enlargeStringInfo(&output_message, nbytes); + if (am_wal_proposer) + { + WalProposerBroadcast(startptr, endptr); + } + else + { + /* + * OK to read and send the slice. + */ + if (output_message.data) + resetStringInfo(&output_message); + else + initStringInfo(&output_message); -retry: - if (!WALRead(xlogreader, - &output_message.data[output_message.len], - startptr, - nbytes, - xlogreader->seg.ws_tli, /* Pass the current TLI because - * only WalSndSegmentOpen controls - * whether new TLI is needed. */ - &errinfo)) - WALReadRaiseError(&errinfo); + pq_sendbyte(&output_message, 'w'); + pq_sendint64(&output_message, startptr); /* dataStart */ + pq_sendint64(&output_message, SendRqstPtr); /* walEnd */ + pq_sendint64(&output_message, 0); /* sendtime, filled in last */ - /* See logical_read_xlog_page(). */ - XLByteToSeg(startptr, segno, xlogreader->segcxt.ws_segsize); - CheckXLogRemoved(segno, xlogreader->seg.ws_tli); + /* + * Read the log directly into the output buffer to avoid extra memcpy + * calls. + */ + enlargeStringInfo(&output_message, nbytes); + + retry: + if (!WALRead(xlogreader, + &output_message.data[output_message.len], + startptr, + nbytes, + xlogreader->seg.ws_tli, /* Pass the current TLI because + * only WalSndSegmentOpen controls + * whether new TLI is needed. */ + &errinfo)) + WALReadRaiseError(&errinfo); + + /* See logical_read_xlog_page(). */ + XLByteToSeg(startptr, segno, xlogreader->segcxt.ws_segsize); + CheckXLogRemoved(segno, xlogreader->seg.ws_tli); - /* - * During recovery, the currently-open WAL file might be replaced with the - * file of the same name retrieved from archive. 
So we always need to - * check what we read was valid after reading into the buffer. If it's - * invalid, we try to open and read the file again. - */ - if (am_cascading_walsender) - { - WalSnd *walsnd = MyWalSnd; - bool reload; + /* + * During recovery, the currently-open WAL file might be replaced with the + * file of the same name retrieved from archive. So we always need to + * check what we read was valid after reading into the buffer. If it's + * invalid, we try to open and read the file again. + */ + if (am_cascading_walsender) + { + WalSnd *walsnd = MyWalSnd; + bool reload; - SpinLockAcquire(&walsnd->mutex); - reload = walsnd->needreload; - walsnd->needreload = false; - SpinLockRelease(&walsnd->mutex); + SpinLockAcquire(&walsnd->mutex); + reload = walsnd->needreload; + walsnd->needreload = false; + SpinLockRelease(&walsnd->mutex); - if (reload && xlogreader->seg.ws_file >= 0) - { - wal_segment_close(xlogreader); + if (reload && xlogreader->seg.ws_file >= 0) + { + wal_segment_close(xlogreader); - goto retry; + goto retry; + } } - } - output_message.len += nbytes; - output_message.data[output_message.len] = '\0'; + output_message.len += nbytes; + output_message.data[output_message.len] = '\0'; - /* - * Fill the send timestamp last, so that it is taken as late as possible. - */ - resetStringInfo(&tmpbuf); - pq_sendint64(&tmpbuf, GetCurrentTimestamp()); - memcpy(&output_message.data[1 + sizeof(int64) + sizeof(int64)], - tmpbuf.data, sizeof(int64)); - - pq_putmessage_noblock('d', output_message.data, output_message.len); + /* + * Fill the send timestamp last, so that it is taken as late as possible. 
+ */ + resetStringInfo(&tmpbuf); + pq_sendint64(&tmpbuf, GetCurrentTimestamp()); + memcpy(&output_message.data[1 + sizeof(int64) + sizeof(int64)], + tmpbuf.data, sizeof(int64)); + pq_putmessage_noblock('d', output_message.data, output_message.len); + } sentPtr = endptr; /* Update shared memory status */ @@ -3042,7 +3135,7 @@ XLogSendLogical(void) /* * We'll use the current flush point to determine whether we've caught up. * This variable is static in order to cache it across calls. Caching is - * helpful because GetFlushRecPtr() needs to acquire a heavily-contended + * helpful because GetFlushRecPtr(NULL) needs to acquire a heavily-contended * spinlock. */ static XLogRecPtr flushPtr = InvalidXLogRecPtr; @@ -3127,8 +3220,8 @@ WalSndDone(WalSndSendDataCallback send_data) * flush location if valid, write otherwise. Tools like pg_receivewal will * usually (unless in synchronous mode) return an invalid flush location. */ - replicatedPtr = XLogRecPtrIsInvalid(MyWalSnd->flush) ? - MyWalSnd->write : MyWalSnd->flush; + // XXX Zenith uses flush_lsn to pass extra payload, so use write_lsn here + replicatedPtr = MyWalSnd->write; if (WalSndCaughtUp && sentPtr == replicatedPtr && !pq_is_send_pending()) @@ -3862,3 +3955,79 @@ LagTrackerRead(int head, XLogRecPtr lsn, TimestampTz now) Assert(time != 0); return now - time; } + +/* + * Get minimal write and flush LSN among all live replicas + */ +void +GetMinReplicaLsn(XLogRecPtr* write_lsn, XLogRecPtr* flush_lsn, XLogRecPtr* apply_lsn) +{ + XLogRecPtr min_write_lsn = UnknownXLogRecPtr; + XLogRecPtr min_flush_lsn = UnknownXLogRecPtr; + XLogRecPtr min_apply_lsn = UnknownXLogRecPtr; + for (int i = 0; i < max_wal_senders; i++) + { + WalSnd *walsnd = &WalSndCtl->walsnds[i]; + if (walsnd->state == WALSNDSTATE_STREAMING) + { + /* + * We assume that reads from walsnd->write/flush are atomic + * on all modern x64 systems, as these fields are uint64 and + * should be 8-bytes aligned. 
+ */ + XLogRecPtr written = walsnd->write; + XLogRecPtr flushed = walsnd->flush; + XLogRecPtr applied = walsnd->apply; + min_write_lsn = Min(written, min_write_lsn); + min_flush_lsn = Min(flushed, min_flush_lsn); + min_apply_lsn = Min(applied, min_apply_lsn); + } + } + *write_lsn = min_write_lsn; + *flush_lsn = min_flush_lsn; + *apply_lsn = min_apply_lsn; +} + +// Check if we need to suspend inserts because of lagging replication. +uint64 +backpressure_lag(void) +{ + if (max_replication_apply_lag > 0 || max_replication_flush_lag > 0 || max_replication_write_lag > 0) + { + XLogRecPtr writePtr; + XLogRecPtr flushPtr; + XLogRecPtr applyPtr; + XLogRecPtr myFlushLsn = GetFlushRecPtr(NULL); + + replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr); + #define MB ((XLogRecPtr)1024*1024) + + elog(DEBUG2, "current flushLsn %X/%X ReplicationFeedback: write %X/%X flush %X/%X apply %X/%X", + LSN_FORMAT_ARGS(myFlushLsn), + LSN_FORMAT_ARGS(writePtr), + LSN_FORMAT_ARGS(flushPtr), + LSN_FORMAT_ARGS(applyPtr)); + + if ((writePtr != UnknownXLogRecPtr + && max_replication_write_lag > 0 + && myFlushLsn > writePtr + max_replication_write_lag*MB)) + { + return (myFlushLsn - writePtr - max_replication_write_lag*MB); + } + + if ((flushPtr != UnknownXLogRecPtr + && max_replication_flush_lag > 0 + && myFlushLsn > flushPtr + max_replication_flush_lag*MB)) + { + return (myFlushLsn - flushPtr - max_replication_flush_lag*MB); + } + + if ((applyPtr != UnknownXLogRecPtr + && max_replication_apply_lag > 0 + && myFlushLsn > applyPtr + max_replication_apply_lag*MB)) + { + return (myFlushLsn - applyPtr - max_replication_apply_lag*MB); + } + } + return 0; +} diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 9fcb3d6e194..3f5ba5942e2 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -56,7 +56,7 @@ #include "utils/rel.h" #include "utils/resowner_private.h" #include "utils/timestamp.h" - +#include 
"replication/walsender.h" /* Note: these two macros only work on shared buffers, not local ones! */ #define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ)) @@ -159,6 +159,9 @@ int checkpoint_flush_after = 0; int bgwriter_flush_after = 0; int backend_flush_after = 0; +/* Evict unpinned pages (for better test coverage) */ +bool zenith_test_evict = false; + /* local state for StartBufferIO and related functions */ static BufferDesc *InProgressBuf = NULL; static bool IsForInput; @@ -802,14 +805,13 @@ ReadBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum, { bool hit; - SMgrRelation smgr = smgropen(rnode, InvalidBackendId); + SMgrRelation smgr = smgropen(rnode, InvalidBackendId, RELPERSISTENCE_PERMANENT); return ReadBuffer_common(smgr, permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED, forkNum, blockNum, mode, strategy, &hit); } - /* * ReadBuffer_common -- common logic for all ReadBuffer variants * @@ -824,7 +826,11 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, Block bufBlock; bool found; bool isExtend; - bool isLocalBuf = SmgrIsTemp(smgr); + /* + * wal_redo postgres is working in single user mode, we do not need to synchronize access to shared buffer, + * so let's use local buffers instead + */ + bool isLocalBuf = SmgrIsTemp(smgr) || am_wal_redo_postgres; *hit = false; @@ -934,11 +940,14 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, */ bufBlock = isLocalBuf ? 
LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr); if (!PageIsNew((Page) bufBlock)) - ereport(ERROR, + { + // XXX-ZENITH + MemSet((char *) bufBlock, 0, BLCKSZ); + ereport(DEBUG1, (errmsg("unexpected data beyond EOF in block %u of relation %s", blockNum, relpath(smgr->smgr_rnode, forkNum)), errhint("This has been seen to occur with buggy kernels; consider updating your system."))); - + } /* * We *must* do smgrextend before succeeding, else the page will not * be reserved by the kernel, and the next P_NEW call will decide to @@ -1927,6 +1936,32 @@ UnpinBuffer(BufferDesc *buf, bool fixOwner) UnlockBufHdr(buf, buf_state); } ForgetPrivateRefCountEntry(ref); + + if (zenith_test_evict && !InRecovery) + { + buf_state = LockBufHdr(buf); + if (BUF_STATE_GET_REFCOUNT(buf_state) == 0) + { + if (buf_state & BM_DIRTY) + { + ReservePrivateRefCountEntry(); + PinBuffer_Locked(buf); + if (LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf), + LW_SHARED)) + { + FlushOneBuffer(b); + LWLockRelease(BufferDescriptorGetContentLock(buf)); + } + UnpinBuffer(buf, true); + } + else + { + InvalidateBuffer(buf); + } + } + else + UnlockBufHdr(buf, buf_state); + } } } @@ -2848,7 +2883,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln) /* Find smgr relation for buffer */ if (reln == NULL) - reln = smgropen(buf->tag.rnode, InvalidBackendId); + reln = smgropen(buf->tag.rnode, InvalidBackendId, 0); TRACE_POSTGRESQL_BUFFER_FLUSH_START(buf->tag.forkNum, buf->tag.blockNum, @@ -4992,7 +5027,7 @@ IssuePendingWritebacks(WritebackContext *context) i += ahead; /* and finally tell the kernel to write the data to storage */ - reln = smgropen(tag.rnode, InvalidBackendId); + reln = smgropen(tag.rnode, InvalidBackendId, 0); smgrwriteback(reln, tag.forkNum, tag.blockNum, nblocks); } diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index e71f95ac1ff..b99ed777b68 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ 
-18,12 +18,15 @@ #include "access/parallel.h" #include "catalog/catalog.h" #include "executor/instrument.h" +#include "miscadmin.h" #include "storage/buf_internals.h" #include "storage/bufmgr.h" #include "utils/guc.h" #include "utils/memutils.h" #include "utils/resowner_private.h" +/* ZENITH: prevent eviction of the buffer of target page */ +extern Buffer wal_redo_buffer; /*#define LBDEBUG*/ @@ -182,6 +185,12 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, if (LocalRefCount[b] == 0) { + if (-b - 1 == wal_redo_buffer) + { + /* ZENITH: Prevent eviction of the buffer with target wal redo page */ + continue; + } + buf_state = pg_atomic_read_u32(&bufHdr->state); if (BUF_STATE_GET_USAGECOUNT(buf_state) > 0) @@ -215,7 +224,10 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, Page localpage = (char *) LocalBufHdrGetBlock(bufHdr); /* Find smgr relation for buffer */ - oreln = smgropen(bufHdr->tag.rnode, MyBackendId); + if (am_wal_redo_postgres && MyBackendId == InvalidBackendId) + oreln = smgropen(bufHdr->tag.rnode, MyBackendId, RELPERSISTENCE_PERMANENT); + else + oreln = smgropen(bufHdr->tag.rnode, MyBackendId, RELPERSISTENCE_TEMP); PageSetChecksumInplace(localpage, bufHdr->tag.blockNum); diff --git a/src/backend/storage/freespace/freespace.c b/src/backend/storage/freespace/freespace.c index d41ae37090a..a3c6a55f5d7 100644 --- a/src/backend/storage/freespace/freespace.c +++ b/src/backend/storage/freespace/freespace.c @@ -649,10 +649,18 @@ fsm_extend(Relation rel, BlockNumber fsm_nblocks) /* Extend as needed. */ while (fsm_nblocks_now < fsm_nblocks) { - PageSetChecksumInplace((Page) pg.data, fsm_nblocks_now); + /* + * ZENITH: Initialize FSM pages through buffer cache to prevent loading + * them from pageserver. 
+ */ + Buffer buffer = ReadBufferExtended(rel, FSM_FORKNUM, P_NEW, RBM_ZERO_AND_LOCK, NULL); + Page page = BufferGetPage(buffer); + + PageInit((Page) page, BLCKSZ, 0); + PageSetChecksumInplace(page, fsm_nblocks_now); + MarkBufferDirty(buffer); + UnlockReleaseBuffer(buffer); - smgrextend(reln, FSM_FORKNUM, fsm_nblocks_now, - pg.data, false); fsm_nblocks_now++; } diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index 1a6f5270518..54bb3cdbcae 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -36,6 +36,7 @@ #include "replication/slot.h" #include "replication/walreceiver.h" #include "replication/walsender.h" +#include "replication/walproposer.h" #include "storage/bufmgr.h" #include "storage/dsm.h" #include "storage/ipc.h" @@ -141,6 +142,7 @@ CalculateShmemSize(int *num_semaphores) size = add_size(size, SyncScanShmemSize()); size = add_size(size, AsyncShmemSize()); size = add_size(size, StatsShmemSize()); + size = add_size(size, WalproposerShmemSize()); #ifdef EXEC_BACKEND size = add_size(size, ShmemBackendArraySize()); #endif @@ -294,6 +296,8 @@ CreateSharedMemoryAndSemaphores(void) AsyncShmemInit(); StatsShmemInit(); + WalproposerShmemInit(); + #ifdef EXEC_BACKEND /* diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c index a3d367db511..da63605e9e1 100644 --- a/src/backend/storage/page/bufpage.c +++ b/src/backend/storage/page/bufpage.c @@ -427,7 +427,6 @@ PageRestoreTempPage(Page tempPage, Page oldPage) pageSize = PageGetPageSize(tempPage); memcpy((char *) oldPage, (char *) tempPage, pageSize); - pfree(tempPage); } diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index f14c48da6cf..002eb6fbcd8 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -1096,7 +1096,7 @@ DropRelationFiles(RelFileNode *delrels, int ndelrels, bool isRedo) srels = palloc(sizeof(SMgrRelation) * ndelrels); for (i = 0; i < ndelrels; i++) { - 
SMgrRelation srel = smgropen(delrels[i], InvalidBackendId); + SMgrRelation srel = smgropen(delrels[i], InvalidBackendId, 0); if (isRedo) { @@ -1379,7 +1379,7 @@ _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) int mdsyncfiletag(const FileTag *ftag, char *path) { - SMgrRelation reln = smgropen(ftag->rnode, InvalidBackendId); + SMgrRelation reln = smgropen(ftag->rnode, InvalidBackendId, 0); File file; bool need_to_close; int result, diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index a477f70f0e3..87260673bc8 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -18,6 +18,7 @@ #include "postgres.h" #include "access/xlogutils.h" +#include "catalog/pg_tablespace.h" #include "lib/ilist.h" #include "storage/bufmgr.h" #include "storage/ipc.h" @@ -26,47 +27,8 @@ #include "utils/hsearch.h" #include "utils/inval.h" - -/* - * This struct of function pointers defines the API between smgr.c and - * any individual storage manager module. Note that smgr subfunctions are - * generally expected to report problems via elog(ERROR). An exception is - * that smgr_unlink should use elog(WARNING), rather than erroring out, - * because we normally unlink relations during post-commit/abort cleanup, - * and so it's too late to raise an error. Also, various conditions that - * would normally be errors should be allowed during bootstrap and/or WAL - * recovery --- see comments in md.c for details. 
- */ -typedef struct f_smgr -{ - void (*smgr_init) (void); /* may be NULL */ - void (*smgr_shutdown) (void); /* may be NULL */ - void (*smgr_open) (SMgrRelation reln); - void (*smgr_close) (SMgrRelation reln, ForkNumber forknum); - void (*smgr_create) (SMgrRelation reln, ForkNumber forknum, - bool isRedo); - bool (*smgr_exists) (SMgrRelation reln, ForkNumber forknum); - void (*smgr_unlink) (RelFileNodeBackend rnode, ForkNumber forknum, - bool isRedo); - void (*smgr_extend) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer, bool skipFsync); - bool (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum); - void (*smgr_read) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer); - void (*smgr_write) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer, bool skipFsync); - void (*smgr_writeback) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, BlockNumber nblocks); - BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum); - void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum, - BlockNumber nblocks); - void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum); -} f_smgr; - -static const f_smgr smgrsw[] = { +static const f_smgr smgr_md = { /* magnetic disk */ - { .smgr_init = mdinit, .smgr_shutdown = NULL, .smgr_open = mdopen, @@ -82,11 +44,8 @@ static const f_smgr smgrsw[] = { .smgr_nblocks = mdnblocks, .smgr_truncate = mdtruncate, .smgr_immedsync = mdimmedsync, - } }; -static const int NSmgr = lengthof(smgrsw); - /* * Each backend has a hashtable that stores all extant SMgrRelation objects. * In addition, "unowned" SMgrRelation objects are chained together in a list. 
@@ -96,7 +55,7 @@ static HTAB *SMgrRelationHash = NULL; static dlist_head unowned_relns; /* local function prototypes */ -static void smgrshutdown(int code, Datum arg); +//static void smgrshutdown(int code, Datum arg); /* @@ -110,40 +69,80 @@ static void smgrshutdown(int code, Datum arg); void smgrinit(void) { - int i; + (*smgr_init_hook)(); - for (i = 0; i < NSmgr; i++) - { - if (smgrsw[i].smgr_init) - smgrsw[i].smgr_init(); - } + /* + * ZENITH XXX + * This doesn't work with inmem_smgr, so temporarily disable. + * Anyway, we don't have any real smgrshutdown function. + */ + // /* register the shutdown proc */ + // on_proc_exit(smgrshutdown, 0); +} - /* register the shutdown proc */ - on_proc_exit(smgrshutdown, 0); +//ZENITH XXX See comment above. Silence compiler warning. +// /* +// * on_proc_exit hook for smgr cleanup during backend shutdown +// */ +// static void +// smgrshutdown(int code, Datum arg) +// { +// if (smgr_shutdown_hook) +// (*smgr_shutdown_hook)(); + +// smgr_shutdown_standard(); +// } + +/* Hook for plugins to get control in smgr */ +smgr_hook_type smgr_hook = NULL; +smgr_init_hook_type smgr_init_hook = smgr_init_standard; +smgr_shutdown_hook_type smgr_shutdown_hook = NULL; + +const f_smgr * +smgr_standard(BackendId backend, RelFileNode rnode) +{ + return &smgr_md; } -/* - * on_proc_exit hook for smgr cleanup during backend shutdown - */ -static void -smgrshutdown(int code, Datum arg) +void +smgr_init_standard(void) { - int i; + mdinit(); +} - for (i = 0; i < NSmgr; i++) +void +smgr_shutdown_standard(void) +{ +} + +const f_smgr * +smgr(BackendId backend, RelFileNode rnode) +{ + const f_smgr *result; + + if (smgr_hook) { - if (smgrsw[i].smgr_shutdown) - smgrsw[i].smgr_shutdown(); + result = (*smgr_hook)(backend, rnode); } + else + result = smgr_standard(backend, rnode); + + return result; } + /* * smgropen() -- Return an SMgrRelation object, creating it if need be. * * This does not attempt to actually open the underlying file. 
+ * + * The caller should pass the value of pg_class.relpersistence, if they know + * it, or 0 if unknown. Some operations, like smgrwrite() and smgrunlink() + * are allowed when relpersistence is not known, but others like smgrread() + * require it. */ SMgrRelation -smgropen(RelFileNode rnode, BackendId backend) +smgropen(RelFileNode rnode, BackendId backend, char relpersistence) { RelFileNodeBackend brnode; SMgrRelation reln; @@ -174,16 +173,33 @@ smgropen(RelFileNode rnode, BackendId backend) /* hash_search already filled in the lookup key */ reln->smgr_owner = NULL; reln->smgr_targblock = InvalidBlockNumber; + reln->smgr_relpersistence = relpersistence; for (int i = 0; i <= MAX_FORKNUM; ++i) reln->smgr_cached_nblocks[i] = InvalidBlockNumber; - reln->smgr_which = 0; /* we only have md.c at present */ + + reln->smgr = smgr(backend, rnode); /* implementation-specific initialization */ - smgrsw[reln->smgr_which].smgr_open(reln); + (*reln->smgr).smgr_open(reln); /* it has no owner yet */ dlist_push_tail(&unowned_relns, &reln->node); } + else + { + /* + * If the caller passed a valid 'relpersistence', and it was unknown + * before, update it. 
+ */ + if (reln->smgr_relpersistence == 0) + reln->smgr_relpersistence = relpersistence; + else + { + if (!(relpersistence == 0 || reln->smgr_relpersistence == relpersistence)) + elog(ERROR, "relpersistence mismatch: smgropen %c vs SmgrRelation %c", + relpersistence, reln->smgr_relpersistence); + } + } return reln; } @@ -246,7 +262,7 @@ smgrclearowner(SMgrRelation *owner, SMgrRelation reln) bool smgrexists(SMgrRelation reln, ForkNumber forknum) { - return smgrsw[reln->smgr_which].smgr_exists(reln, forknum); + return (*reln->smgr).smgr_exists(reln, forknum); } /* @@ -259,7 +275,7 @@ smgrclose(SMgrRelation reln) ForkNumber forknum; for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) - smgrsw[reln->smgr_which].smgr_close(reln, forknum); + (*reln->smgr).smgr_close(reln, forknum); owner = reln->smgr_owner; @@ -289,7 +305,7 @@ smgrrelease(SMgrRelation reln) { for (ForkNumber forknum = 0; forknum <= MAX_FORKNUM; forknum++) { - smgrsw[reln->smgr_which].smgr_close(reln, forknum); + (*reln->smgr).smgr_close(reln, forknum); reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber; } } @@ -368,7 +384,7 @@ smgrclosenode(RelFileNodeBackend rnode) void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo) { - smgrsw[reln->smgr_which].smgr_create(reln, forknum, isRedo); + (*reln->smgr).smgr_create(reln, forknum, isRedo); } /* @@ -396,12 +412,10 @@ smgrdosyncall(SMgrRelation *rels, int nrels) */ for (i = 0; i < nrels; i++) { - int which = rels[i]->smgr_which; - for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) { - if (smgrsw[which].smgr_exists(rels[i], forknum)) - smgrsw[which].smgr_immedsync(rels[i], forknum); + if ((*rels[i]->smgr).smgr_exists(rels[i], forknum)) + (*rels[i]->smgr).smgr_immedsync(rels[i], forknum); } } } @@ -440,13 +454,12 @@ smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo) for (i = 0; i < nrels; i++) { RelFileNodeBackend rnode = rels[i]->smgr_rnode; - int which = rels[i]->smgr_which; rnodes[i] = rnode; /* Close the forks at smgr level */ 
for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) - smgrsw[which].smgr_close(rels[i], forknum); + (*rels[i]->smgr).smgr_close(rels[i], forknum); } /* @@ -470,10 +483,8 @@ smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo) for (i = 0; i < nrels; i++) { - int which = rels[i]->smgr_which; - for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) - smgrsw[which].smgr_unlink(rnodes[i], forknum, isRedo); + (*rels[i]->smgr).smgr_unlink(rnodes[i], forknum, isRedo); } pfree(rnodes); @@ -493,7 +504,7 @@ void smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync) { - smgrsw[reln->smgr_which].smgr_extend(reln, forknum, blocknum, + (*reln->smgr).smgr_extend(reln, forknum, blocknum, buffer, skipFsync); /* @@ -517,7 +528,7 @@ smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, bool smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) { - return smgrsw[reln->smgr_which].smgr_prefetch(reln, forknum, blocknum); + return (*reln->smgr).smgr_prefetch(reln, forknum, blocknum); } /* @@ -532,7 +543,7 @@ void smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer) { - smgrsw[reln->smgr_which].smgr_read(reln, forknum, blocknum, buffer); + (*reln->smgr).smgr_read(reln, forknum, blocknum, buffer); } /* @@ -554,7 +565,7 @@ void smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync) { - smgrsw[reln->smgr_which].smgr_write(reln, forknum, blocknum, + (*reln->smgr).smgr_write(reln, forknum, blocknum, buffer, skipFsync); } @@ -567,7 +578,7 @@ void smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks) { - smgrsw[reln->smgr_which].smgr_writeback(reln, forknum, blocknum, + (*reln->smgr).smgr_writeback(reln, forknum, blocknum, nblocks); } @@ -585,7 +596,7 @@ smgrnblocks(SMgrRelation reln, ForkNumber forknum) if (result != InvalidBlockNumber) return result; - result = 
smgrsw[reln->smgr_which].smgr_nblocks(reln, forknum); + result = (*reln->smgr).smgr_nblocks(reln, forknum); reln->smgr_cached_nblocks[forknum] = result; @@ -651,7 +662,7 @@ smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nb /* Make the cached size is invalid if we encounter an error. */ reln->smgr_cached_nblocks[forknum[i]] = InvalidBlockNumber; - smgrsw[reln->smgr_which].smgr_truncate(reln, forknum[i], nblocks[i]); + (*reln->smgr).smgr_truncate(reln, forknum[i], nblocks[i]); /* * We might as well update the local smgr_cached_nblocks values. The @@ -690,7 +701,31 @@ smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nb void smgrimmedsync(SMgrRelation reln, ForkNumber forknum) { - smgrsw[reln->smgr_which].smgr_immedsync(reln, forknum); + (*reln->smgr).smgr_immedsync(reln, forknum); +} + +/* + * Zenith-added functions to mark the phases of an unlogged index build. + */ +void +smgr_start_unlogged_build(SMgrRelation reln) +{ + if ((*reln->smgr).smgr_start_unlogged_build) + (*reln->smgr).smgr_start_unlogged_build(reln); +} + +void +smgr_finish_unlogged_build_phase_1(SMgrRelation reln) +{ + if ((*reln->smgr).smgr_finish_unlogged_build_phase_1) + (*reln->smgr).smgr_finish_unlogged_build_phase_1(reln); +} + +void +smgr_end_unlogged_build(SMgrRelation reln) +{ + if ((*reln->smgr).smgr_end_unlogged_build) + (*reln->smgr).smgr_end_unlogged_build(reln); } /* diff --git a/src/backend/tcop/Makefile b/src/backend/tcop/Makefile index f662a7dd1cf..84f027436a4 100644 --- a/src/backend/tcop/Makefile +++ b/src/backend/tcop/Makefile @@ -20,4 +20,6 @@ OBJS = \ pquery.o \ utility.o +OBJS += zenith_wal_redo.o + include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 66294ab4c8b..20e4d4b0a85 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -3152,8 +3152,8 @@ RecoveryConflictInterrupt(ProcSignalReason reason) * return; another interrupt could 
have arrived. But we promise that * any pre-existing one will have been serviced.) */ -void -ProcessInterrupts(void) +static void +ProcessInterrupts_pg(void) { /* OK to accept any interrupts now? */ if (InterruptHoldoffCount != 0 || CritSectionCount != 0) @@ -3406,6 +3406,38 @@ ProcessInterrupts(void) ProcessLogMemoryContextInterrupt(); } +void +ProcessInterrupts(void) +{ + uint64 lag; + + if (InterruptHoldoffCount != 0 || CritSectionCount != 0) + return; + + // Don't throttle read only transactions and wal sender + if (am_walsender || !TransactionIdIsValid(GetCurrentTransactionIdIfAny())) + { + ProcessInterrupts_pg(); + return; + } + + #define BACK_PRESSURE_DELAY 10000L // 0.01 sec + while(true) + { + ProcessInterrupts_pg(); + + // Suspend writers until replicas catch up + lag = backpressure_lag(); + if (lag <= 0) + break; + + set_ps_display("backpressure throttling"); + + elog(DEBUG2, "backpressure throttling: lag %lu", lag); + pg_usleep(BACK_PRESSURE_DELAY); + } +} + /* * IA64-specific code to fetch the AR.BSP register for stack depth checks. diff --git a/src/backend/tcop/zenith_wal_redo.c b/src/backend/tcop/zenith_wal_redo.c new file mode 100644 index 00000000000..02ca133d37a --- /dev/null +++ b/src/backend/tcop/zenith_wal_redo.c @@ -0,0 +1,813 @@ +/*------------------------------------------------------------------------- + * + * zenith_wal_redo.c + * Entry point for WAL redo helper + * + * + * This file contains an alternative main() function for the 'postgres' + * binary. In the special mode, we go into a special mode that's similar + * to the single user mode. We don't launch postmaster or any auxiliary + * processes. Instead, we wait for command from 'stdin', and respond to + * 'stdout'. + * + * The protocol through stdin/stdout is loosely based on the libpq protocol. 
+ * The process accepts messages through stdin, and each message has the format: + * + * char msgtype; + * int32 length; // length of message including 'length' but excluding + * // 'msgtype', in network byte order + * + * + * There are three message types: + * + * BeginRedoForBlock ('B'): Prepare for WAL replay for given block + * PushPage ('P'): Copy a page image (in the payload) to buffer cache + * ApplyRecord ('A'): Apply a WAL record (in the payload) + * GetPage ('G'): Return a page image from buffer cache. + * + * Currently, you only get a response to GetPage requests; the response is + * simply a 8k page, without any headers. Errors are logged to stderr. + * + * FIXME: + * - this currently requires a valid PGDATA, and creates a lock file there + * like a normal postmaster. There's no fundamental reason for that, though. + * - should have EndRedoForBlock, and flush page cache, to allow using this + * mechanism for more than one block without restarting the process. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/tcop/zenith_wal_redo.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include +#include +#include +#include +#include +#ifdef HAVE_SYS_SELECT_H +#include +#endif +#ifdef HAVE_SYS_RESOURCE_H +#include +#include +#endif + +#if defined(HAVE_LIBSECCOMP) && defined(__GLIBC__) +#define MALLOC_NO_MMAP +#include +#endif + +#ifndef HAVE_GETRUSAGE +#include "rusagestub.h" +#endif + +#include "access/xlog.h" +#include "access/xlog_internal.h" +#include "access/xlogutils.h" +#include "catalog/pg_class.h" +#include "libpq/libpq.h" +#include "libpq/pqformat.h" +#include "miscadmin.h" +#include "postmaster/postmaster.h" +#include "postmaster/seccomp.h" +#include "storage/buf_internals.h" +#include "storage/bufmgr.h" +#include "storage/ipc.h" +#include 
"storage/proc.h" +#include "storage/smgr.h" +#include "tcop/tcopprot.h" +#include "utils/memutils.h" +#include "utils/ps_status.h" + +static int ReadRedoCommand(StringInfo inBuf); +static void BeginRedoForBlock(StringInfo input_message); +static void PushPage(StringInfo input_message); +static void ApplyRecord(StringInfo input_message); +static void apply_error_callback(void *arg); +static bool redo_block_filter(XLogReaderState *record, uint8 block_id); +static void GetPage(StringInfo input_message); +static ssize_t buffered_read(void *buf, size_t count); + +static BufferTag target_redo_tag; + +Buffer wal_redo_buffer; +bool am_wal_redo_postgres; + +static XLogReaderState *reader_state; + +#define TRACE DEBUG5 + +#ifdef HAVE_LIBSECCOMP +static void +enter_seccomp_mode(void) +{ + PgSeccompRule syscalls[] = + { + /* Hard requirements */ + PG_SCMP_ALLOW(exit_group), + PG_SCMP_ALLOW(pselect6), + PG_SCMP_ALLOW(read), + PG_SCMP_ALLOW(select), + PG_SCMP_ALLOW(write), + + /* Memory allocation */ + PG_SCMP_ALLOW(brk), +#ifndef MALLOC_NO_MMAP + /* TODO: musl doesn't have mallopt */ + PG_SCMP_ALLOW(mmap), + PG_SCMP_ALLOW(munmap), +#endif + /* + * getpid() is called on assertion failure, in ExceptionalCondition. + * It's not really needed, but seems pointless to hide it either. The + * system call unlikely to expose a kernel vulnerability, and the PID + * is stored in MyProcPid anyway. + */ + PG_SCMP_ALLOW(getpid), + + /* Enable those for a proper shutdown. + PG_SCMP_ALLOW(munmap), + PG_SCMP_ALLOW(shmctl), + PG_SCMP_ALLOW(shmdt), + PG_SCMP_ALLOW(unlink), // shm_unlink + */ + }; + +#ifdef MALLOC_NO_MMAP + /* Ask glibc not to use mmap() */ + mallopt(M_MMAP_MAX, 0); +#endif + + seccomp_load_rules(syscalls, lengthof(syscalls)); +} +#endif + +/* ---------------------------------------------------------------- + * FIXME comment + * PostgresMain + * postgres main loop -- all backends, interactive or otherwise start here + * + * argc/argv are the command line arguments to be used. 
(When being forked + * by the postmaster, these are not the original argv array of the process.) + * dbname is the name of the database to connect to, or NULL if the database + * name should be extracted from the command line arguments or defaulted. + * username is the PostgreSQL user name to be used for the session. + * ---------------------------------------------------------------- + */ +void +WalRedoMain(int argc, char *argv[], + const char *dbname, + const char *username) +{ + int firstchar; + StringInfoData input_message; +#ifdef HAVE_LIBSECCOMP + bool enable_seccomp; +#endif + + /* Initialize startup process environment if necessary. */ + InitStandaloneProcess(argv[0]); + + am_wal_redo_postgres = true; + + /* + * Set default values for command-line options. + */ + InitializeGUCOptions(); + + /* + * WAL redo does not need a large number of buffers. And speed of + * DropRelFileNodeAllLocalBuffers() is proportional to the number of + * buffers. So let's keep it small (default value is 1024) + */ + num_temp_buffers = 4; + + /* + * Parse command-line options. + * TODO + */ + //process_postgres_switches(argc, argv, PGC_POSTMASTER, &dbname); + + /* Acquire configuration parameters */ + if (!SelectConfigFiles(NULL, progname)) + proc_exit(1); + + /* + * Validate we have been given a reasonable-looking DataDir and change into it. + */ + checkDataDir(); + ChangeToDataDir(); + + /* + * Create lockfile for data directory. + */ + CreateDataDirLockFile(false); + + /* read control file (error checking and contains config ) */ + LocalProcessControlFile(false); + + /* + * process any libraries that should be preloaded at postmaster start + */ + process_shared_preload_libraries(); + + /* Initialize MaxBackends (if under postmaster, was done already) */ + InitializeMaxBackends(); + + /* + * Give preloaded libraries a chance to request additional shared memory. 
+ */ + process_shmem_requests(); + + /* + * Now that loadable modules have had their chance to request additional + * shared memory, determine the value of any runtime-computed GUCs that + * depend on the amount of shared memory required. + */ + InitializeShmemGUCs(); + + /* + * Now that modules have been loaded, we can process any custom resource + * managers specified in the wal_consistency_checking GUC. + */ + InitializeWalConsistencyChecking(); + + CreateSharedMemoryAndSemaphores(); + + /* + * Remember stand-alone backend startup time, roughly at the same point + * during startup that postmaster does so. + */ + PgStartTime = GetCurrentTimestamp(); + + /* + * Create a per-backend PGPROC struct in shared memory. We must do this + * before we can use LWLocks. + */ + InitProcess(); + + SetProcessingMode(InitProcessing); + + /* Early initialization */ + BaseInit(); + + SetProcessingMode(NormalProcessing); + + /* Redo routines won't work if we're not "in recovery" */ + InRecovery = true; + + /* + * Create the memory context we will use in the main loop. + * + * MessageContext is reset once per iteration of the main loop, ie, upon + * completion of processing of each command message from the client. 
+ */ + MessageContext = AllocSetContextCreate(TopMemoryContext, + "MessageContext", + ALLOCSET_DEFAULT_SIZES); + + /* we need a ResourceOwner to hold buffer pins */ + Assert(CurrentResourceOwner == NULL); + CurrentResourceOwner = ResourceOwnerCreate(NULL, "wal redo"); + + /* Initialize resource managers */ + for (int rmid = 0; rmid <= RM_MAX_ID; rmid++) + { + if (RmgrTable[rmid].rm_startup != NULL) + RmgrTable[rmid].rm_startup(); + } + reader_state = XLogReaderAllocate(wal_segment_size, NULL, XL_ROUTINE(), NULL); + +#ifdef HAVE_LIBSECCOMP + /* We prefer opt-out to opt-in for greater security */ + enable_seccomp = true; + for (int i = 1; i < argc; i++) + if (strcmp(argv[i], "--disable-seccomp") == 0) + enable_seccomp = false; + + /* + * We deliberately delay the transition to the seccomp mode + * until it's time to enter the main processing loop; + * else we'd have to add a lot more syscalls to the allowlist. + */ + if (enable_seccomp) + enter_seccomp_mode(); +#endif + + /* + * Main processing loop + */ + MemoryContextSwitchTo(MessageContext); + initStringInfo(&input_message); + + for (;;) + { + /* Release memory left over from prior query cycle. */ + resetStringInfo(&input_message); + + set_ps_display("idle"); + + /* + * (3) read a command (loop blocks here) + */ + firstchar = ReadRedoCommand(&input_message); + switch (firstchar) + { + case 'B': /* BeginRedoForBlock */ + BeginRedoForBlock(&input_message); + break; + + case 'P': /* PushPage */ + PushPage(&input_message); + break; + + case 'A': /* ApplyRecord */ + ApplyRecord(&input_message); + break; + + case 'G': /* GetPage */ + GetPage(&input_message); + break; + + /* + * EOF means we're done. Perform normal shutdown. + */ + case EOF: + ereport(LOG, + (errmsg("received EOF on stdin, shutting down"))); + +#ifdef HAVE_LIBSECCOMP + /* + * Skip the shutdown sequence, leaving some garbage behind. + * Hopefully, postgres will clean it up in the next run. 
+ * This way we don't have to enable extra syscalls, which is nice. + * See enter_seccomp_mode() above. + */ + if (enable_seccomp) + _exit(0); +#endif + /* + * NOTE: if you are tempted to add more code here, DON'T! + * Whatever you had in mind to do should be set up as an + * on_proc_exit or on_shmem_exit callback, instead. Otherwise + * it will fail to be called during other backend-shutdown + * scenarios. + */ + proc_exit(0); + + default: + ereport(FATAL, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("invalid frontend message type %d", + firstchar))); + } + } /* end of input-reading loop */ +} + +/* + * Some debug function that may be handy for now. + */ +pg_attribute_unused() +static char * +pprint_buffer(char *data, int len) +{ + StringInfoData s; + initStringInfo(&s); + appendStringInfo(&s, "\n"); + for (int i = 0; i < len; i++) { + + appendStringInfo(&s, "%02x ", (*(((char *) data) + i) & 0xff) ); + if (i % 32 == 31) { + appendStringInfo(&s, "\n"); + } + } + appendStringInfo(&s, "\n"); + + return s.data; +} + +/* ---------------------------------------------------------------- + * routines to obtain user input + * ---------------------------------------------------------------- + */ + +/* + * Read next command from the client. + * + * the string entered by the user is placed in its parameter inBuf, + * and we act like a Q message was received. + * + * EOF is returned if end-of-file input is seen; time to shut down. 
+ * ---------------- + */ +static int +ReadRedoCommand(StringInfo inBuf) +{ + ssize_t ret; + char hdr[1 + sizeof(int32)]; + int qtype; + int32 len; + + /* Read message type and message length */ + ret = buffered_read(hdr, sizeof(hdr)); + if (ret != sizeof(hdr)) + { + if (ret == 0) + return EOF; + else if (ret < 0) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("could not read message header: %m"))); + else + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("unexpected EOF"))); + } + + qtype = hdr[0]; + memcpy(&len, &hdr[1], sizeof(int32)); + len = pg_ntoh32(len); + + if (len < 4) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("invalid message length"))); + + len -= 4; /* discount length itself */ + + /* Read the message payload */ + enlargeStringInfo(inBuf, len); + ret = buffered_read(inBuf->data, len); + if (ret != len) + { + if (ret < 0) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("could not read message: %m"))); + else + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("unexpected EOF"))); + } + inBuf->len = len; + inBuf->data[len] = '\0'; + + return qtype; +} + +/* + * Prepare for WAL replay on given block + */ +static void +BeginRedoForBlock(StringInfo input_message) +{ + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blknum; + SMgrRelation reln; + + /* + * message format: + * + * spcNode + * dbNode + * relNode + * ForkNumber + * BlockNumber + */ + forknum = pq_getmsgbyte(input_message); + rnode.spcNode = pq_getmsgint(input_message, 4); + rnode.dbNode = pq_getmsgint(input_message, 4); + rnode.relNode = pq_getmsgint(input_message, 4); + blknum = pq_getmsgint(input_message, 4); + + INIT_BUFFERTAG(target_redo_tag, rnode, forknum, blknum); + + elog(TRACE, "BeginRedoForBlock %u/%u/%u.%d blk %u", + target_redo_tag.rnode.spcNode, + target_redo_tag.rnode.dbNode, + target_redo_tag.rnode.relNode, + target_redo_tag.forkNum, + target_redo_tag.blockNum); + + reln = 
smgropen(rnode, InvalidBackendId, RELPERSISTENCE_PERMANENT); + if (reln->smgr_cached_nblocks[forknum] == InvalidBlockNumber || + reln->smgr_cached_nblocks[forknum] < blknum + 1) + { + reln->smgr_cached_nblocks[forknum] = blknum + 1; + } +} + +/* + * Receive a page given by the client, and put it into buffer cache. + */ +static void +PushPage(StringInfo input_message) +{ + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blknum; + const char *content; + Buffer buf; + Page page; + + /* + * message format: + * + * spcNode + * dbNode + * relNode + * ForkNumber + * BlockNumber + * 8k page content + */ + forknum = pq_getmsgbyte(input_message); + rnode.spcNode = pq_getmsgint(input_message, 4); + rnode.dbNode = pq_getmsgint(input_message, 4); + rnode.relNode = pq_getmsgint(input_message, 4); + blknum = pq_getmsgint(input_message, 4); + content = pq_getmsgbytes(input_message, BLCKSZ); + + //FIXME assume relpersistence permanent. Is it always true? + buf = ReadBufferWithoutRelcache(rnode, forknum, blknum, RBM_ZERO_AND_LOCK, NULL, true); + wal_redo_buffer = buf; + page = BufferGetPage(buf); + + memcpy(page, content, BLCKSZ); + MarkBufferDirty(buf); /* pro forma */ + UnlockReleaseBuffer(buf); +} + +/* + * Receive a WAL record, and apply it. + * + * All the pages should be loaded into the buffer cache by PushPage calls already. 
+ */ +static void +ApplyRecord(StringInfo input_message) +{ + char *errormsg; + XLogRecPtr lsn; + XLogRecord *record; + int nleft; + ErrorContextCallback errcallback; + DecodedXLogRecord *decoded = NULL; + + /* + * message format: + * + * LSN (the *end* of the record) + * record + */ + lsn = pq_getmsgint64(input_message); + + smgrinit(); /* reset inmem smgr state */ + + nleft = input_message->len - input_message->cursor; + /* note: the input must be aligned here */ + record = (XLogRecord *) pq_getmsgbytes(input_message, nleft); + + if (record->xl_tot_len != nleft) + elog(ERROR, "mismatch between record (%d) and message size (%d)", + record->xl_tot_len, nleft); + + /* Setup error traceback support for ereport() */ + errcallback.callback = apply_error_callback; + errcallback.arg = (void *) reader_state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + XLogBeginRead(reader_state, lsn); + reader_state->ReadRecPtr = lsn; + + //FIXME Should we use XLogReadRecordAlloc instead? + decoded = (DecodedXLogRecord *) + palloc(DecodeXLogRecordRequiredSpace(record->xl_tot_len)); + + if (!DecodeXLogRecord(reader_state, decoded, record, lsn, &errormsg)) + elog(ERROR, "failed to decode WAL record: %s", errormsg); + else + { + /* Record the location of the next record. */ + decoded->next_lsn = reader_state->NextRecPtr; + + /* + * If it's in the decode buffer, mark the decode buffer space as + * occupied. + */ + if (!decoded->oversized) + { + /* The new decode buffer head must be MAXALIGNed. */ + Assert(decoded->size == MAXALIGN(decoded->size)); + if ((char *) decoded == reader_state->decode_buffer) + reader_state->decode_buffer_tail = reader_state->decode_buffer + decoded->size; + else + reader_state->decode_buffer_tail += decoded->size; + } + + /* Insert it into the queue of decoded records. 
*/ + Assert(reader_state->decode_queue_tail != decoded); + if (reader_state->decode_queue_tail) + reader_state->decode_queue_tail->next = decoded; + reader_state->decode_queue_tail = decoded; + if (!reader_state->decode_queue_head) + reader_state->decode_queue_head = decoded; + + + /* + * Update the pointers to the beginning and one-past-the-end of this + * record, again for the benefit of historical code that expected the + * decoder to track this rather than accessing these fields of the record + * itself. + */ + reader_state->record = reader_state->decode_queue_head; + reader_state->ReadRecPtr = reader_state->record->lsn; + reader_state->EndRecPtr = reader_state->record->next_lsn; + + } + + + /* Ignore any other blocks than the ones the caller is interested in */ + redo_read_buffer_filter = redo_block_filter; + + RmgrTable[record->xl_rmid].rm_redo(reader_state); + + redo_read_buffer_filter = NULL; + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; + + elog(TRACE, "applied WAL record with LSN %X/%X", + (uint32) (lsn >> 32), (uint32) lsn); +} + +/* + * Error context callback for errors occurring during ApplyRecord + */ +static void +apply_error_callback(void *arg) +{ + XLogReaderState *record = (XLogReaderState *) arg; + StringInfoData buf; + + + initStringInfo(&buf); + xlog_outdesc(&buf, record); + + /* translator: %s is a WAL record description */ + errcontext("WAL redo at %X/%X for %s", + LSN_FORMAT_ARGS(record->ReadRecPtr), + buf.data); + + + pfree(buf.data); +} + +static bool +redo_block_filter(XLogReaderState *record, uint8 block_id) +{ + BufferTag target_tag; + + XLogRecGetBlockTag(record, block_id, + &target_tag.rnode, &target_tag.forkNum, &target_tag.blockNum); + + /* + * Can a WAL redo function ever access a relation other than the one that + * it modifies? I don't see why it would. 
+ */ + if (!RelFileNodeEquals(target_tag.rnode, target_redo_tag.rnode)) + elog(WARNING, "REDO accessing unexpected page: %u/%u/%u.%u blk %u", + target_tag.rnode.spcNode, target_tag.rnode.dbNode, target_tag.rnode.relNode, target_tag.forkNum, target_tag.blockNum); + + /* + * If this block isn't one we are currently restoring, then return 'true' + * so that this gets ignored + */ + return !BUFFERTAGS_EQUAL(target_tag, target_redo_tag); +} + +/* + * Get a page image back from buffer cache. + * + * After applying some records. + */ +static void +GetPage(StringInfo input_message) +{ + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blknum; + Buffer buf; + Page page; + int tot_written; + + /* + * message format: + * + * spcNode + * dbNode + * relNode + * ForkNumber + * BlockNumber + */ + forknum = pq_getmsgbyte(input_message); + rnode.spcNode = pq_getmsgint(input_message, 4); + rnode.dbNode = pq_getmsgint(input_message, 4); + rnode.relNode = pq_getmsgint(input_message, 4); + blknum = pq_getmsgint(input_message, 4); + + /* FIXME: check that we got a BeginRedoForBlock message or this earlier */ + + + //FIXME assume relpersistence permanent. Is it always true? 
+ buf = ReadBufferWithoutRelcache(rnode, forknum, blknum, RBM_NORMAL, NULL, true); + page = BufferGetPage(buf); + /* single thread, so don't bother locking the page */ + + /* Response: Page content */ + tot_written = 0; + do { + ssize_t rc; + + rc = write(STDOUT_FILENO, &page[tot_written], BLCKSZ - tot_written); + if (rc < 0) { + /* If interrupted by signal, just retry */ + if (errno == EINTR) + continue; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to stdout: %m"))); + } + tot_written += rc; + } while (tot_written < BLCKSZ); + + ReleaseBuffer(buf); + DropRelFileNodeAllLocalBuffers(rnode); + + elog(TRACE, "Page sent back for block %u", blknum); +} + + +/* Buffer used by buffered_read() */ +static char stdin_buf[16 * 1024]; +static size_t stdin_len = 0; /* # of bytes in buffer */ +static size_t stdin_ptr = 0; /* # of bytes already consumed */ + +/* + * Like read() on stdin, but buffered. + * + * We cannot use libc's buffered fread(), because it uses syscalls that we + * have disabled with seccomp(). Depending on the platform, it can call + * 'fstat' or 'newfstatat'. 'fstat' is probably harmless, but 'newfstatat' + * seems problematic because it allows interrogating files by path name. + * + * The return value is the number of bytes read. On error, -1 is returned, and + * errno is set appropriately. Unlike read(), this fills the buffer completely + * unless an error happens or EOF is reached. 
+ */ +static ssize_t +buffered_read(void *buf, size_t count) +{ + char *dst = buf; + + while (count > 0) + { + size_t nthis; + + if (stdin_ptr == stdin_len) + { + ssize_t ret; + + ret = read(STDIN_FILENO, stdin_buf, sizeof(stdin_buf)); + if (ret < 0) + { + /* don't do anything here that could set 'errno' */ + return ret; + } + if (ret == 0) + { + /* EOF */ + break; + } + stdin_len = (size_t) ret; + stdin_ptr = 0; + } + nthis = Min(stdin_len - stdin_ptr, count); + + memcpy(dst, &stdin_buf[stdin_ptr], nthis); + + stdin_ptr += nthis; + count -= nthis; + dst += nthis; + } + + return (dst - (char *) buf); +} diff --git a/src/backend/utils/activity/wait_event.c b/src/backend/utils/activity/wait_event.c index 87c15b9c6f3..5c11993a6d5 100644 --- a/src/backend/utils/activity/wait_event.c +++ b/src/backend/utils/activity/wait_event.c @@ -503,6 +503,9 @@ pgstat_get_wait_timeout(WaitEventTimeout w) case WAIT_EVENT_VACUUM_TRUNCATE: event_name = "VacuumTruncate"; break; + case WAIT_EVENT_BACK_PRESSURE: + event_name = "BackPressure"; + break; /* no default case, so that compiler will warn */ } diff --git a/src/backend/utils/adt/dbsize.c b/src/backend/utils/adt/dbsize.c index b4a2c8d2197..7e5b9da4032 100644 --- a/src/backend/utils/adt/dbsize.c +++ b/src/backend/utils/adt/dbsize.c @@ -23,6 +23,7 @@ #include "commands/tablespace.h" #include "miscadmin.h" #include "storage/fd.h" +#include "storage/smgr.h" #include "utils/acl.h" #include "utils/builtins.h" #include "utils/numeric.h" @@ -98,6 +99,8 @@ db_dir_size(const char *path) return dirsize; } +dbsize_hook_type dbsize_hook = NULL; + /* * calculate size of database in all tablespaces */ @@ -127,6 +130,13 @@ calculate_database_size(Oid dbOid) /* Include pg_default storage */ snprintf(pathname, sizeof(pathname), "base/%u", dbOid); + + if (dbsize_hook) + { + totalsize = (*dbsize_hook)(dbOid); + return totalsize; + } + totalsize = db_dir_size(pathname); /* Scan the non-default tablespaces */ @@ -292,41 +302,17 @@ 
pg_tablespace_size_name(PG_FUNCTION_ARGS) * is no check here or at the call sites for that. */ static int64 -calculate_relation_size(RelFileNode *rfn, BackendId backend, ForkNumber forknum) +calculate_relation_size(RelFileNode *rfn, BackendId backend, ForkNumber forknum, char relpersistence) { - int64 totalsize = 0; - char *relationpath; - char pathname[MAXPGPATH]; - unsigned int segcount = 0; + SMgrRelation srel = smgropen(*rfn, backend, relpersistence); - relationpath = relpathbackend(*rfn, backend, forknum); - - for (segcount = 0;; segcount++) + if (smgrexists(srel, forknum)) { - struct stat fst; - - CHECK_FOR_INTERRUPTS(); - - if (segcount == 0) - snprintf(pathname, MAXPGPATH, "%s", - relationpath); - else - snprintf(pathname, MAXPGPATH, "%s.%u", - relationpath, segcount); + BlockNumber n = smgrnblocks(srel, forknum); - if (stat(pathname, &fst) < 0) - { - if (errno == ENOENT) - break; - else - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not stat file \"%s\": %m", pathname))); - } - totalsize += fst.st_size; + return (int64) n * BLCKSZ; } - - return totalsize; + return 0; } Datum @@ -350,7 +336,8 @@ pg_relation_size(PG_FUNCTION_ARGS) PG_RETURN_NULL(); size = calculate_relation_size(&(rel->rd_node), rel->rd_backend, - forkname_to_number(text_to_cstring(forkName))); + forkname_to_number(text_to_cstring(forkName)), + rel->rd_rel->relpersistence); relation_close(rel, AccessShareLock); @@ -375,7 +362,8 @@ calculate_toast_table_size(Oid toastrelid) /* toast heap size, including FSM and VM size */ for (forkNum = 0; forkNum <= MAX_FORKNUM; forkNum++) size += calculate_relation_size(&(toastRel->rd_node), - toastRel->rd_backend, forkNum); + toastRel->rd_backend, forkNum, + toastRel->rd_rel->relpersistence); /* toast index size, including FSM and VM size */ indexlist = RelationGetIndexList(toastRel); @@ -389,7 +377,8 @@ calculate_toast_table_size(Oid toastrelid) AccessShareLock); for (forkNum = 0; forkNum <= MAX_FORKNUM; forkNum++) size += 
calculate_relation_size(&(toastIdxRel->rd_node), - toastIdxRel->rd_backend, forkNum); + toastIdxRel->rd_backend, forkNum, + toastIdxRel->rd_rel->relpersistence); relation_close(toastIdxRel, AccessShareLock); } @@ -418,7 +407,8 @@ calculate_table_size(Relation rel) */ for (forkNum = 0; forkNum <= MAX_FORKNUM; forkNum++) size += calculate_relation_size(&(rel->rd_node), rel->rd_backend, - forkNum); + forkNum, + rel->rd_rel->relpersistence); /* * Size of toast relation @@ -458,7 +448,8 @@ calculate_indexes_size(Relation rel) for (forkNum = 0; forkNum <= MAX_FORKNUM; forkNum++) size += calculate_relation_size(&(idxRel->rd_node), idxRel->rd_backend, - forkNum); + forkNum, + idxRel->rd_rel->relpersistence); relation_close(idxRel, AccessShareLock); } diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index c410ba532d2..91c4312250d 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -41,6 +41,7 @@ #include "access/twophase.h" #include "access/xact.h" #include "access/xlog_internal.h" +#include "access/xloginsert.h" #include "access/xlogprefetcher.h" #include "access/xlogrecovery.h" #include "catalog/namespace.h" @@ -85,6 +86,7 @@ #include "replication/syncrep.h" #include "replication/walreceiver.h" #include "replication/walsender.h" +#include "replication/walproposer.h" #include "storage/bufmgr.h" #include "storage/dsm_impl.h" #include "storage/fd.h" @@ -92,6 +94,7 @@ #include "storage/pg_shmem.h" #include "storage/predicate.h" #include "storage/proc.h" +#include "storage/smgr.h" #include "storage/standby.h" #include "tcop/tcopprot.h" #include "tsearch/ts_cache.h" @@ -191,6 +194,7 @@ static int syslog_facility = 0; static void assign_syslog_facility(int newval, void *extra); static void assign_syslog_ident(const char *newval, void *extra); static void assign_session_replication_role(int newval, void *extra); + static bool check_temp_buffers(int *newval, void **extra, GucSource source); static bool check_bonjour(bool *newval, 
void **extra, GucSource source); static bool check_ssl(bool *newval, void **extra, GucSource source); @@ -2172,6 +2176,16 @@ static struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, + { + {"neon_test_evict", PGC_POSTMASTER, UNGROUPED, + gettext_noop("Evict unpinned pages (for better test coverage)"), + }, + &zenith_test_evict, + false, + NULL, NULL, NULL + }, + + /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL, NULL @@ -2347,6 +2361,28 @@ static struct config_int ConfigureNamesInt[] = NULL, NULL, NULL }, + { + {"wal_acceptor_reconnect", PGC_SIGHUP, REPLICATION_STANDBY, + gettext_noop("Timeout for reconnecting to offline wal acceptor."), + NULL, + GUC_UNIT_MS + }, + &wal_acceptor_reconnect_timeout, + 1000, 0, INT_MAX, + NULL, NULL, NULL + }, + + { + {"wal_acceptor_connect_timeout", PGC_SIGHUP, REPLICATION_STANDBY, + gettext_noop("Timeout after which give up connection attempt to safekeeper."), + NULL, + GUC_UNIT_MS + }, + &wal_acceptor_connect_timeout, + 5000, 0, INT_MAX, + NULL, NULL, NULL + }, + { {"max_connections", PGC_POSTMASTER, CONN_AUTH_SETTINGS, gettext_noop("Sets the maximum number of concurrent connections."), @@ -2973,6 +3009,42 @@ static struct config_int ConfigureNamesInt[] = NULL, NULL, NULL }, + { + {"max_replication_apply_lag", PGC_POSTMASTER, REPLICATION_SENDING, + gettext_noop("Maximal write lag between master and replicas."), + gettext_noop("When lag between minimal apply position of replica and current LSN exceeds this value," + "backends are blocked."), + GUC_UNIT_MB, + }, + &max_replication_apply_lag, + -1, -1, INT_MAX, /* it should not be smaller than maximal size of WAL record */ + NULL, NULL, NULL + }, + + { + {"max_replication_flush_lag", PGC_POSTMASTER, REPLICATION_SENDING, + gettext_noop("Maximal flush lag between master and replicas."), + gettext_noop("When lag between minimal flush position of replica and current LSN exceeds this value," + "backends are blocked"), + GUC_UNIT_MB, + }, + 
&max_replication_flush_lag, + -1, -1, INT_MAX, /* it should not be smaller than maximal size of WAL record */ + NULL, NULL, NULL + }, + + { + {"max_replication_write_lag", PGC_POSTMASTER, REPLICATION_SENDING, + gettext_noop("Maximal write lag between master and replicas."), + gettext_noop("When lag between minimal write position of replica and current LSN exceeds this value," + "backends are blocked"), + GUC_UNIT_MB, + }, + &max_replication_write_lag, + -1, -1, INT_MAX, /* it should not be smaller than maximal size of WAL record */ + NULL, NULL, NULL + }, + { {"max_slot_wal_keep_size", PGC_SIGHUP, REPLICATION_SENDING, gettext_noop("Sets the maximum WAL size that can be reserved by replication slots."), @@ -4711,6 +4783,17 @@ static struct config_string ConfigureNamesString[] = check_backtrace_functions, assign_backtrace_functions, NULL }, + { + {"safekeepers", PGC_POSTMASTER, UNGROUPED, + gettext_noop("List of Neon WAL acceptors (host:port)"), + NULL, + GUC_LIST_INPUT | GUC_LIST_QUOTE + }, + &wal_acceptors_list, + "", + NULL, NULL, NULL + }, + /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, NULL, NULL, NULL, NULL @@ -12229,6 +12312,7 @@ assign_session_replication_role(int newval, void *extra) ResetPlanCache(); } + static bool check_temp_buffers(int *newval, void **extra, GucSource source) { diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index 429844fdd35..a20326694f4 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -2859,6 +2859,7 @@ main(int argc, char *argv[]) {"discard-caches", no_argument, NULL, 14}, {"locale-provider", required_argument, NULL, 15}, {"icu-locale", required_argument, NULL, 16}, + {"sysid", required_argument, NULL, 17}, {NULL, 0, NULL, 0} }; @@ -3016,6 +3017,9 @@ main(int argc, char *argv[]) case 16: icu_locale = pg_strdup(optarg); break; + case 17: + boot_options = psprintf("%s -s %s", boot_options, optarg); + break; default: /* getopt_long already emitted a complaint */ pg_log_error_hint("Try \"%s 
--help\" for more information.", progname); diff --git a/src/bin/pg_waldump/pg_waldump.c b/src/bin/pg_waldump/pg_waldump.c index 5dc60109b12..d408315ec25 100644 --- a/src/bin/pg_waldump/pg_waldump.c +++ b/src/bin/pg_waldump/pg_waldump.c @@ -13,6 +13,7 @@ #include "postgres.h" #include +#include #include #include #include @@ -25,8 +26,11 @@ #include "common/fe_memutils.h" #include "common/logging.h" #include "getopt_long.h" +#include "port/pg_bitutils.h" #include "rmgrdesc.h" +#define OFFSET_INVALID ((size_t)-1) + /* * NOTE: For any code change or issue fix here, it is highly recommended to * give a thought about doing the same in pg_walinspect contrib module as well. @@ -45,8 +49,10 @@ typedef struct XLogDumpPrivate XLogRecPtr startptr; XLogRecPtr endptr; bool endptr_reached; + char* input_filename; } XLogDumpPrivate; + typedef struct XLogDumpConfig { /* display options */ @@ -58,6 +64,8 @@ typedef struct XLogDumpConfig bool stats; bool stats_per_record; + bool ignore_format_errors; + /* filter options */ bool filter_by_rmgr[RM_MAX_ID + 1]; bool filter_by_rmgr_enabled; @@ -86,6 +94,34 @@ sigint_handler(int signum) } #endif +/* calculate ceil(log base 2) of num */ +static int +my_log2(long num) +{ + /* + * guard against too-large input, which would be invalid for + * pg_ceil_log2_*() + */ + if (num > LONG_MAX / 2) + num = LONG_MAX / 2; + +#if SIZEOF_LONG < 8 + return pg_ceil_log2_32(num); +#else + return pg_ceil_log2_64(num); +#endif +} + +/* calculate first power of 2 >= num, bounded to what will fit in an int */ +static int +next_pow2_int(long num) +{ + if (num > INT_MAX / 2) + num = INT_MAX / 2; + return 1 << my_log2(num); +} + + static void print_rmgr_list(void) { @@ -298,6 +334,18 @@ WALDumpOpenSegment(XLogReaderState *state, XLogSegNo nextSegNo, TimeLineID tli = *tli_p; char fname[MAXPGPATH]; int tries; + XLogDumpPrivate *private = state->private_data; + + if(private->input_filename) + { + Assert(nextSegNo == 0); + + state->seg.ws_file = 
open_file_in_directory(state->segcxt.ws_dir, private->input_filename); + if (state->seg.ws_file >= 0) + return; + + pg_fatal("could not open file \"%s\": %m", private->input_filename); + } XLogFileName(fname, tli, nextSegNo, state->segcxt.ws_segsize); @@ -368,6 +416,7 @@ WALDumpReadPage(XLogReaderState *state, XLogRecPtr targetPagePtr, int reqLen, { WALOpenSegment *seg = &errinfo.wre_seg; char fname[MAXPGPATH]; + char *actual_fname = private->input_filename ? private->input_filename : fname; XLogFileName(fname, seg->ws_tli, seg->ws_segno, state->segcxt.ws_segsize); @@ -376,11 +425,11 @@ WALDumpReadPage(XLogReaderState *state, XLogRecPtr targetPagePtr, int reqLen, { errno = errinfo.wre_errno; pg_fatal("could not read from file %s, offset %d: %m", - fname, errinfo.wre_off); + actual_fname, errinfo.wre_off); } else pg_fatal("could not read from file %s, offset %d: read %d of %d", - fname, errinfo.wre_off, errinfo.wre_read, + actual_fname, errinfo.wre_off, errinfo.wre_read, errinfo.wre_req); } @@ -451,16 +500,26 @@ XLogDumpDisplayRecord(XLogDumpConfig *config, XLogReaderState *record) uint32 fpi_len; uint8 info = XLogRecGetInfo(record); XLogRecPtr xl_prev = XLogRecGetPrev(record); + XLogDumpPrivate *private = record->private_data; StringInfoData s; XLogRecGetLen(record, &rec_len, &fpi_len); - printf("rmgr: %-11s len (rec/tot): %6u/%6u, tx: %10u, lsn: %X/%08X, prev %X/%08X, ", - desc->rm_name, - rec_len, XLogRecGetTotalLen(record), - XLogRecGetXid(record), - LSN_FORMAT_ARGS(record->ReadRecPtr), - LSN_FORMAT_ARGS(xl_prev)); + if(private->input_filename) + printf("rmgr: %-11s len (rec/tot): %6u/%6u, tx: %10u, offset: 0x%lX, prev %X/%08X, ", + desc->rm_name, + rec_len, XLogRecGetTotalLen(record), + XLogRecGetXid(record), + record->ReadRecPtr, + LSN_FORMAT_ARGS(xl_prev)); + else + printf("rmgr: %-11s len (rec/tot): %6u/%6u, tx: %10u, lsn: %X/%08X, prev %X/%08X, ", + desc->rm_name, + rec_len, XLogRecGetTotalLen(record), + XLogRecGetXid(record), + 
LSN_FORMAT_ARGS(record->ReadRecPtr), + LSN_FORMAT_ARGS(xl_prev)); + id = desc->rm_identify(info); if (id == NULL) @@ -666,7 +725,10 @@ usage(void) printf(_(" -f, --follow keep retrying after reaching end of WAL\n")); printf(_(" -F, --fork=FORK only show records that modify blocks in fork FORK;\n" " valid names are main, fsm, vm, init\n")); + printf(_(" -i, --ignore ignore format errors, skip invalid structures\n")); + printf(_(" -N, --file=FNAME dump log records from a single file\n")); printf(_(" -n, --limit=N number of records to display\n")); + printf(_(" -o, --offset=OFFSET offset of the first record to in a file to dump\n")); printf(_(" -p, --path=PATH directory in which to find log segment files or a\n" " directory with a ./pg_wal that contains such files\n" " (default: current directory, ./pg_wal, $PGDATA/pg_wal)\n")); @@ -700,6 +762,9 @@ main(int argc, char **argv) XLogRecPtr first_record; char *waldir = NULL; char *errormsg; + char *fname = NULL; + bool single_file = false; + size_t start_offset = OFFSET_INVALID; static struct option long_options[] = { {"bkp-details", no_argument, NULL, 'b'}, @@ -707,6 +772,9 @@ main(int argc, char **argv) {"end", required_argument, NULL, 'e'}, {"follow", no_argument, NULL, 'f'}, {"fork", required_argument, NULL, 'F'}, + {"file", required_argument, NULL, 'N'}, + {"ignore", no_argument, NULL, 'i'}, + {"offset", required_argument, NULL, 'o'}, {"fullpage", no_argument, NULL, 'w'}, {"help", no_argument, NULL, '?'}, {"limit", required_argument, NULL, 'n'}, @@ -755,6 +823,7 @@ main(int argc, char **argv) private.startptr = InvalidXLogRecPtr; private.endptr = InvalidXLogRecPtr; private.endptr_reached = false; + private.input_filename = NULL; config.quiet = false; config.bkp_details = false; @@ -772,6 +841,7 @@ main(int argc, char **argv) config.filter_by_fpw = false; config.stats = false; config.stats_per_record = false; + config.ignore_format_errors = false; stats.startptr = InvalidXLogRecPtr; stats.endptr = InvalidXLogRecPtr; 
@@ -782,7 +852,7 @@ main(int argc, char **argv) goto bad_argument; } - while ((option = getopt_long(argc, argv, "bB:e:fF:n:p:qr:R:s:t:wx:z", + while ((option = getopt_long(argc, argv, "bB:e:fF:in:N:o:p:qr:R:s:t:wx:z", long_options, &optindex)) != -1) { switch (option) @@ -821,6 +891,13 @@ main(int argc, char **argv) } config.filter_by_extended = true; break; + case 'N': + fname = pg_strdup(optarg); + single_file = true; + break; + case 'i': + config.ignore_format_errors = true; + break; case 'n': if (sscanf(optarg, "%d", &config.stop_after_records) != 1) { @@ -828,6 +905,13 @@ main(int argc, char **argv) goto bad_argument; } break; + case 'o': + if (sscanf(optarg, "%zu", &start_offset) != 1) + { + pg_log_error("could not parse offset \"%s\"", optarg); + goto bad_argument; + } + break; case 'p': waldir = pg_strdup(optarg); break; @@ -962,6 +1046,73 @@ main(int argc, char **argv) goto bad_argument; } + if (start_offset != OFFSET_INVALID) + { + if(!XLogRecPtrIsInvalid(private.startptr) || !XLogRecPtrIsInvalid(private.endptr)) + { + pg_log_error("either file offset or start/end pointers should be specified"); + goto bad_argument; + } + + if(!single_file) + { + pg_log_error("offset option could only be used with filename option"); + goto bad_argument; + } + + /* Log records are maxaligned, start at the closest next position */ + private.startptr = MAXALIGN(start_offset); + } + + if(single_file) + { + char *directory = NULL; + int fd; + struct stat stat; + + if(config.follow) + { + pg_log_error("Follow could not be used in file dump mode"); + goto bad_argument; + } + + if (waldir != NULL) + { + pg_log_error("either single file or wal directory should be specified"); + goto bad_argument; + } + + split_path(fname, &directory, &private.input_filename); + waldir = directory; + + if(waldir == NULL) + { + char *cwd = malloc(MAXPGPATH); + + if (!getcwd(cwd, MAXPGPATH)) + pg_fatal("could identify current directory: %m"); + + waldir = cwd; + } + + if (!verify_directory(waldir)) + 
pg_fatal("could not open directory \"%s\": %m", waldir); + + fd = open_file_in_directory(waldir, private.input_filename); + if (fd < 0) + pg_fatal("could not open file \"%s\"", private.input_filename); + + if(fstat(fd, &stat) != 0) + pg_fatal("could not stat file \"%s\"", private.input_filename); + + private.endptr = stat.st_size; + + /* Round up segment size to next power of 2 or 1MB */ + WalSegSz = Max(next_pow2_int(private.endptr), 1024 * 1024); + + close(fd); + } + if (waldir != NULL) { /* validate path points to directory */ @@ -980,6 +1131,12 @@ main(int argc, char **argv) int fd; XLogSegNo segno; + if(single_file) + { + pg_log_error("either single file or start/end boundaries should be specified"); + goto bad_argument; + } + split_path(argv[optind], &directory, &fname); if (waldir == NULL && directory != NULL) @@ -1052,10 +1209,11 @@ main(int argc, char **argv) } } else - waldir = identify_target_directory(waldir, NULL); + if (!single_file) + waldir = identify_target_directory(waldir, NULL); /* we don't know what to print */ - if (XLogRecPtrIsInvalid(private.startptr)) + if (XLogRecPtrIsInvalid(private.startptr) && !single_file) { pg_log_error("no start WAL location given"); goto bad_argument; @@ -1073,13 +1231,27 @@ main(int argc, char **argv) if (!xlogreader_state) pg_fatal("out of memory while allocating a WAL reading processor"); - /* first find a valid recptr to start from */ - first_record = XLogFindNextRecord(xlogreader_state, private.startptr); + if(single_file) + { + if(config.ignore_format_errors) + { + xlogreader_state->skip_page_validation = true; + xlogreader_state->skip_invalid_records = true; + } - if (first_record == InvalidXLogRecPtr) - pg_fatal("could not find a valid record after %X/%X", - LSN_FORMAT_ARGS(private.startptr)); + xlogreader_state->skip_lsn_checks = true; + first_record = private.startptr; + XLogBeginRead(xlogreader_state, first_record); + } + else + { + /* first find a valid recptr to start from */ + first_record = 
XLogFindNextRecord(xlogreader_state, private.startptr); + if (first_record == InvalidXLogRecPtr) + pg_fatal("could not find a valid record after %X/%X", + LSN_FORMAT_ARGS(private.startptr)); + } /* * Display a message that we're skipping data if `from` wasn't a pointer * to the start of a record and also wasn't a pointer to the beginning of diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h index 2d8a7f62706..3cdd4c8a75c 100644 --- a/src/include/access/heapam_xlog.h +++ b/src/include/access/heapam_xlog.h @@ -108,6 +108,7 @@ typedef struct xl_heap_delete { TransactionId xmax; /* xmax of the deleted tuple */ OffsetNumber offnum; /* deleted tuple's offset */ + uint32 t_cid; uint8 infobits_set; /* infomask bits */ uint8 flags; } xl_heap_delete; @@ -145,6 +146,7 @@ typedef struct xl_heap_header { uint16 t_infomask2; uint16 t_infomask; + uint32 t_cid; uint8 t_hoff; } xl_heap_header; @@ -186,6 +188,7 @@ typedef struct xl_multi_insert_tuple uint16 datalen; /* size of tuple data that follows */ uint16 t_infomask2; uint16 t_infomask; + uint32 t_cid; uint8 t_hoff; /* TUPLE DATA FOLLOWS AT END OF STRUCT */ } xl_multi_insert_tuple; @@ -215,9 +218,9 @@ typedef struct xl_heap_update OffsetNumber old_offnum; /* old tuple's offset */ uint8 old_infobits_set; /* infomask bits to set on old tuple */ uint8 flags; + uint32 t_cid; TransactionId new_xmax; /* xmax of the new tuple */ OffsetNumber new_offnum; /* new tuple's offset */ - /* * If XLH_UPDATE_CONTAINS_OLD_TUPLE or XLH_UPDATE_CONTAINS_OLD_KEY flags * are set, xl_heap_header and tuple data for the old tuple follow. 
@@ -279,6 +282,7 @@ typedef struct xl_heap_lock { TransactionId locking_xid; /* might be a MultiXactId not xid */ OffsetNumber offnum; /* locked tuple's offset on page */ + uint32 t_cid; int8 infobits_set; /* infomask and infomask2 bits to set */ uint8 flags; /* XLH_LOCK_* flag bits */ } xl_heap_lock; diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index cd674c3c23f..d28c20fe0a1 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -28,8 +28,13 @@ extern PGDLLIMPORT int sync_method; extern PGDLLIMPORT XLogRecPtr ProcLastRecPtr; extern PGDLLIMPORT XLogRecPtr XactLastRecEnd; + extern PGDLLIMPORT XLogRecPtr XactLastCommitEnd; +extern bool ZenithRecoveryRequested; +extern XLogRecPtr zenithLastRec; +extern bool zenithWriteOk; + /* these variables are GUC parameters related to XLOG */ extern PGDLLIMPORT int wal_segment_size; extern PGDLLIMPORT int min_wal_size_mb; @@ -97,6 +102,7 @@ extern PGDLLIMPORT int wal_level; #define XLogArchivingAlways() \ (AssertMacro(XLogArchiveMode == ARCHIVE_MODE_OFF || wal_level >= WAL_LEVEL_REPLICA), XLogArchiveMode == ARCHIVE_MODE_ALWAYS) + /* * Is WAL-logging necessary for archival or log-shipping, or can we skip * WAL-logging if we fsync() the data before committing instead? 
@@ -211,6 +217,7 @@ extern void XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn); extern void xlog_redo(XLogReaderState *record); extern void xlog_desc(StringInfo buf, XLogReaderState *record); extern const char *xlog_identify(uint8 info); +extern void xlog_outdesc(StringInfo buf, XLogReaderState *record); extern void issue_xlog_fsync(int fd, XLogSegNo segno, TimeLineID tli); @@ -243,6 +250,18 @@ extern XLogRecPtr GetFlushRecPtr(TimeLineID *insertTLI); extern TimeLineID GetWALInsertionTimeLine(void); extern XLogRecPtr GetLastImportantRecPtr(void); +/* neon specifics */ + +extern void SetLastWrittenPageLSN(XLogRecPtr lsn); +extern XLogRecPtr GetLastWrittenPageLSN(void); + +extern void SetRedoStartLsn(XLogRecPtr RedoStartLSN); +extern XLogRecPtr GetRedoStartLsn(void); + +extern void SetZenithCurrentClusterSize(uint64 size); +extern uint64 GetZenithCurrentClusterSize(void); + + extern void SetWalWriterSleeping(bool sleeping); extern void assign_max_wal_size(int newval, void *extra); @@ -297,6 +316,8 @@ extern SessionBackupState get_backup_status(void); #define TABLESPACE_MAP "tablespace_map" #define TABLESPACE_MAP_OLD "tablespace_map.old" +#define ZENITH_SIGNAL_FILE "zenith.signal" + /* files to signal promotion to primary */ #define PROMOTE_SIGNAL_FILE "promote" diff --git a/src/include/access/xlogdefs.h b/src/include/access/xlogdefs.h index a47e3eeb1f5..cc8b5493bb9 100644 --- a/src/include/access/xlogdefs.h +++ b/src/include/access/xlogdefs.h @@ -28,6 +28,14 @@ typedef uint64 XLogRecPtr; #define InvalidXLogRecPtr 0 #define XLogRecPtrIsInvalid(r) ((r) == InvalidXLogRecPtr) +/* + * Maximum possible XLogRecPtr value. Currently used by back pressure + * mechanism to distinguish the unknown replica flush/write position. + * This significantly simplifies comparison and checks as we always + * look for the minimal value. + */ +#define UnknownXLogRecPtr ((XLogRecPtr) ~0) + /* * First LSN to use for "fake" LSNs. 
* diff --git a/src/include/access/xloginsert.h b/src/include/access/xloginsert.h index 5fc340c434b..65c6247bd0f 100644 --- a/src/include/access/xloginsert.h +++ b/src/include/access/xloginsert.h @@ -38,6 +38,10 @@ #define REGBUF_KEEP_DATA 0x10 /* include data even if a full-page image * is taken */ +extern int max_replication_apply_lag; +extern int max_replication_flush_lag; +extern int max_replication_write_lag; + /* prototypes for public functions in xloginsert.c: */ extern void XLogBeginInsert(void); extern void XLogSetRecordFlags(uint8 flags); diff --git a/src/include/access/xlogreader.h b/src/include/access/xlogreader.h index 9e63162e429..fe3dea47eae 100644 --- a/src/include/access/xlogreader.h +++ b/src/include/access/xlogreader.h @@ -216,6 +216,10 @@ struct XLogReaderState /* Set when XLP_FIRST_IS_OVERWRITE_CONTRECORD is found */ XLogRecPtr overwrittenRecPtr; + /* Disable validation to allow dumpng corrupt WAL */ + bool skip_page_validation; + bool skip_invalid_records; + bool skip_lsn_checks; /* ---------------------------------------- * Decoded representation of current record diff --git a/src/include/access/xlogutils.h b/src/include/access/xlogutils.h index c9d0b75a01b..57cb9d84215 100644 --- a/src/include/access/xlogutils.h +++ b/src/include/access/xlogutils.h @@ -81,6 +81,8 @@ typedef struct ReadLocalXLogPageNoWaitPrivate bool end_of_wal; /* true, when end of WAL is reached */ } ReadLocalXLogPageNoWaitPrivate; +extern bool (*redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id); + extern XLogRedoAction XLogReadBufferForRedo(XLogReaderState *record, uint8 buffer_id, Buffer *buf); extern Buffer XLogInitBufferForRedo(XLogReaderState *record, uint8 block_id); diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 3233278b340..cf3698fb141 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -492,4 +492,7 @@ extern PGDLLIMPORT shmem_request_hook_type shmem_request_hook; /* in executor/nodeHash.c */ extern size_t 
get_hash_memory_limit(void); +/* in src/backend/tcop/zenith_wal_redo.c */ +extern bool am_wal_redo_postgres; + #endif /* MISCADMIN_H */ diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index d09e9f9a1c3..407280ccaf8 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -340,6 +340,9 @@ /* Define if you have a function readline library */ #undef HAVE_LIBREADLINE +/* Define to 1 if you have the `seccomp' library (-lseccomp). */ +#undef HAVE_LIBSECCOMP + /* Define to 1 if you have the `selinux' library (-lselinux). */ #undef HAVE_LIBSELINUX diff --git a/src/include/postmaster/seccomp.h b/src/include/postmaster/seccomp.h new file mode 100644 index 00000000000..1613d34bd47 --- /dev/null +++ b/src/include/postmaster/seccomp.h @@ -0,0 +1,26 @@ +#ifndef PG_SECCOMP_H +#define PG_SECCOMP_H + +#include "postgres.h" + +#ifdef HAVE_LIBSECCOMP +#include +#endif + +typedef struct { + int psr_syscall; /* syscall number */ + uint32 psr_action; /* libseccomp action, e.g. 
SCMP_ACT_ALLOW */ +} PgSeccompRule; + +#define PG_SCMP(syscall, action) \ + (PgSeccompRule) { \ + .psr_syscall = SCMP_SYS(syscall), \ + .psr_action = (action), \ + } + +#define PG_SCMP_ALLOW(syscall) \ + PG_SCMP(syscall, SCMP_ACT_ALLOW) + +void seccomp_load_rules(PgSeccompRule *syscalls, int count); + +#endif /* PG_SECCOMP_H */ diff --git a/src/include/replication/walproposer.h b/src/include/replication/walproposer.h new file mode 100644 index 00000000000..c5a5b76268e --- /dev/null +++ b/src/include/replication/walproposer.h @@ -0,0 +1,565 @@ +#ifndef __WALPROPOSER_H__ +#define __WALPROPOSER_H__ + +#include "access/xlogdefs.h" +#include "postgres.h" +#include "port.h" +#include "access/xlog_internal.h" +#include "access/transam.h" +#include "nodes/replnodes.h" +#include "utils/uuid.h" +#include "replication/walreceiver.h" + +#define SK_MAGIC 0xCafeCeefu +#define SK_PROTOCOL_VERSION 2 + +#define MAX_SAFEKEEPERS 32 +#define MAX_SEND_SIZE (XLOG_BLCKSZ * 16) /* max size of a single WAL message */ +#define XLOG_HDR_SIZE (1+8*3) /* 'w' + startPos + walEnd + timestamp */ +#define XLOG_HDR_START_POS 1 /* offset of start position in wal sender message header */ +#define XLOG_HDR_END_POS (1+8) /* offset of end position in wal sender message header */ + +/* + * In the spirit of WL_SOCKET_READABLE and others, this corresponds to no events having occured, + * because all WL_* events are given flags equal to some (1 << i), starting from i = 0 + */ +#define WL_NO_EVENTS 0 + +extern char* wal_acceptors_list; +extern int wal_acceptor_reconnect_timeout; +extern int wal_acceptor_connect_timeout; +extern bool am_wal_proposer; + +struct WalProposerConn; /* Defined in libpqwalproposer */ +typedef struct WalProposerConn WalProposerConn; + +struct WalMessage; +typedef struct WalMessage WalMessage; + +extern char *zenith_timeline_walproposer; +extern char *zenith_tenant_walproposer; +extern char *zenith_pageserver_connstring_walproposer; + +/* Possible return values from ReadPGAsync */ 
+typedef enum +{ + /* The full read was successful. buf now points to the data */ + PG_ASYNC_READ_SUCCESS, + /* The read is ongoing. Wait until the connection is read-ready, then try + * again. */ + PG_ASYNC_READ_TRY_AGAIN, + /* Reading failed. Check PQerrorMessage(conn) */ + PG_ASYNC_READ_FAIL, +} PGAsyncReadResult; + +/* Possible return values from WritePGAsync */ +typedef enum +{ + /* The write fully completed */ + PG_ASYNC_WRITE_SUCCESS, + /* The write started, but you'll need to call PQflush some more times + * to finish it off. We just tried, so it's best to wait until the + * connection is read- or write-ready to try again. + * + * If it becomes read-ready, call PQconsumeInput and flush again. If it + * becomes write-ready, just call PQflush. + */ + PG_ASYNC_WRITE_TRY_FLUSH, + /* Writing failed. Check PQerrorMessage(conn) */ + PG_ASYNC_WRITE_FAIL, +} PGAsyncWriteResult; + +/* + * WAL safekeeper state, which is used to wait for some event. + * + * States are listed here in the order that they're executed. + * + * Most states, upon failure, will move back to SS_OFFLINE by calls to + * ResetConnection or ShutdownConnection. + */ +typedef enum +{ + /* + * Does not have an active connection and will stay that way until + * further notice. + * + * Moves to SS_CONNECTING_WRITE by calls to ResetConnection. + */ + SS_OFFLINE, + + /* + * Connecting states. "_READ" waits for the socket to be available for + * reading, "_WRITE" waits for writing. There's no difference in the code + * they execute when polled, but we have this distinction in order to + * recreate the event set in HackyRemoveWalProposerEvent. + * + * After the connection is made, "START_WAL_PUSH" query is sent. + */ + SS_CONNECTING_WRITE, + SS_CONNECTING_READ, + + /* + * Waiting for the result of the "START_WAL_PUSH" command. + * + * After we get a successful result, sends handshake to safekeeper. + */ + SS_WAIT_EXEC_RESULT, + + /* + * Executing the receiving half of the handshake. 
After receiving, moves to + * SS_VOTING. + */ + SS_HANDSHAKE_RECV, + + /* + * Waiting to participate in voting, but a quorum hasn't yet been reached. + * This is an idle state - we do not expect AdvancePollState to be called. + * + * Moved externally by execution of SS_HANDSHAKE_RECV, when we received a + * quorum of handshakes. + */ + SS_VOTING, + + /* + * Already sent voting information, waiting to receive confirmation from the + * node. After receiving, moves to SS_IDLE, if the quorum isn't reached yet. + */ + SS_WAIT_VERDICT, + + /* Need to flush ProposerElected message. */ + SS_SEND_ELECTED_FLUSH, + + /* + * Waiting for quorum to send WAL. Idle state. If the socket becomes + * read-ready, the connection has been closed. + * + * Moves to SS_ACTIVE only by call to StartStreaming. + */ + SS_IDLE, + + /* + * Active phase, when we acquired quorum and have WAL to send or feedback + * to read. + */ + SS_ACTIVE, +} SafekeeperState; + +/* Consensus logical timestamp. */ +typedef uint64 term_t; + +/* neon storage node id */ +typedef uint64 NNodeId; + +/* + * Proposer <-> Acceptor messaging. + */ + +/* Initial Proposer -> Acceptor message */ +typedef struct ProposerGreeting +{ + uint64 tag; /* message tag */ + uint32 protocolVersion; /* proposer-safekeeper protocol version */ + uint32 pgVersion; + pg_uuid_t proposerId; + uint64 systemId; /* Postgres system identifier */ + uint8 ztimelineid[16]; /* Zenith timeline id */ + uint8 ztenantid[16]; + TimeLineID timeline; + uint32 walSegSize; +} ProposerGreeting; + +typedef struct AcceptorProposerMessage +{ + uint64 tag; +} AcceptorProposerMessage; + +/* + * Acceptor -> Proposer initial response: the highest term acceptor voted for. + */ +typedef struct AcceptorGreeting +{ + AcceptorProposerMessage apm; + term_t term; + NNodeId nodeId; +} AcceptorGreeting; + +/* + * Proposer -> Acceptor vote request. 
+ */ +typedef struct VoteRequest +{ + uint64 tag; + term_t term; + pg_uuid_t proposerId; /* for monitoring/debugging */ +} VoteRequest; + +/* Element of term switching chain. */ +typedef struct TermSwitchEntry +{ + term_t term; + XLogRecPtr lsn; +} TermSwitchEntry; + +typedef struct TermHistory +{ + uint32 n_entries; + TermSwitchEntry *entries; +} TermHistory; + +/* Vote itself, sent from safekeeper to proposer */ +typedef struct VoteResponse { + AcceptorProposerMessage apm; + term_t term; + uint64 voteGiven; + /* + * Safekeeper flush_lsn (end of WAL) + history of term switches allow + * proposer to choose the most advanced one. + */ + XLogRecPtr flushLsn; + XLogRecPtr truncateLsn; /* minimal LSN which may be needed for recovery of some safekeeper */ + TermHistory termHistory; + XLogRecPtr timelineStartLsn; /* timeline globally starts at this LSN */ +} VoteResponse; + +/* + * Proposer -> Acceptor message announcing proposer is elected and communicating + * epoch history to it. + */ +typedef struct ProposerElected +{ + uint64 tag; + term_t term; + /* proposer will send since this point */ + XLogRecPtr startStreamingAt; + /* history of term switches up to this proposer */ + TermHistory *termHistory; + /* timeline globally starts at this LSN */ + XLogRecPtr timelineStartLsn; +} ProposerElected; + +/* + * Header of request with WAL message sent from proposer to safekeeper. + */ +typedef struct AppendRequestHeader +{ + uint64 tag; + term_t term; /* term of the proposer */ + /* + * LSN since which current proposer appends WAL (begin_lsn of its first + * record); determines epoch switch point. 
+ */ + XLogRecPtr epochStartLsn; + XLogRecPtr beginLsn; /* start position of message in WAL */ + XLogRecPtr endLsn; /* end position of message in WAL */ + XLogRecPtr commitLsn; /* LSN committed by quorum of safekeepers */ + /* + * minimal LSN which may be needed for recovery of some safekeeper (end lsn + * + 1 of last chunk streamed to everyone) + */ + XLogRecPtr truncateLsn; + pg_uuid_t proposerId; /* for monitoring/debugging */ +} AppendRequestHeader; + +/* + * Hot standby feedback received from replica + */ +typedef struct HotStandbyFeedback +{ + TimestampTz ts; + FullTransactionId xmin; + FullTransactionId catalog_xmin; +} HotStandbyFeedback; + + +typedef struct ReplicationFeedback +{ + // current size of the timeline on pageserver + uint64 currentClusterSize; + // standby_status_update fields that safekeeper received from pageserver + XLogRecPtr ps_writelsn; + XLogRecPtr ps_flushlsn; + XLogRecPtr ps_applylsn; + TimestampTz ps_replytime; +} ReplicationFeedback; + + +typedef struct WalproposerShmemState +{ + slock_t mutex; + ReplicationFeedback feedback; + term_t mineLastElectedTerm; +} WalproposerShmemState; + +/* + * Report safekeeper state to proposer + */ +typedef struct AppendResponse +{ + AcceptorProposerMessage apm; + /* + * Current term of the safekeeper; if it is higher than proposer's, the + * compute is out of date. + */ + term_t term; + // TODO: add comment + XLogRecPtr flushLsn; + // Safekeeper reports back his awareness about which WAL is committed, as + // this is a criterion for walproposer --sync mode exit + XLogRecPtr commitLsn; + HotStandbyFeedback hs; + // Feedback recieved from pageserver includes standby_status_update fields + // and custom zenith feedback. + // This part of the message is extensible. 
+ ReplicationFeedback rf; +} AppendResponse; + +// ReplicationFeedback is extensible part of the message that is parsed separately +// Other fields are fixed part +#define APPENDRESPONSE_FIXEDPART_SIZE offsetof(AppendResponse, rf) + + +/* + * Descriptor of safekeeper + */ +typedef struct Safekeeper +{ + char const* host; + char const* port; + char conninfo[MAXCONNINFO]; /* connection info for connecting/reconnecting */ + + /* + * postgres protocol connection to the WAL acceptor + * + * Equals NULL only when state = SS_OFFLINE. Nonblocking is set once we + * reach SS_ACTIVE; not before. + */ + WalProposerConn* conn; + /* + * Temporary buffer for the message being sent to the safekeeper. + */ + StringInfoData outbuf; + /* + * WAL reader, allocated for each safekeeper. + */ + XLogReaderState* xlogreader; + + /* + * Streaming will start here; must be record boundary. + */ + XLogRecPtr startStreamingAt; + + bool flushWrite; /* set to true if we need to call AsyncFlush, to flush pending messages */ + XLogRecPtr streamingAt; /* current streaming position */ + AppendRequestHeader appendRequest; /* request for sending to safekeeper */ + + int eventPos; /* position in wait event set. 
Equal to -1 if no event */ + SafekeeperState state; /* safekeeper state machine state */ + TimestampTz startedConnAt; /* when connection attempt started */ + AcceptorGreeting greetResponse; /* acceptor greeting */ + VoteResponse voteResponse; /* the vote */ + AppendResponse appendResponse; /* feedback for master */ +} Safekeeper; + + +int CompareLsn(const void *a, const void *b); +char* FormatSafekeeperState(SafekeeperState state); +void AssertEventsOkForState(uint32 events, Safekeeper* sk); +uint32 SafekeeperStateDesiredEvents(SafekeeperState state); +char* FormatEvents(uint32 events); +void WalProposerMain(Datum main_arg); +void WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos); +bool HexDecodeString(uint8 *result, char *input, int nbytes); +uint32 pq_getmsgint32_le(StringInfo msg); +uint64 pq_getmsgint64_le(StringInfo msg); +void pq_sendint32_le(StringInfo buf, uint32 i); +void pq_sendint64_le(StringInfo buf, uint64 i); +void WalProposerPoll(void); +void WalProposerRegister(void); +void XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr); +void XLogWalPropClose(XLogRecPtr recptr); +void ProcessStandbyReply(XLogRecPtr writePtr, + XLogRecPtr flushPtr, + XLogRecPtr applyPtr, + TimestampTz replyTime, + bool replyRequested); +void PhysicalConfirmReceivedLocation(XLogRecPtr lsn); +void ProcessStandbyHSFeedback(TimestampTz replyTime, + TransactionId feedbackXmin, + uint32 feedbackEpoch, + TransactionId feedbackCatalogXmin, + uint32 feedbackCatalogEpoch); +void ParseReplicationFeedbackMessage(StringInfo reply_message, + ReplicationFeedback *rf); +void StartReplication(StartReplicationCmd *cmd); +void WalProposerSync(int argc, char *argv[]); + +Size WalproposerShmemSize(void); +bool WalproposerShmemInit(void); +void replication_feedback_set(ReplicationFeedback *rf); +void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn); + +/* libpqwalproposer hooks & helper type */ + +/* Re-exported 
PostgresPollingStatusType */ +typedef enum +{ + WP_CONN_POLLING_FAILED = 0, + WP_CONN_POLLING_READING, + WP_CONN_POLLING_WRITING, + WP_CONN_POLLING_OK, + /* + * 'libpq-fe.h' still has PGRES_POLLING_ACTIVE, but says it's unused. + * We've removed it here to avoid clutter. + */ +} WalProposerConnectPollStatusType; + +/* Re-exported and modified ExecStatusType */ +typedef enum +{ + /* We received a single CopyBoth result */ + WP_EXEC_SUCCESS_COPYBOTH, + /* Any success result other than a single CopyBoth was received. The specifics of the result + * were already logged, but it may be useful to provide an error message indicating which + * safekeeper messed up. + * + * Do not expect PQerrorMessage to be appropriately set. */ + WP_EXEC_UNEXPECTED_SUCCESS, + /* No result available at this time. Wait until read-ready, then call again. Internally, this is + * returned when PQisBusy indicates that PQgetResult would block. */ + WP_EXEC_NEEDS_INPUT, + /* Catch-all failure. Check PQerrorMessage. */ + WP_EXEC_FAILED, +} WalProposerExecStatusType; + +/* Re-exported ConnStatusType */ +typedef enum +{ + WP_CONNECTION_OK, + WP_CONNECTION_BAD, + + /* + * The original ConnStatusType has many more tags, but requests that + * they not be relied upon (except for displaying to the user). We + * don't need that extra functionality, so we collect them into a + * single tag here. 
+ */ + WP_CONNECTION_IN_PROGRESS, +} WalProposerConnStatusType; + +/* Re-exported PQerrorMessage */ +typedef char* (*walprop_error_message_fn) (WalProposerConn* conn); + +/* Re-exported PQstatus */ +typedef WalProposerConnStatusType (*walprop_status_fn) (WalProposerConn* conn); + +/* Re-exported PQconnectStart */ +typedef WalProposerConn* (*walprop_connect_start_fn) (char* conninfo); + +/* Re-exported PQconectPoll */ +typedef WalProposerConnectPollStatusType (*walprop_connect_poll_fn) (WalProposerConn* conn); + +/* Blocking wrapper around PQsendQuery */ +typedef bool (*walprop_send_query_fn) (WalProposerConn* conn, char* query); + +/* Wrapper around PQconsumeInput + PQisBusy + PQgetResult */ +typedef WalProposerExecStatusType (*walprop_get_query_result_fn) (WalProposerConn* conn); + +/* Re-exported PQsocket */ +typedef pgsocket (*walprop_socket_fn) (WalProposerConn* conn); + +/* Wrapper around PQconsumeInput (if socket's read-ready) + PQflush */ +typedef int (*walprop_flush_fn) (WalProposerConn* conn); + +/* Re-exported PQfinish */ +typedef void (*walprop_finish_fn) (WalProposerConn* conn); + +/* + * Ergonomic wrapper around PGgetCopyData + * + * Reads a CopyData block from a safekeeper, setting *amount to the number + * of bytes returned. + * + * This function is allowed to assume certain properties specific to the + * protocol with the safekeepers, so it should not be used as-is for any + * other purpose. + * + * Note: If possible, using is generally preferred, because it + * performs a bit of extra checking work that's always required and is normally + * somewhat verbose. + */ +typedef PGAsyncReadResult (*walprop_async_read_fn) (WalProposerConn* conn, + char** buf, + int* amount); + +/* + * Ergonomic wrapper around PQputCopyData + PQflush + * + * Starts to write a CopyData block to a safekeeper. + * + * For information on the meaning of return codes, refer to PGAsyncWriteResult. 
+ */ +typedef PGAsyncWriteResult (*walprop_async_write_fn) (WalProposerConn* conn, + void const* buf, + size_t size); + +/* + * Blocking equivalent to walprop_async_write_fn + * + * Returns 'true' if successful, 'false' on failure. + */ +typedef bool (*walprop_blocking_write_fn) (WalProposerConn* conn, void const* buf, size_t size); + +/* All libpqwalproposer exported functions collected together. */ +typedef struct WalProposerFunctionsType +{ + walprop_error_message_fn walprop_error_message; + walprop_status_fn walprop_status; + walprop_connect_start_fn walprop_connect_start; + walprop_connect_poll_fn walprop_connect_poll; + walprop_send_query_fn walprop_send_query; + walprop_get_query_result_fn walprop_get_query_result; + walprop_socket_fn walprop_socket; + walprop_flush_fn walprop_flush; + walprop_finish_fn walprop_finish; + walprop_async_read_fn walprop_async_read; + walprop_async_write_fn walprop_async_write; + walprop_blocking_write_fn walprop_blocking_write; +} WalProposerFunctionsType; + +/* Allow the above functions to be "called" with normal syntax */ +#define walprop_error_message(conn) \ + WalProposerFunctions->walprop_error_message(conn) +#define walprop_status(conn) \ + WalProposerFunctions->walprop_status(conn) +#define walprop_connect_start(conninfo) \ + WalProposerFunctions->walprop_connect_start(conninfo) +#define walprop_connect_poll(conn) \ + WalProposerFunctions->walprop_connect_poll(conn) +#define walprop_send_query(conn, query) \ + WalProposerFunctions->walprop_send_query(conn, query) +#define walprop_get_query_result(conn) \ + WalProposerFunctions->walprop_get_query_result(conn) +#define walprop_set_nonblocking(conn, arg) \ + WalProposerFunctions->walprop_set_nonblocking(conn, arg) +#define walprop_socket(conn) \ + WalProposerFunctions->walprop_socket(conn) +#define walprop_flush(conn) \ + WalProposerFunctions->walprop_flush(conn) +#define walprop_finish(conn) \ + WalProposerFunctions->walprop_finish(conn) +#define walprop_async_read(conn, 
buf, amount) \ + WalProposerFunctions->walprop_async_read(conn, buf, amount) +#define walprop_async_write(conn, buf, size) \ + WalProposerFunctions->walprop_async_write(conn, buf, size) +#define walprop_blocking_write(conn, buf, size) \ + WalProposerFunctions->walprop_blocking_write(conn, buf, size) + +/* + * The runtime location of the libpqwalproposer functions. + * + * This pointer is set by the initializer in libpqwalproposer, so that we + * can use it later. + */ +extern PGDLLIMPORT WalProposerFunctionsType *WalProposerFunctions; + +#endif diff --git a/src/include/replication/walsender.h b/src/include/replication/walsender.h index d99a21b0771..599687ec428 100644 --- a/src/include/replication/walsender.h +++ b/src/include/replication/walsender.h @@ -12,6 +12,7 @@ #ifndef _WALSENDER_H #define _WALSENDER_H +#include "access/xlog.h" #include /* @@ -47,7 +48,8 @@ extern void WalSndInitStopping(void); extern void WalSndWaitStopping(void); extern void HandleWalSndInitStopping(void); extern void WalSndRqstFileReload(void); - +extern void GetMinReplicaLsn(XLogRecPtr* write, XLogRecPtr* flush, XLogRecPtr* apply); +extern uint64 backpressure_lag(void); /* * Remember that we want to wakeup walsenders later * diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index 58391406f65..d6a1c6f26fd 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -76,6 +76,8 @@ extern PGDLLIMPORT int checkpoint_flush_after; extern PGDLLIMPORT int backend_flush_after; extern PGDLLIMPORT int bgwriter_flush_after; +extern bool zenith_test_evict; + /* in buf_init.c */ extern PGDLLIMPORT char *BufferBlocks; @@ -228,6 +230,8 @@ extern void BufferGetTag(Buffer buffer, RelFileNode *rnode, extern void MarkBufferDirtyHint(Buffer buffer, bool buffer_std); +extern void MarkBufferPermanent(Buffer buffer); + extern void UnlockBuffers(void); extern void LockBuffer(Buffer buffer, int mode); extern bool ConditionalLockBuffer(Buffer buffer); diff --git 
a/src/include/storage/smgr.h b/src/include/storage/smgr.h index 6b63c60fbd9..6e9fdd72367 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -18,6 +18,8 @@ #include "storage/block.h" #include "storage/relfilenode.h" +struct f_smgr; + /* * smgr.c maintains a table of SMgrRelation objects, which are essentially * cached file handles. An SMgrRelation is created (if not already present) @@ -41,6 +43,9 @@ typedef struct SMgrRelationData /* rnode is the hashtable lookup key, so it must be first! */ RelFileNodeBackend smgr_rnode; /* relation physical identifier */ + /* copy of pg_class.relpersistence, or 0 if not known */ + char smgr_relpersistence; + /* pointer to owning pointer, or NULL if none */ struct SMgrRelationData **smgr_owner; @@ -59,7 +64,7 @@ typedef struct SMgrRelationData * Fields below here are intended to be private to smgr.c and its * submodules. Do not touch them from elsewhere. */ - int smgr_which; /* storage manager selector */ + const struct f_smgr *smgr; /* * for md.c; per-fork arrays of the number of open segments @@ -77,8 +82,67 @@ typedef SMgrRelationData *SMgrRelation; #define SmgrIsTemp(smgr) \ RelFileNodeBackendIsTemp((smgr)->smgr_rnode) + +/* + * This struct of function pointers defines the API between smgr.c and + * any individual storage manager module. Note that smgr subfunctions are + * generally expected to report problems via elog(ERROR). An exception is + * that smgr_unlink should use elog(WARNING), rather than erroring out, + * because we normally unlink relations during post-commit/abort cleanup, + * and so it's too late to raise an error. Also, various conditions that + * would normally be errors should be allowed during bootstrap and/or WAL + * recovery --- see comments in md.c for details. 
+ */ +typedef struct f_smgr +{ + void (*smgr_init) (void); /* may be NULL */ + void (*smgr_shutdown) (void); /* may be NULL */ + void (*smgr_open) (SMgrRelation reln); + void (*smgr_close) (SMgrRelation reln, ForkNumber forknum); + void (*smgr_create) (SMgrRelation reln, ForkNumber forknum, + bool isRedo); + bool (*smgr_exists) (SMgrRelation reln, ForkNumber forknum); + void (*smgr_unlink) (RelFileNodeBackend rnode, ForkNumber forknum, + bool isRedo); + void (*smgr_extend) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); + bool (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum); + void (*smgr_read) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer); + void (*smgr_write) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); + void (*smgr_writeback) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks); + BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum); + void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum, + BlockNumber nblocks); + void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum); + + void (*smgr_start_unlogged_build) (SMgrRelation reln); + void (*smgr_finish_unlogged_build_phase_1) (SMgrRelation reln); + void (*smgr_end_unlogged_build) (SMgrRelation reln); +} f_smgr; + +typedef void (*smgr_init_hook_type) (void); +typedef void (*smgr_shutdown_hook_type) (void); +extern PGDLLIMPORT smgr_init_hook_type smgr_init_hook; +extern PGDLLIMPORT smgr_shutdown_hook_type smgr_shutdown_hook; +extern void smgr_init_standard(void); +extern void smgr_shutdown_standard(void); + +// Alternative implementation of calculate_database_size() +typedef int64 (*dbsize_hook_type) (Oid dbOid); +extern PGDLLIMPORT dbsize_hook_type dbsize_hook; + +typedef const f_smgr *(*smgr_hook_type) (BackendId backend, RelFileNode rnode); +extern PGDLLIMPORT smgr_hook_type smgr_hook; +extern 
const f_smgr *smgr_standard(BackendId backend, RelFileNode rnode); + +extern const f_smgr *smgr(BackendId backend, RelFileNode rnode); + extern void smgrinit(void); -extern SMgrRelation smgropen(RelFileNode rnode, BackendId backend); +extern SMgrRelation smgropen(RelFileNode rnode, BackendId backend, char relpersistence); extern bool smgrexists(SMgrRelation reln, ForkNumber forknum); extern void smgrsetowner(SMgrRelation *owner, SMgrRelation reln); extern void smgrclearowner(SMgrRelation *owner, SMgrRelation reln); @@ -108,4 +172,8 @@ extern void smgrimmedsync(SMgrRelation reln, ForkNumber forknum); extern void AtEOXact_SMgr(void); extern bool ProcessBarrierSmgrRelease(void); +extern void smgr_start_unlogged_build(SMgrRelation reln); +extern void smgr_finish_unlogged_build_phase_1(SMgrRelation reln); +extern void smgr_end_unlogged_build(SMgrRelation reln); + #endif /* SMGR_H */ diff --git a/src/include/tcop/tcopprot.h b/src/include/tcop/tcopprot.h index 70d9dab25b8..d29674cd93d 100644 --- a/src/include/tcop/tcopprot.h +++ b/src/include/tcop/tcopprot.h @@ -94,4 +94,8 @@ extern bool set_plan_disabling_options(const char *arg, GucContext context, GucSource source); extern const char *get_stats_option_name(const char *arg); +extern void WalRedoMain(int argc, char *argv[], + const char *dbname, + const char *username); + #endif /* TCOPPROT_H */ diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h index a1bc0717567..4f333a21fa1 100644 --- a/src/include/utils/rel.h +++ b/src/include/utils/rel.h @@ -556,7 +556,7 @@ static inline SMgrRelation RelationGetSmgr(Relation rel) { if (unlikely(rel->rd_smgr == NULL)) - smgrsetowner(&(rel->rd_smgr), smgropen(rel->rd_node, rel->rd_backend)); + smgrsetowner(&(rel->rd_smgr), smgropen(rel->rd_node, rel->rd_backend, rel->rd_rel->relpersistence)); return rel->rd_smgr; } #endif /* !FRONTEND */ diff --git a/src/include/utils/wait_event.h b/src/include/utils/wait_event.h index b578e2ec757..5569dee5345 100644 --- 
a/src/include/utils/wait_event.h +++ b/src/include/utils/wait_event.h @@ -146,7 +146,8 @@ typedef enum WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL, WAIT_EVENT_REGISTER_SYNC_REQUEST, WAIT_EVENT_VACUUM_DELAY, - WAIT_EVENT_VACUUM_TRUNCATE + WAIT_EVENT_VACUUM_TRUNCATE, + WAIT_EVENT_BACK_PRESSURE } WaitEventTimeout; /* ---------- diff --git a/src/test/regress/expected/alter_table_1.out b/src/test/regress/expected/alter_table_1.out new file mode 100644 index 00000000000..0f116d8750d --- /dev/null +++ b/src/test/regress/expected/alter_table_1.out @@ -0,0 +1,4487 @@ +-- +-- ALTER_TABLE +-- +-- Clean up in case a prior regression run failed +SET client_min_messages TO 'warning'; +DROP ROLE IF EXISTS regress_alter_table_user1; +RESET client_min_messages; +CREATE USER regress_alter_table_user1; +-- +-- add attribute +-- +CREATE TABLE attmp (initial int4); +COMMENT ON TABLE attmp_wrong IS 'table comment'; +ERROR: relation "attmp_wrong" does not exist +COMMENT ON TABLE attmp IS 'table comment'; +COMMENT ON TABLE attmp IS NULL; +ALTER TABLE attmp ADD COLUMN xmin integer; -- fails +ERROR: column name "xmin" conflicts with a system column name +ALTER TABLE attmp ADD COLUMN a int4 default 3; +ALTER TABLE attmp ADD COLUMN b name; +ALTER TABLE attmp ADD COLUMN c text; +ALTER TABLE attmp ADD COLUMN d float8; +ALTER TABLE attmp ADD COLUMN e float4; +ALTER TABLE attmp ADD COLUMN f int2; +ALTER TABLE attmp ADD COLUMN g polygon; +ALTER TABLE attmp ADD COLUMN i char; +ALTER TABLE attmp ADD COLUMN k int4; +ALTER TABLE attmp ADD COLUMN l tid; +ALTER TABLE attmp ADD COLUMN m xid; +ALTER TABLE attmp ADD COLUMN n oidvector; +--ALTER TABLE attmp ADD COLUMN o lock; +ALTER TABLE attmp ADD COLUMN p boolean; +ALTER TABLE attmp ADD COLUMN q point; +ALTER TABLE attmp ADD COLUMN r lseg; +ALTER TABLE attmp ADD COLUMN s path; +ALTER TABLE attmp ADD COLUMN t box; +ALTER TABLE attmp ADD COLUMN v timestamp; +ALTER TABLE attmp ADD COLUMN w interval; +ALTER TABLE attmp ADD COLUMN x float8[]; +ALTER TABLE 
attmp ADD COLUMN y float4[]; +ALTER TABLE attmp ADD COLUMN z int2[]; +INSERT INTO attmp (a, b, c, d, e, f, g, i, k, l, m, n, p, q, r, s, t, + v, w, x, y, z) + VALUES (4, 'name', 'text', 4.1, 4.1, 2, '(4.1,4.1,3.1,3.1)', + 'c', + 314159, '(1,1)', '512', + '1 2 3 4 5 6 7 8', true, '(1.1,1.1)', '(4.1,4.1,3.1,3.1)', + '(0,2,4.1,4.1,3.1,3.1)', '(4.1,4.1,3.1,3.1)', + 'epoch', '01:00:10', '{1.0,2.0,3.0,4.0}', '{1.0,2.0,3.0,4.0}', '{1,2,3,4}'); +SELECT * FROM attmp; + initial | a | b | c | d | e | f | g | i | k | l | m | n | p | q | r | s | t | v | w | x | y | z +---------+---+------+------+-----+-----+---+-----------------------+---+--------+-------+-----+-----------------+---+-----------+-----------------------+-----------------------------+---------------------+--------------------------+------------------+-----------+-----------+----------- + | 4 | name | text | 4.1 | 4.1 | 2 | ((4.1,4.1),(3.1,3.1)) | c | 314159 | (1,1) | 512 | 1 2 3 4 5 6 7 8 | t | (1.1,1.1) | [(4.1,4.1),(3.1,3.1)] | ((0,2),(4.1,4.1),(3.1,3.1)) | (4.1,4.1),(3.1,3.1) | Thu Jan 01 00:00:00 1970 | @ 1 hour 10 secs | {1,2,3,4} | {1,2,3,4} | {1,2,3,4} +(1 row) + +DROP TABLE attmp; +-- the wolf bug - schema mods caused inconsistent row descriptors +CREATE TABLE attmp ( + initial int4 +); +ALTER TABLE attmp ADD COLUMN a int4; +ALTER TABLE attmp ADD COLUMN b name; +ALTER TABLE attmp ADD COLUMN c text; +ALTER TABLE attmp ADD COLUMN d float8; +ALTER TABLE attmp ADD COLUMN e float4; +ALTER TABLE attmp ADD COLUMN f int2; +ALTER TABLE attmp ADD COLUMN g polygon; +ALTER TABLE attmp ADD COLUMN i char; +ALTER TABLE attmp ADD COLUMN k int4; +ALTER TABLE attmp ADD COLUMN l tid; +ALTER TABLE attmp ADD COLUMN m xid; +ALTER TABLE attmp ADD COLUMN n oidvector; +--ALTER TABLE attmp ADD COLUMN o lock; +ALTER TABLE attmp ADD COLUMN p boolean; +ALTER TABLE attmp ADD COLUMN q point; +ALTER TABLE attmp ADD COLUMN r lseg; +ALTER TABLE attmp ADD COLUMN s path; +ALTER TABLE attmp ADD COLUMN t box; +ALTER TABLE attmp ADD COLUMN v 
timestamp; +ALTER TABLE attmp ADD COLUMN w interval; +ALTER TABLE attmp ADD COLUMN x float8[]; +ALTER TABLE attmp ADD COLUMN y float4[]; +ALTER TABLE attmp ADD COLUMN z int2[]; +INSERT INTO attmp (a, b, c, d, e, f, g, i, k, l, m, n, p, q, r, s, t, + v, w, x, y, z) + VALUES (4, 'name', 'text', 4.1, 4.1, 2, '(4.1,4.1,3.1,3.1)', + 'c', + 314159, '(1,1)', '512', + '1 2 3 4 5 6 7 8', true, '(1.1,1.1)', '(4.1,4.1,3.1,3.1)', + '(0,2,4.1,4.1,3.1,3.1)', '(4.1,4.1,3.1,3.1)', + 'epoch', '01:00:10', '{1.0,2.0,3.0,4.0}', '{1.0,2.0,3.0,4.0}', '{1,2,3,4}'); +SELECT * FROM attmp; + initial | a | b | c | d | e | f | g | i | k | l | m | n | p | q | r | s | t | v | w | x | y | z +---------+---+------+------+-----+-----+---+-----------------------+---+--------+-------+-----+-----------------+---+-----------+-----------------------+-----------------------------+---------------------+--------------------------+------------------+-----------+-----------+----------- + | 4 | name | text | 4.1 | 4.1 | 2 | ((4.1,4.1),(3.1,3.1)) | c | 314159 | (1,1) | 512 | 1 2 3 4 5 6 7 8 | t | (1.1,1.1) | [(4.1,4.1),(3.1,3.1)] | ((0,2),(4.1,4.1),(3.1,3.1)) | (4.1,4.1),(3.1,3.1) | Thu Jan 01 00:00:00 1970 | @ 1 hour 10 secs | {1,2,3,4} | {1,2,3,4} | {1,2,3,4} +(1 row) + +CREATE INDEX attmp_idx ON attmp (a, (d + e), b); +ALTER INDEX attmp_idx ALTER COLUMN 0 SET STATISTICS 1000; +ERROR: column number must be in range from 1 to 32767 +LINE 1: ALTER INDEX attmp_idx ALTER COLUMN 0 SET STATISTICS 1000; + ^ +ALTER INDEX attmp_idx ALTER COLUMN 1 SET STATISTICS 1000; +ERROR: cannot alter statistics on non-expression column "a" of index "attmp_idx" +HINT: Alter statistics on table column instead. +ALTER INDEX attmp_idx ALTER COLUMN 2 SET STATISTICS 1000; +\d+ attmp_idx + Index "public.attmp_idx" + Column | Type | Key? 
| Definition | Storage | Stats target +--------+------------------+------+------------+---------+-------------- + a | integer | yes | a | plain | + expr | double precision | yes | (d + e) | plain | 1000 + b | cstring | yes | b | plain | +btree, for table "public.attmp" + +ALTER INDEX attmp_idx ALTER COLUMN 3 SET STATISTICS 1000; +ERROR: cannot alter statistics on non-expression column "b" of index "attmp_idx" +HINT: Alter statistics on table column instead. +ALTER INDEX attmp_idx ALTER COLUMN 4 SET STATISTICS 1000; +ERROR: column number 4 of relation "attmp_idx" does not exist +ALTER INDEX attmp_idx ALTER COLUMN 2 SET STATISTICS -1; +DROP TABLE attmp; +-- +-- rename - check on both non-temp and temp tables +-- +CREATE TABLE attmp (regtable int); +CREATE TEMP TABLE attmp (attmptable int); +ALTER TABLE attmp RENAME TO attmp_new; +SELECT * FROM attmp; + regtable +---------- +(0 rows) + +SELECT * FROM attmp_new; + attmptable +------------ +(0 rows) + +ALTER TABLE attmp RENAME TO attmp_new2; +SELECT * FROM attmp; -- should fail +ERROR: relation "attmp" does not exist +LINE 1: SELECT * FROM attmp; + ^ +SELECT * FROM attmp_new; + attmptable +------------ +(0 rows) + +SELECT * FROM attmp_new2; + regtable +---------- +(0 rows) + +DROP TABLE attmp_new; +DROP TABLE attmp_new2; +-- check rename of partitioned tables and indexes also +CREATE TABLE part_attmp (a int primary key) partition by range (a); +CREATE TABLE part_attmp1 PARTITION OF part_attmp FOR VALUES FROM (0) TO (100); +ALTER INDEX part_attmp_pkey RENAME TO part_attmp_index; +ALTER INDEX part_attmp1_pkey RENAME TO part_attmp1_index; +ALTER TABLE part_attmp RENAME TO part_at2tmp; +ALTER TABLE part_attmp1 RENAME TO part_at2tmp1; +SET ROLE regress_alter_table_user1; +ALTER INDEX part_attmp_index RENAME TO fail; +ERROR: must be owner of index part_attmp_index +ALTER INDEX part_attmp1_index RENAME TO fail; +ERROR: must be owner of index part_attmp1_index +ALTER TABLE part_at2tmp RENAME TO fail; +ERROR: must be owner of 
table part_at2tmp +ALTER TABLE part_at2tmp1 RENAME TO fail; +ERROR: must be owner of table part_at2tmp1 +RESET ROLE; +DROP TABLE part_at2tmp; +-- +-- check renaming to a table's array type's autogenerated name +-- (the array type's name should get out of the way) +-- +CREATE TABLE attmp_array (id int); +CREATE TABLE attmp_array2 (id int); +SELECT typname FROM pg_type WHERE oid = 'attmp_array[]'::regtype; + typname +-------------- + _attmp_array +(1 row) + +SELECT typname FROM pg_type WHERE oid = 'attmp_array2[]'::regtype; + typname +--------------- + _attmp_array2 +(1 row) + +ALTER TABLE attmp_array2 RENAME TO _attmp_array; +SELECT typname FROM pg_type WHERE oid = 'attmp_array[]'::regtype; + typname +--------------- + __attmp_array +(1 row) + +SELECT typname FROM pg_type WHERE oid = '_attmp_array[]'::regtype; + typname +---------------- + ___attmp_array +(1 row) + +DROP TABLE _attmp_array; +DROP TABLE attmp_array; +-- renaming to table's own array type's name is an interesting corner case +CREATE TABLE attmp_array (id int); +SELECT typname FROM pg_type WHERE oid = 'attmp_array[]'::regtype; + typname +-------------- + _attmp_array +(1 row) + +ALTER TABLE attmp_array RENAME TO _attmp_array; +SELECT typname FROM pg_type WHERE oid = '_attmp_array[]'::regtype; + typname +--------------- + __attmp_array +(1 row) + +DROP TABLE _attmp_array; +-- ALTER TABLE ... 
RENAME on non-table relations +-- renaming indexes (FIXME: this should probably test the index's functionality) +ALTER INDEX IF EXISTS __onek_unique1 RENAME TO attmp_onek_unique1; +NOTICE: relation "__onek_unique1" does not exist, skipping +ALTER INDEX IF EXISTS __attmp_onek_unique1 RENAME TO onek_unique1; +NOTICE: relation "__attmp_onek_unique1" does not exist, skipping +ALTER INDEX onek_unique1 RENAME TO attmp_onek_unique1; +ALTER INDEX attmp_onek_unique1 RENAME TO onek_unique1; +SET ROLE regress_alter_table_user1; +ALTER INDEX onek_unique1 RENAME TO fail; -- permission denied +ERROR: must be owner of index onek_unique1 +RESET ROLE; +-- renaming views +CREATE VIEW attmp_view (unique1) AS SELECT unique1 FROM tenk1; +ALTER TABLE attmp_view RENAME TO attmp_view_new; +SET ROLE regress_alter_table_user1; +ALTER VIEW attmp_view_new RENAME TO fail; -- permission denied +ERROR: must be owner of view attmp_view_new +RESET ROLE; +-- hack to ensure we get an indexscan here +set enable_seqscan to off; +set enable_bitmapscan to off; +-- 5 values, sorted +SELECT unique1 FROM tenk1 WHERE unique1 < 5; + unique1 +--------- + 0 + 1 + 2 + 3 + 4 +(5 rows) + +reset enable_seqscan; +reset enable_bitmapscan; +DROP VIEW attmp_view_new; +-- toast-like relation name +alter table stud_emp rename to pg_toast_stud_emp; +alter table pg_toast_stud_emp rename to stud_emp; +-- renaming index should rename constraint as well +ALTER TABLE onek ADD CONSTRAINT onek_unique1_constraint UNIQUE (unique1); +ALTER INDEX onek_unique1_constraint RENAME TO onek_unique1_constraint_foo; +ALTER TABLE onek DROP CONSTRAINT onek_unique1_constraint_foo; +-- renaming constraint +ALTER TABLE onek ADD CONSTRAINT onek_check_constraint CHECK (unique1 >= 0); +ALTER TABLE onek RENAME CONSTRAINT onek_check_constraint TO onek_check_constraint_foo; +ALTER TABLE onek DROP CONSTRAINT onek_check_constraint_foo; +-- renaming constraint should rename index as well +ALTER TABLE onek ADD CONSTRAINT onek_unique1_constraint UNIQUE 
(unique1); +DROP INDEX onek_unique1_constraint; -- to see whether it's there +ERROR: cannot drop index onek_unique1_constraint because constraint onek_unique1_constraint on table onek requires it +HINT: You can drop constraint onek_unique1_constraint on table onek instead. +ALTER TABLE onek RENAME CONSTRAINT onek_unique1_constraint TO onek_unique1_constraint_foo; +DROP INDEX onek_unique1_constraint_foo; -- to see whether it's there +ERROR: cannot drop index onek_unique1_constraint_foo because constraint onek_unique1_constraint_foo on table onek requires it +HINT: You can drop constraint onek_unique1_constraint_foo on table onek instead. +ALTER TABLE onek DROP CONSTRAINT onek_unique1_constraint_foo; +-- renaming constraints vs. inheritance +CREATE TABLE constraint_rename_test (a int CONSTRAINT con1 CHECK (a > 0), b int, c int); +\d constraint_rename_test + Table "public.constraint_rename_test" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | integer | | | +Check constraints: + "con1" CHECK (a > 0) + +CREATE TABLE constraint_rename_test2 (a int CONSTRAINT con1 CHECK (a > 0), d int) INHERITS (constraint_rename_test); +NOTICE: merging column "a" with inherited definition +NOTICE: merging constraint "con1" with inherited definition +\d constraint_rename_test2 + Table "public.constraint_rename_test2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | integer | | | + d | integer | | | +Check constraints: + "con1" CHECK (a > 0) +Inherits: constraint_rename_test + +ALTER TABLE constraint_rename_test2 RENAME CONSTRAINT con1 TO con1foo; -- fail +ERROR: cannot rename inherited constraint "con1" +ALTER TABLE ONLY constraint_rename_test RENAME CONSTRAINT con1 TO con1foo; -- fail +ERROR: inherited constraint "con1" must be renamed in child tables too +ALTER TABLE constraint_rename_test 
RENAME CONSTRAINT con1 TO con1foo; -- ok +\d constraint_rename_test + Table "public.constraint_rename_test" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | integer | | | +Check constraints: + "con1foo" CHECK (a > 0) +Number of child tables: 1 (Use \d+ to list them.) + +\d constraint_rename_test2 + Table "public.constraint_rename_test2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | integer | | | + d | integer | | | +Check constraints: + "con1foo" CHECK (a > 0) +Inherits: constraint_rename_test + +ALTER TABLE constraint_rename_test ADD CONSTRAINT con2 CHECK (b > 0) NO INHERIT; +ALTER TABLE ONLY constraint_rename_test RENAME CONSTRAINT con2 TO con2foo; -- ok +ALTER TABLE constraint_rename_test RENAME CONSTRAINT con2foo TO con2bar; -- ok +\d constraint_rename_test + Table "public.constraint_rename_test" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | integer | | | +Check constraints: + "con1foo" CHECK (a > 0) + "con2bar" CHECK (b > 0) NO INHERIT +Number of child tables: 1 (Use \d+ to list them.) 
+ +\d constraint_rename_test2 + Table "public.constraint_rename_test2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | integer | | | + d | integer | | | +Check constraints: + "con1foo" CHECK (a > 0) +Inherits: constraint_rename_test + +ALTER TABLE constraint_rename_test ADD CONSTRAINT con3 PRIMARY KEY (a); +ALTER TABLE constraint_rename_test RENAME CONSTRAINT con3 TO con3foo; -- ok +\d constraint_rename_test + Table "public.constraint_rename_test" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | not null | + b | integer | | | + c | integer | | | +Indexes: + "con3foo" PRIMARY KEY, btree (a) +Check constraints: + "con1foo" CHECK (a > 0) + "con2bar" CHECK (b > 0) NO INHERIT +Number of child tables: 1 (Use \d+ to list them.) + +\d constraint_rename_test2 + Table "public.constraint_rename_test2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | not null | + b | integer | | | + c | integer | | | + d | integer | | | +Check constraints: + "con1foo" CHECK (a > 0) +Inherits: constraint_rename_test + +DROP TABLE constraint_rename_test2; +DROP TABLE constraint_rename_test; +ALTER TABLE IF EXISTS constraint_not_exist RENAME CONSTRAINT con3 TO con3foo; -- ok +NOTICE: relation "constraint_not_exist" does not exist, skipping +ALTER TABLE IF EXISTS constraint_rename_test ADD CONSTRAINT con4 UNIQUE (a); +NOTICE: relation "constraint_rename_test" does not exist, skipping +-- renaming constraints with cache reset of target relation +CREATE TABLE constraint_rename_cache (a int, + CONSTRAINT chk_a CHECK (a > 0), + PRIMARY KEY (a)); +ALTER TABLE constraint_rename_cache + RENAME CONSTRAINT chk_a TO chk_a_new; +ALTER TABLE constraint_rename_cache + RENAME CONSTRAINT constraint_rename_cache_pkey TO constraint_rename_pkey_new; +CREATE TABLE 
like_constraint_rename_cache + (LIKE constraint_rename_cache INCLUDING ALL); +\d like_constraint_rename_cache + Table "public.like_constraint_rename_cache" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | not null | +Indexes: + "like_constraint_rename_cache_pkey" PRIMARY KEY, btree (a) +Check constraints: + "chk_a_new" CHECK (a > 0) + +DROP TABLE constraint_rename_cache; +DROP TABLE like_constraint_rename_cache; +-- FOREIGN KEY CONSTRAINT adding TEST +CREATE TABLE attmp2 (a int primary key); +CREATE TABLE attmp3 (a int, b int); +CREATE TABLE attmp4 (a int, b int, unique(a,b)); +CREATE TABLE attmp5 (a int, b int); +-- Insert rows into attmp2 (pktable) +INSERT INTO attmp2 values (1); +INSERT INTO attmp2 values (2); +INSERT INTO attmp2 values (3); +INSERT INTO attmp2 values (4); +-- Insert rows into attmp3 +INSERT INTO attmp3 values (1,10); +INSERT INTO attmp3 values (1,20); +INSERT INTO attmp3 values (5,50); +-- Try (and fail) to add constraint due to invalid source columns +ALTER TABLE attmp3 add constraint attmpconstr foreign key(c) references attmp2 match full; +ERROR: column "c" referenced in foreign key constraint does not exist +-- Try (and fail) to add constraint due to invalid destination columns explicitly given +ALTER TABLE attmp3 add constraint attmpconstr foreign key(a) references attmp2(b) match full; +ERROR: column "b" referenced in foreign key constraint does not exist +-- Try (and fail) to add constraint due to invalid data +ALTER TABLE attmp3 add constraint attmpconstr foreign key (a) references attmp2 match full; +ERROR: insert or update on table "attmp3" violates foreign key constraint "attmpconstr" +DETAIL: Key (a)=(5) is not present in table "attmp2". 
+-- Delete failing row +DELETE FROM attmp3 where a=5; +-- Try (and succeed) +ALTER TABLE attmp3 add constraint attmpconstr foreign key (a) references attmp2 match full; +ALTER TABLE attmp3 drop constraint attmpconstr; +INSERT INTO attmp3 values (5,50); +-- Try NOT VALID and then VALIDATE CONSTRAINT, but fails. Delete failure then re-validate +ALTER TABLE attmp3 add constraint attmpconstr foreign key (a) references attmp2 match full NOT VALID; +ALTER TABLE attmp3 validate constraint attmpconstr; +ERROR: insert or update on table "attmp3" violates foreign key constraint "attmpconstr" +DETAIL: Key (a)=(5) is not present in table "attmp2". +-- Delete failing row +DELETE FROM attmp3 where a=5; +-- Try (and succeed) and repeat to show it works on already valid constraint +ALTER TABLE attmp3 validate constraint attmpconstr; +ALTER TABLE attmp3 validate constraint attmpconstr; +-- Try a non-verified CHECK constraint +ALTER TABLE attmp3 ADD CONSTRAINT b_greater_than_ten CHECK (b > 10); -- fail +ERROR: check constraint "b_greater_than_ten" of relation "attmp3" is violated by some row +ALTER TABLE attmp3 ADD CONSTRAINT b_greater_than_ten CHECK (b > 10) NOT VALID; -- succeeds +ALTER TABLE attmp3 VALIDATE CONSTRAINT b_greater_than_ten; -- fails +ERROR: check constraint "b_greater_than_ten" of relation "attmp3" is violated by some row +DELETE FROM attmp3 WHERE NOT b > 10; +ALTER TABLE attmp3 VALIDATE CONSTRAINT b_greater_than_ten; -- succeeds +ALTER TABLE attmp3 VALIDATE CONSTRAINT b_greater_than_ten; -- succeeds +-- Test inherited NOT VALID CHECK constraints +select * from attmp3; + a | b +---+---- + 1 | 20 +(1 row) + +CREATE TABLE attmp6 () INHERITS (attmp3); +CREATE TABLE attmp7 () INHERITS (attmp3); +INSERT INTO attmp6 VALUES (6, 30), (7, 16); +ALTER TABLE attmp3 ADD CONSTRAINT b_le_20 CHECK (b <= 20) NOT VALID; +ALTER TABLE attmp3 VALIDATE CONSTRAINT b_le_20; -- fails +ERROR: check constraint "b_le_20" of relation "attmp6" is violated by some row +DELETE FROM attmp6 WHERE b 
> 20; +ALTER TABLE attmp3 VALIDATE CONSTRAINT b_le_20; -- succeeds +-- An already validated constraint must not be revalidated +CREATE FUNCTION boo(int) RETURNS int IMMUTABLE STRICT LANGUAGE plpgsql AS $$ BEGIN RAISE NOTICE 'boo: %', $1; RETURN $1; END; $$; +INSERT INTO attmp7 VALUES (8, 18); +ALTER TABLE attmp7 ADD CONSTRAINT identity CHECK (b = boo(b)); +NOTICE: boo: 18 +ALTER TABLE attmp3 ADD CONSTRAINT IDENTITY check (b = boo(b)) NOT VALID; +NOTICE: merging constraint "identity" with inherited definition +ALTER TABLE attmp3 VALIDATE CONSTRAINT identity; +NOTICE: boo: 20 +NOTICE: boo: 16 +-- A NO INHERIT constraint should not be looked for in children during VALIDATE CONSTRAINT +create table parent_noinh_convalid (a int); +create table child_noinh_convalid () inherits (parent_noinh_convalid); +insert into parent_noinh_convalid values (1); +insert into child_noinh_convalid values (1); +alter table parent_noinh_convalid add constraint check_a_is_2 check (a = 2) no inherit not valid; +-- fail, because of the row in parent +alter table parent_noinh_convalid validate constraint check_a_is_2; +ERROR: check constraint "check_a_is_2" of relation "parent_noinh_convalid" is violated by some row +delete from only parent_noinh_convalid; +-- ok (parent itself contains no violating rows) +alter table parent_noinh_convalid validate constraint check_a_is_2; +select convalidated from pg_constraint where conrelid = 'parent_noinh_convalid'::regclass and conname = 'check_a_is_2'; + convalidated +-------------- + t +(1 row) + +-- cleanup +drop table parent_noinh_convalid, child_noinh_convalid; +-- Try (and fail) to create constraint from attmp5(a) to attmp4(a) - unique constraint on +-- attmp4 is a,b +ALTER TABLE attmp5 add constraint attmpconstr foreign key(a) references attmp4(a) match full; +ERROR: there is no unique constraint matching given keys for referenced table "attmp4" +DROP TABLE attmp7; +DROP TABLE attmp6; +DROP TABLE attmp5; +DROP TABLE attmp4; +DROP TABLE attmp3; 
+DROP TABLE attmp2; +-- NOT VALID with plan invalidation -- ensure we don't use a constraint for +-- exclusion until validated +set constraint_exclusion TO 'partition'; +create table nv_parent (d date, check (false) no inherit not valid); +-- not valid constraint added at creation time should automatically become valid +\d nv_parent + Table "public.nv_parent" + Column | Type | Collation | Nullable | Default +--------+------+-----------+----------+--------- + d | date | | | +Check constraints: + "nv_parent_check" CHECK (false) NO INHERIT + +create table nv_child_2010 () inherits (nv_parent); +create table nv_child_2011 () inherits (nv_parent); +alter table nv_child_2010 add check (d between '2010-01-01'::date and '2010-12-31'::date) not valid; +alter table nv_child_2011 add check (d between '2011-01-01'::date and '2011-12-31'::date) not valid; +explain (costs off) select * from nv_parent where d between '2011-08-01' and '2011-08-31'; + QUERY PLAN +--------------------------------------------------------------------------- + Append + -> Seq Scan on nv_parent nv_parent_1 + Filter: ((d >= '08-01-2011'::date) AND (d <= '08-31-2011'::date)) + -> Seq Scan on nv_child_2010 nv_parent_2 + Filter: ((d >= '08-01-2011'::date) AND (d <= '08-31-2011'::date)) + -> Seq Scan on nv_child_2011 nv_parent_3 + Filter: ((d >= '08-01-2011'::date) AND (d <= '08-31-2011'::date)) +(7 rows) + +create table nv_child_2009 (check (d between '2009-01-01'::date and '2009-12-31'::date)) inherits (nv_parent); +explain (costs off) select * from nv_parent where d between '2011-08-01'::date and '2011-08-31'::date; + QUERY PLAN +--------------------------------------------------------------------------- + Append + -> Seq Scan on nv_parent nv_parent_1 + Filter: ((d >= '08-01-2011'::date) AND (d <= '08-31-2011'::date)) + -> Seq Scan on nv_child_2010 nv_parent_2 + Filter: ((d >= '08-01-2011'::date) AND (d <= '08-31-2011'::date)) + -> Seq Scan on nv_child_2011 nv_parent_3 + Filter: ((d >= '08-01-2011'::date) 
AND (d <= '08-31-2011'::date)) +(7 rows) + +explain (costs off) select * from nv_parent where d between '2009-08-01'::date and '2009-08-31'::date; + QUERY PLAN +--------------------------------------------------------------------------- + Append + -> Seq Scan on nv_parent nv_parent_1 + Filter: ((d >= '08-01-2009'::date) AND (d <= '08-31-2009'::date)) + -> Seq Scan on nv_child_2010 nv_parent_2 + Filter: ((d >= '08-01-2009'::date) AND (d <= '08-31-2009'::date)) + -> Seq Scan on nv_child_2011 nv_parent_3 + Filter: ((d >= '08-01-2009'::date) AND (d <= '08-31-2009'::date)) + -> Seq Scan on nv_child_2009 nv_parent_4 + Filter: ((d >= '08-01-2009'::date) AND (d <= '08-31-2009'::date)) +(9 rows) + +-- after validation, the constraint should be used +alter table nv_child_2011 VALIDATE CONSTRAINT nv_child_2011_d_check; +explain (costs off) select * from nv_parent where d between '2009-08-01'::date and '2009-08-31'::date; + QUERY PLAN +--------------------------------------------------------------------------- + Append + -> Seq Scan on nv_parent nv_parent_1 + Filter: ((d >= '08-01-2009'::date) AND (d <= '08-31-2009'::date)) + -> Seq Scan on nv_child_2010 nv_parent_2 + Filter: ((d >= '08-01-2009'::date) AND (d <= '08-31-2009'::date)) + -> Seq Scan on nv_child_2009 nv_parent_3 + Filter: ((d >= '08-01-2009'::date) AND (d <= '08-31-2009'::date)) +(7 rows) + +-- add an inherited NOT VALID constraint +alter table nv_parent add check (d between '2001-01-01'::date and '2099-12-31'::date) not valid; +\d nv_child_2009 + Table "public.nv_child_2009" + Column | Type | Collation | Nullable | Default +--------+------+-----------+----------+--------- + d | date | | | +Check constraints: + "nv_child_2009_d_check" CHECK (d >= '01-01-2009'::date AND d <= '12-31-2009'::date) + "nv_parent_d_check" CHECK (d >= '01-01-2001'::date AND d <= '12-31-2099'::date) NOT VALID +Inherits: nv_parent + +-- we leave nv_parent and children around to help test pg_dump logic +-- Foreign key adding test with mixed 
types +-- Note: these tables are TEMP to avoid name conflicts when this test +-- is run in parallel with foreign_key.sql. +CREATE TEMP TABLE PKTABLE (ptest1 int PRIMARY KEY); +INSERT INTO PKTABLE VALUES(42); +CREATE TEMP TABLE FKTABLE (ftest1 inet); +-- This next should fail, because int=inet does not exist +ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest1) references pktable; +ERROR: foreign key constraint "fktable_ftest1_fkey" cannot be implemented +DETAIL: Key columns "ftest1" and "ptest1" are of incompatible types: inet and integer. +-- This should also fail for the same reason, but here we +-- give the column name +ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest1) references pktable(ptest1); +ERROR: foreign key constraint "fktable_ftest1_fkey" cannot be implemented +DETAIL: Key columns "ftest1" and "ptest1" are of incompatible types: inet and integer. +DROP TABLE FKTABLE; +-- This should succeed, even though they are different types, +-- because int=int8 exists and is a member of the integer opfamily +CREATE TEMP TABLE FKTABLE (ftest1 int8); +ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest1) references pktable; +-- Check it actually works +INSERT INTO FKTABLE VALUES(42); -- should succeed +INSERT INTO FKTABLE VALUES(43); -- should fail +ERROR: insert or update on table "fktable" violates foreign key constraint "fktable_ftest1_fkey" +DETAIL: Key (ftest1)=(43) is not present in table "pktable". +DROP TABLE FKTABLE; +-- This should fail, because we'd have to cast numeric to int which is +-- not an implicit coercion (or use numeric=numeric, but that's not part +-- of the integer opfamily) +CREATE TEMP TABLE FKTABLE (ftest1 numeric); +ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest1) references pktable; +ERROR: foreign key constraint "fktable_ftest1_fkey" cannot be implemented +DETAIL: Key columns "ftest1" and "ptest1" are of incompatible types: numeric and integer. 
+DROP TABLE FKTABLE; +DROP TABLE PKTABLE; +-- On the other hand, this should work because int implicitly promotes to +-- numeric, and we allow promotion on the FK side +CREATE TEMP TABLE PKTABLE (ptest1 numeric PRIMARY KEY); +INSERT INTO PKTABLE VALUES(42); +CREATE TEMP TABLE FKTABLE (ftest1 int); +ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest1) references pktable; +-- Check it actually works +INSERT INTO FKTABLE VALUES(42); -- should succeed +INSERT INTO FKTABLE VALUES(43); -- should fail +ERROR: insert or update on table "fktable" violates foreign key constraint "fktable_ftest1_fkey" +DETAIL: Key (ftest1)=(43) is not present in table "pktable". +DROP TABLE FKTABLE; +DROP TABLE PKTABLE; +CREATE TEMP TABLE PKTABLE (ptest1 int, ptest2 inet, + PRIMARY KEY(ptest1, ptest2)); +-- This should fail, because we just chose really odd types +CREATE TEMP TABLE FKTABLE (ftest1 cidr, ftest2 timestamp); +ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest1, ftest2) references pktable; +ERROR: foreign key constraint "fktable_ftest1_ftest2_fkey" cannot be implemented +DETAIL: Key columns "ftest1" and "ptest1" are of incompatible types: cidr and integer. +DROP TABLE FKTABLE; +-- Again, so should this... +CREATE TEMP TABLE FKTABLE (ftest1 cidr, ftest2 timestamp); +ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest1, ftest2) + references pktable(ptest1, ptest2); +ERROR: foreign key constraint "fktable_ftest1_ftest2_fkey" cannot be implemented +DETAIL: Key columns "ftest1" and "ptest1" are of incompatible types: cidr and integer. +DROP TABLE FKTABLE; +-- This fails because we mixed up the column ordering +CREATE TEMP TABLE FKTABLE (ftest1 int, ftest2 inet); +ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest1, ftest2) + references pktable(ptest2, ptest1); +ERROR: foreign key constraint "fktable_ftest1_ftest2_fkey" cannot be implemented +DETAIL: Key columns "ftest1" and "ptest2" are of incompatible types: integer and inet. +-- As does this... 
+ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest2, ftest1) + references pktable(ptest1, ptest2); +ERROR: foreign key constraint "fktable_ftest2_ftest1_fkey" cannot be implemented +DETAIL: Key columns "ftest2" and "ptest1" are of incompatible types: inet and integer. +DROP TABLE FKTABLE; +DROP TABLE PKTABLE; +-- Test that ALTER CONSTRAINT updates trigger deferrability properly +CREATE TEMP TABLE PKTABLE (ptest1 int primary key); +CREATE TEMP TABLE FKTABLE (ftest1 int); +ALTER TABLE FKTABLE ADD CONSTRAINT fknd FOREIGN KEY(ftest1) REFERENCES pktable + ON DELETE CASCADE ON UPDATE NO ACTION NOT DEFERRABLE; +ALTER TABLE FKTABLE ADD CONSTRAINT fkdd FOREIGN KEY(ftest1) REFERENCES pktable + ON DELETE CASCADE ON UPDATE NO ACTION DEFERRABLE INITIALLY DEFERRED; +ALTER TABLE FKTABLE ADD CONSTRAINT fkdi FOREIGN KEY(ftest1) REFERENCES pktable + ON DELETE CASCADE ON UPDATE NO ACTION DEFERRABLE INITIALLY IMMEDIATE; +ALTER TABLE FKTABLE ADD CONSTRAINT fknd2 FOREIGN KEY(ftest1) REFERENCES pktable + ON DELETE CASCADE ON UPDATE NO ACTION DEFERRABLE INITIALLY DEFERRED; +ALTER TABLE FKTABLE ALTER CONSTRAINT fknd2 NOT DEFERRABLE; +ALTER TABLE FKTABLE ADD CONSTRAINT fkdd2 FOREIGN KEY(ftest1) REFERENCES pktable + ON DELETE CASCADE ON UPDATE NO ACTION NOT DEFERRABLE; +ALTER TABLE FKTABLE ALTER CONSTRAINT fkdd2 DEFERRABLE INITIALLY DEFERRED; +ALTER TABLE FKTABLE ADD CONSTRAINT fkdi2 FOREIGN KEY(ftest1) REFERENCES pktable + ON DELETE CASCADE ON UPDATE NO ACTION NOT DEFERRABLE; +ALTER TABLE FKTABLE ALTER CONSTRAINT fkdi2 DEFERRABLE INITIALLY IMMEDIATE; +SELECT conname, tgfoid::regproc, tgtype, tgdeferrable, tginitdeferred +FROM pg_trigger JOIN pg_constraint con ON con.oid = tgconstraint +WHERE tgrelid = 'pktable'::regclass +ORDER BY 1,2,3; + conname | tgfoid | tgtype | tgdeferrable | tginitdeferred +---------+------------------------+--------+--------------+---------------- + fkdd | "RI_FKey_cascade_del" | 9 | f | f + fkdd | "RI_FKey_noaction_upd" | 17 | t | t + fkdd2 | "RI_FKey_cascade_del" | 9 | f 
| f + fkdd2 | "RI_FKey_noaction_upd" | 17 | t | t + fkdi | "RI_FKey_cascade_del" | 9 | f | f + fkdi | "RI_FKey_noaction_upd" | 17 | t | f + fkdi2 | "RI_FKey_cascade_del" | 9 | f | f + fkdi2 | "RI_FKey_noaction_upd" | 17 | t | f + fknd | "RI_FKey_cascade_del" | 9 | f | f + fknd | "RI_FKey_noaction_upd" | 17 | f | f + fknd2 | "RI_FKey_cascade_del" | 9 | f | f + fknd2 | "RI_FKey_noaction_upd" | 17 | f | f +(12 rows) + +SELECT conname, tgfoid::regproc, tgtype, tgdeferrable, tginitdeferred +FROM pg_trigger JOIN pg_constraint con ON con.oid = tgconstraint +WHERE tgrelid = 'fktable'::regclass +ORDER BY 1,2,3; + conname | tgfoid | tgtype | tgdeferrable | tginitdeferred +---------+---------------------+--------+--------------+---------------- + fkdd | "RI_FKey_check_ins" | 5 | t | t + fkdd | "RI_FKey_check_upd" | 17 | t | t + fkdd2 | "RI_FKey_check_ins" | 5 | t | t + fkdd2 | "RI_FKey_check_upd" | 17 | t | t + fkdi | "RI_FKey_check_ins" | 5 | t | f + fkdi | "RI_FKey_check_upd" | 17 | t | f + fkdi2 | "RI_FKey_check_ins" | 5 | t | f + fkdi2 | "RI_FKey_check_upd" | 17 | t | f + fknd | "RI_FKey_check_ins" | 5 | f | f + fknd | "RI_FKey_check_upd" | 17 | f | f + fknd2 | "RI_FKey_check_ins" | 5 | f | f + fknd2 | "RI_FKey_check_upd" | 17 | f | f +(12 rows) + +-- temp tables should go away by themselves, need not drop them. +-- test check constraint adding +create table atacc1 ( test int ); +-- add a check constraint +alter table atacc1 add constraint atacc_test1 check (test>3); +-- should fail +insert into atacc1 (test) values (2); +ERROR: new row for relation "atacc1" violates check constraint "atacc_test1" +DETAIL: Failing row contains (2). 
+-- should succeed +insert into atacc1 (test) values (4); +drop table atacc1; +-- let's do one where the check fails when added +create table atacc1 ( test int ); +-- insert a soon to be failing row +insert into atacc1 (test) values (2); +-- add a check constraint (fails) +alter table atacc1 add constraint atacc_test1 check (test>3); +ERROR: check constraint "atacc_test1" of relation "atacc1" is violated by some row +insert into atacc1 (test) values (4); +drop table atacc1; +-- let's do one where the check fails because the column doesn't exist +create table atacc1 ( test int ); +-- add a check constraint (fails) +alter table atacc1 add constraint atacc_test1 check (test1>3); +ERROR: column "test1" does not exist +HINT: Perhaps you meant to reference the column "atacc1.test". +drop table atacc1; +-- something a little more complicated +create table atacc1 ( test int, test2 int, test3 int); +-- add a check constraint (fails) +alter table atacc1 add constraint atacc_test1 check (test+test2<test3*4); +-- should fail +insert into atacc1 (test,test2,test3) values (4,4,2); +ERROR: new row for relation "atacc1" violates check constraint "atacc_test1" +DETAIL: Failing row contains (4, 4, 2). +-- should succeed +insert into atacc1 (test,test2,test3) values (4,4,5); +drop table atacc1; +-- lets do one where the check fails in the middle of the update +create table atacc1 ( test int check (test>3), test2 int); +alter table atacc1 add check (test2>test); +-- should fail for $2 +insert into atacc1 (test2, test) values (3, 4); +ERROR: new row for relation "atacc1" violates check constraint "atacc1_check" +DETAIL: Failing row contains (4, 3). +drop table atacc1; +-- inheritance related tests +create table atacc1 (test int); +create table atacc2 (test2 int); +create table atacc3 (test3 int) inherits (atacc1, atacc2); +alter table atacc2 add constraint foo check (test2>0); +-- fail and then succeed on atacc2 +insert into atacc2 (test2) values (-3); +ERROR: new row for relation "atacc2" violates check constraint "foo" +DETAIL: Failing row contains (-3). +insert into atacc2 (test2) values (3); +-- fail and then succeed on atacc3 +insert into atacc3 (test2) values (-3); +ERROR: new row for relation "atacc3" violates check constraint "foo" +DETAIL: Failing row contains (null, -3, null). 
+insert into atacc3 (test2) values (3); +drop table atacc3; +drop table atacc2; +drop table atacc1; +-- same things with one created with INHERIT +create table atacc1 (test int); +create table atacc2 (test2 int); +create table atacc3 (test3 int) inherits (atacc1, atacc2); +alter table atacc3 no inherit atacc2; +-- fail +alter table atacc3 no inherit atacc2; +ERROR: relation "atacc2" is not a parent of relation "atacc3" +-- make sure it really isn't a child +insert into atacc3 (test2) values (3); +select test2 from atacc2; + test2 +------- +(0 rows) + +-- fail due to missing constraint +alter table atacc2 add constraint foo check (test2>0); +alter table atacc3 inherit atacc2; +ERROR: child table is missing constraint "foo" +-- fail due to missing column +alter table atacc3 rename test2 to testx; +alter table atacc3 inherit atacc2; +ERROR: child table is missing column "test2" +-- fail due to mismatched data type +alter table atacc3 add test2 bool; +alter table atacc3 inherit atacc2; +ERROR: child table "atacc3" has different type for column "test2" +alter table atacc3 drop test2; +-- succeed +alter table atacc3 add test2 int; +update atacc3 set test2 = 4 where test2 is null; +alter table atacc3 add constraint foo check (test2>0); +alter table atacc3 inherit atacc2; +-- fail due to duplicates and circular inheritance +alter table atacc3 inherit atacc2; +ERROR: relation "atacc2" would be inherited from more than once +alter table atacc2 inherit atacc3; +ERROR: circular inheritance not allowed +DETAIL: "atacc3" is already a child of "atacc2". +alter table atacc2 inherit atacc2; +ERROR: circular inheritance not allowed +DETAIL: "atacc2" is already a child of "atacc2". 
+-- test that we really are a child now (should see 4 not 3 and cascade should go through) +select test2 from atacc2; + test2 +------- + 4 +(1 row) + +drop table atacc2 cascade; +NOTICE: drop cascades to table atacc3 +drop table atacc1; +-- adding only to a parent is allowed as of 9.2 +create table atacc1 (test int); +create table atacc2 (test2 int) inherits (atacc1); +-- ok: +alter table atacc1 add constraint foo check (test>0) no inherit; +-- check constraint is not there on child +insert into atacc2 (test) values (-3); +-- check constraint is there on parent +insert into atacc1 (test) values (-3); +ERROR: new row for relation "atacc1" violates check constraint "foo" +DETAIL: Failing row contains (-3). +insert into atacc1 (test) values (3); +-- fail, violating row: +alter table atacc2 add constraint foo check (test>0) no inherit; +ERROR: check constraint "foo" of relation "atacc2" is violated by some row +drop table atacc2; +drop table atacc1; +-- test unique constraint adding +create table atacc1 ( test int ) ; +-- add a unique constraint +alter table atacc1 add constraint atacc_test1 unique (test); +-- insert first value +insert into atacc1 (test) values (2); +-- should fail +insert into atacc1 (test) values (2); +ERROR: duplicate key value violates unique constraint "atacc_test1" +DETAIL: Key (test)=(2) already exists. +-- should succeed +insert into atacc1 (test) values (4); +-- try to create duplicates via alter table using - should fail +alter table atacc1 alter column test type integer using 0; +ERROR: could not create unique index "atacc_test1" +DETAIL: Key (test)=(0) is duplicated. 
+drop table atacc1; +-- let's do one where the unique constraint fails when added +create table atacc1 ( test int ); +-- insert soon to be failing rows +insert into atacc1 (test) values (2); +insert into atacc1 (test) values (2); +-- add a unique constraint (fails) +alter table atacc1 add constraint atacc_test1 unique (test); +ERROR: could not create unique index "atacc_test1" +DETAIL: Key (test)=(2) is duplicated. +insert into atacc1 (test) values (3); +drop table atacc1; +-- let's do one where the unique constraint fails +-- because the column doesn't exist +create table atacc1 ( test int ); +-- add a unique constraint (fails) +alter table atacc1 add constraint atacc_test1 unique (test1); +ERROR: column "test1" named in key does not exist +drop table atacc1; +-- something a little more complicated +create table atacc1 ( test int, test2 int); +-- add a unique constraint +alter table atacc1 add constraint atacc_test1 unique (test, test2); +-- insert initial value +insert into atacc1 (test,test2) values (4,4); +-- should fail +insert into atacc1 (test,test2) values (4,4); +ERROR: duplicate key value violates unique constraint "atacc_test1" +DETAIL: Key (test, test2)=(4, 4) already exists. +-- should all succeed +insert into atacc1 (test,test2) values (4,5); +insert into atacc1 (test,test2) values (5,4); +insert into atacc1 (test,test2) values (5,5); +drop table atacc1; +-- lets do some naming tests +create table atacc1 (test int, test2 int, unique(test)); +alter table atacc1 add unique (test2); +-- should fail for @@ second one @@ +insert into atacc1 (test2, test) values (3, 3); +insert into atacc1 (test2, test) values (2, 3); +ERROR: duplicate key value violates unique constraint "atacc1_test_key" +DETAIL: Key (test)=(3) already exists. 
+drop table atacc1; +-- test primary key constraint adding +create table atacc1 ( id serial, test int) ; +-- add a primary key constraint +alter table atacc1 add constraint atacc_test1 primary key (test); +-- insert first value +insert into atacc1 (test) values (2); +-- should fail +insert into atacc1 (test) values (2); +ERROR: duplicate key value violates unique constraint "atacc_test1" +DETAIL: Key (test)=(2) already exists. +-- should succeed +insert into atacc1 (test) values (4); +-- inserting NULL should fail +insert into atacc1 (test) values(NULL); +ERROR: null value in column "test" of relation "atacc1" violates not-null constraint +DETAIL: Failing row contains (4, null). +-- try adding a second primary key (should fail) +alter table atacc1 add constraint atacc_oid1 primary key(id); +ERROR: multiple primary keys for table "atacc1" are not allowed +-- drop first primary key constraint +alter table atacc1 drop constraint atacc_test1 restrict; +-- try adding a primary key on oid (should succeed) +alter table atacc1 add constraint atacc_oid1 primary key(id); +drop table atacc1; +-- let's do one where the primary key constraint fails when added +create table atacc1 ( test int ); +-- insert soon to be failing rows +insert into atacc1 (test) values (2); +insert into atacc1 (test) values (2); +-- add a primary key (fails) +alter table atacc1 add constraint atacc_test1 primary key (test); +ERROR: could not create unique index "atacc_test1" +DETAIL: Key (test)=(2) is duplicated. 
+insert into atacc1 (test) values (3); +drop table atacc1; +-- let's do another one where the primary key constraint fails when added +create table atacc1 ( test int ); +-- insert soon to be failing row +insert into atacc1 (test) values (NULL); +-- add a primary key (fails) +alter table atacc1 add constraint atacc_test1 primary key (test); +ERROR: column "test" of relation "atacc1" contains null values +insert into atacc1 (test) values (3); +drop table atacc1; +-- let's do one where the primary key constraint fails +-- because the column doesn't exist +create table atacc1 ( test int ); +-- add a primary key constraint (fails) +alter table atacc1 add constraint atacc_test1 primary key (test1); +ERROR: column "test1" of relation "atacc1" does not exist +drop table atacc1; +-- adding a new column as primary key to a non-empty table. +-- should fail unless the column has a non-null default value. +create table atacc1 ( test int ); +insert into atacc1 (test) values (0); +-- add a primary key column without a default (fails). +alter table atacc1 add column test2 int primary key; +ERROR: column "test2" of relation "atacc1" contains null values +-- now add a primary key column with a default (succeeds). +alter table atacc1 add column test2 int default 0 primary key; +drop table atacc1; +-- this combination used to have order-of-execution problems (bug #15580) +create table atacc1 (a int); +insert into atacc1 values(1); +alter table atacc1 + add column b float8 not null default random(), + add primary key(a); +drop table atacc1; +-- additionally, we've seen issues with foreign key validation not being +-- properly delayed until after a table rewrite. Check that works ok. 
+create table atacc1 (a int primary key); +alter table atacc1 add constraint atacc1_fkey foreign key (a) references atacc1 (a) not valid; +alter table atacc1 validate constraint atacc1_fkey, alter a type bigint; +drop table atacc1; +-- we've also seen issues with check constraints being validated at the wrong +-- time when there's a pending table rewrite. +create table atacc1 (a bigint, b int); +insert into atacc1 values(1,1); +alter table atacc1 add constraint atacc1_chk check(b = 1) not valid; +alter table atacc1 validate constraint atacc1_chk, alter a type int; +drop table atacc1; +-- same as above, but ensure the constraint violation is detected +create table atacc1 (a bigint, b int); +insert into atacc1 values(1,2); +alter table atacc1 add constraint atacc1_chk check(b = 1) not valid; +alter table atacc1 validate constraint atacc1_chk, alter a type int; +ERROR: check constraint "atacc1_chk" of relation "atacc1" is violated by some row +drop table atacc1; +-- something a little more complicated +create table atacc1 ( test int, test2 int); +-- add a primary key constraint +alter table atacc1 add constraint atacc_test1 primary key (test, test2); +-- try adding a second primary key - should fail +alter table atacc1 add constraint atacc_test2 primary key (test); +ERROR: multiple primary keys for table "atacc1" are not allowed +-- insert initial value +insert into atacc1 (test,test2) values (4,4); +-- should fail +insert into atacc1 (test,test2) values (4,4); +ERROR: duplicate key value violates unique constraint "atacc_test1" +DETAIL: Key (test, test2)=(4, 4) already exists. +insert into atacc1 (test,test2) values (NULL,3); +ERROR: null value in column "test" of relation "atacc1" violates not-null constraint +DETAIL: Failing row contains (null, 3). +insert into atacc1 (test,test2) values (3, NULL); +ERROR: null value in column "test2" of relation "atacc1" violates not-null constraint +DETAIL: Failing row contains (3, null). 
+insert into atacc1 (test,test2) values (NULL,NULL); +ERROR: null value in column "test" of relation "atacc1" violates not-null constraint +DETAIL: Failing row contains (null, null). +-- should all succeed +insert into atacc1 (test,test2) values (4,5); +insert into atacc1 (test,test2) values (5,4); +insert into atacc1 (test,test2) values (5,5); +drop table atacc1; +-- lets do some naming tests +create table atacc1 (test int, test2 int, primary key(test)); +-- only first should succeed +insert into atacc1 (test2, test) values (3, 3); +insert into atacc1 (test2, test) values (2, 3); +ERROR: duplicate key value violates unique constraint "atacc1_pkey" +DETAIL: Key (test)=(3) already exists. +insert into atacc1 (test2, test) values (1, NULL); +ERROR: null value in column "test" of relation "atacc1" violates not-null constraint +DETAIL: Failing row contains (null, 1). +drop table atacc1; +-- alter table / alter column [set/drop] not null tests +-- try altering system catalogs, should fail +alter table pg_class alter column relname drop not null; +ERROR: permission denied: "pg_class" is a system catalog +alter table pg_class alter relname set not null; +ERROR: permission denied: "pg_class" is a system catalog +-- try altering non-existent table, should fail +alter table non_existent alter column bar set not null; +ERROR: relation "non_existent" does not exist +alter table non_existent alter column bar drop not null; +ERROR: relation "non_existent" does not exist +-- test setting columns to null and not null and vice versa +-- test checking for null values and primary key +create table atacc1 (test int not null); +alter table atacc1 add constraint "atacc1_pkey" primary key (test); +alter table atacc1 alter column test drop not null; +ERROR: column "test" is in a primary key +alter table atacc1 drop constraint "atacc1_pkey"; +alter table atacc1 alter column test drop not null; +insert into atacc1 values (null); +alter table atacc1 alter test set not null; +ERROR: column 
"test" of relation "atacc1" contains null values +delete from atacc1; +alter table atacc1 alter test set not null; +-- try altering a non-existent column, should fail +alter table atacc1 alter bar set not null; +ERROR: column "bar" of relation "atacc1" does not exist +alter table atacc1 alter bar drop not null; +ERROR: column "bar" of relation "atacc1" does not exist +-- try creating a view and altering that, should fail +create view myview as select * from atacc1; +alter table myview alter column test drop not null; +ERROR: "myview" is not a table or foreign table +alter table myview alter column test set not null; +ERROR: "myview" is not a table or foreign table +drop view myview; +drop table atacc1; +-- set not null verified by constraints +create table atacc1 (test_a int, test_b int); +insert into atacc1 values (null, 1); +-- constraint not cover all values, should fail +alter table atacc1 add constraint atacc1_constr_or check(test_a is not null or test_b < 10); +alter table atacc1 alter test_a set not null; +ERROR: column "test_a" of relation "atacc1" contains null values +alter table atacc1 drop constraint atacc1_constr_or; +-- not valid constraint, should fail +alter table atacc1 add constraint atacc1_constr_invalid check(test_a is not null) not valid; +alter table atacc1 alter test_a set not null; +ERROR: column "test_a" of relation "atacc1" contains null values +alter table atacc1 drop constraint atacc1_constr_invalid; +-- with valid constraint +update atacc1 set test_a = 1; +alter table atacc1 add constraint atacc1_constr_a_valid check(test_a is not null); +alter table atacc1 alter test_a set not null; +delete from atacc1; +insert into atacc1 values (2, null); +alter table atacc1 alter test_a drop not null; +-- test multiple set not null at same time +-- test_a checked by atacc1_constr_a_valid, test_b should fail by table scan +alter table atacc1 alter test_a set not null, alter test_b set not null; +ERROR: column "test_b" of relation "atacc1" contains 
null values +-- commands order has no importance +alter table atacc1 alter test_b set not null, alter test_a set not null; +ERROR: column "test_b" of relation "atacc1" contains null values +-- valid one by table scan, one by check constraints +update atacc1 set test_b = 1; +alter table atacc1 alter test_b set not null, alter test_a set not null; +alter table atacc1 alter test_a drop not null, alter test_b drop not null; +-- both column has check constraints +alter table atacc1 add constraint atacc1_constr_b_valid check(test_b is not null); +alter table atacc1 alter test_b set not null, alter test_a set not null; +drop table atacc1; +-- test inheritance +create table parent (a int); +create table child (b varchar(255)) inherits (parent); +alter table parent alter a set not null; +insert into parent values (NULL); +ERROR: null value in column "a" of relation "parent" violates not-null constraint +DETAIL: Failing row contains (null). +insert into child (a, b) values (NULL, 'foo'); +ERROR: null value in column "a" of relation "child" violates not-null constraint +DETAIL: Failing row contains (null, foo). +alter table parent alter a drop not null; +insert into parent values (NULL); +insert into child (a, b) values (NULL, 'foo'); +alter table only parent alter a set not null; +ERROR: column "a" of relation "parent" contains null values +alter table child alter a set not null; +ERROR: column "a" of relation "child" contains null values +delete from parent; +alter table only parent alter a set not null; +insert into parent values (NULL); +ERROR: null value in column "a" of relation "parent" violates not-null constraint +DETAIL: Failing row contains (null). +alter table child alter a set not null; +insert into child (a, b) values (NULL, 'foo'); +ERROR: null value in column "a" of relation "child" violates not-null constraint +DETAIL: Failing row contains (null, foo). 
+delete from child; +alter table child alter a set not null; +insert into child (a, b) values (NULL, 'foo'); +ERROR: null value in column "a" of relation "child" violates not-null constraint +DETAIL: Failing row contains (null, foo). +drop table child; +drop table parent; +-- test setting and removing default values +create table def_test ( + c1 int4 default 5, + c2 text default 'initial_default' +); +insert into def_test default values; +alter table def_test alter column c1 drop default; +insert into def_test default values; +alter table def_test alter column c2 drop default; +insert into def_test default values; +alter table def_test alter column c1 set default 10; +alter table def_test alter column c2 set default 'new_default'; +insert into def_test default values; +select * from def_test; + c1 | c2 +----+----------------- + 5 | initial_default + | initial_default + | + 10 | new_default +(4 rows) + +-- set defaults to an incorrect type: this should fail +alter table def_test alter column c1 set default 'wrong_datatype'; +ERROR: invalid input syntax for type integer: "wrong_datatype" +alter table def_test alter column c2 set default 20; +-- set defaults on a non-existent column: this should fail +alter table def_test alter column c3 set default 30; +ERROR: column "c3" of relation "def_test" does not exist +-- set defaults on views: we need to create a view, add a rule +-- to allow insertions into it, and then alter the view to add +-- a default +create view def_view_test as select * from def_test; +create rule def_view_test_ins as + on insert to def_view_test + do instead insert into def_test select new.*; +insert into def_view_test default values; +alter table def_view_test alter column c1 set default 45; +insert into def_view_test default values; +alter table def_view_test alter column c2 set default 'view_default'; +insert into def_view_test default values; +select * from def_view_test; + c1 | c2 +----+----------------- + 5 | initial_default + | 
initial_default + | + 10 | new_default + | + 45 | + 45 | view_default +(7 rows) + +drop rule def_view_test_ins on def_view_test; +drop view def_view_test; +drop table def_test; +-- alter table / drop column tests +-- try altering system catalogs, should fail +alter table pg_class drop column relname; +ERROR: permission denied: "pg_class" is a system catalog +-- try altering non-existent table, should fail +alter table nosuchtable drop column bar; +ERROR: relation "nosuchtable" does not exist +-- test dropping columns +create table atacc1 (a int4 not null, b int4, c int4 not null, d int4); +insert into atacc1 values (1, 2, 3, 4); +alter table atacc1 drop a; +alter table atacc1 drop a; +ERROR: column "a" of relation "atacc1" does not exist +-- SELECTs +select * from atacc1; + b | c | d +---+---+--- + 2 | 3 | 4 +(1 row) + +select * from atacc1 order by a; +ERROR: column "a" does not exist +LINE 1: select * from atacc1 order by a; + ^ +select * from atacc1 order by "........pg.dropped.1........"; +ERROR: column "........pg.dropped.1........" does not exist +LINE 1: select * from atacc1 order by "........pg.dropped.1........"... + ^ +select * from atacc1 group by a; +ERROR: column "a" does not exist +LINE 1: select * from atacc1 group by a; + ^ +select * from atacc1 group by "........pg.dropped.1........"; +ERROR: column "........pg.dropped.1........" does not exist +LINE 1: select * from atacc1 group by "........pg.dropped.1........"... 
+ ^ +select atacc1.* from atacc1; + b | c | d +---+---+--- + 2 | 3 | 4 +(1 row) + +select a from atacc1; +ERROR: column "a" does not exist +LINE 1: select a from atacc1; + ^ +select atacc1.a from atacc1; +ERROR: column atacc1.a does not exist +LINE 1: select atacc1.a from atacc1; + ^ +select b,c,d from atacc1; + b | c | d +---+---+--- + 2 | 3 | 4 +(1 row) + +select a,b,c,d from atacc1; +ERROR: column "a" does not exist +LINE 1: select a,b,c,d from atacc1; + ^ +select * from atacc1 where a = 1; +ERROR: column "a" does not exist +LINE 1: select * from atacc1 where a = 1; + ^ +select "........pg.dropped.1........" from atacc1; +ERROR: column "........pg.dropped.1........" does not exist +LINE 1: select "........pg.dropped.1........" from atacc1; + ^ +select atacc1."........pg.dropped.1........" from atacc1; +ERROR: column atacc1.........pg.dropped.1........ does not exist +LINE 1: select atacc1."........pg.dropped.1........" from atacc1; + ^ +select "........pg.dropped.1........",b,c,d from atacc1; +ERROR: column "........pg.dropped.1........" does not exist +LINE 1: select "........pg.dropped.1........",b,c,d from atacc1; + ^ +select * from atacc1 where "........pg.dropped.1........" = 1; +ERROR: column "........pg.dropped.1........" does not exist +LINE 1: select * from atacc1 where "........pg.dropped.1........" = ... + ^ +-- UPDATEs +update atacc1 set a = 3; +ERROR: column "a" of relation "atacc1" does not exist +LINE 1: update atacc1 set a = 3; + ^ +update atacc1 set b = 2 where a = 3; +ERROR: column "a" does not exist +LINE 1: update atacc1 set b = 2 where a = 3; + ^ +update atacc1 set "........pg.dropped.1........" = 3; +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +LINE 1: update atacc1 set "........pg.dropped.1........" = 3; + ^ +update atacc1 set b = 2 where "........pg.dropped.1........" = 3; +ERROR: column "........pg.dropped.1........" 
does not exist +LINE 1: update atacc1 set b = 2 where "........pg.dropped.1........"... + ^ +-- INSERTs +insert into atacc1 values (10, 11, 12, 13); +ERROR: INSERT has more expressions than target columns +LINE 1: insert into atacc1 values (10, 11, 12, 13); + ^ +insert into atacc1 values (default, 11, 12, 13); +ERROR: INSERT has more expressions than target columns +LINE 1: insert into atacc1 values (default, 11, 12, 13); + ^ +insert into atacc1 values (11, 12, 13); +insert into atacc1 (a) values (10); +ERROR: column "a" of relation "atacc1" does not exist +LINE 1: insert into atacc1 (a) values (10); + ^ +insert into atacc1 (a) values (default); +ERROR: column "a" of relation "atacc1" does not exist +LINE 1: insert into atacc1 (a) values (default); + ^ +insert into atacc1 (a,b,c,d) values (10,11,12,13); +ERROR: column "a" of relation "atacc1" does not exist +LINE 1: insert into atacc1 (a,b,c,d) values (10,11,12,13); + ^ +insert into atacc1 (a,b,c,d) values (default,11,12,13); +ERROR: column "a" of relation "atacc1" does not exist +LINE 1: insert into atacc1 (a,b,c,d) values (default,11,12,13); + ^ +insert into atacc1 (b,c,d) values (11,12,13); +insert into atacc1 ("........pg.dropped.1........") values (10); +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +LINE 1: insert into atacc1 ("........pg.dropped.1........") values (... + ^ +insert into atacc1 ("........pg.dropped.1........") values (default); +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +LINE 1: insert into atacc1 ("........pg.dropped.1........") values (... + ^ +insert into atacc1 ("........pg.dropped.1........",b,c,d) values (10,11,12,13); +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +LINE 1: insert into atacc1 ("........pg.dropped.1........",b,c,d) va... + ^ +insert into atacc1 ("........pg.dropped.1........",b,c,d) values (default,11,12,13); +ERROR: column "........pg.dropped.1........" 
of relation "atacc1" does not exist +LINE 1: insert into atacc1 ("........pg.dropped.1........",b,c,d) va... + ^ +-- DELETEs +delete from atacc1 where a = 3; +ERROR: column "a" does not exist +LINE 1: delete from atacc1 where a = 3; + ^ +delete from atacc1 where "........pg.dropped.1........" = 3; +ERROR: column "........pg.dropped.1........" does not exist +LINE 1: delete from atacc1 where "........pg.dropped.1........" = 3; + ^ +delete from atacc1; +-- try dropping a non-existent column, should fail +alter table atacc1 drop bar; +ERROR: column "bar" of relation "atacc1" does not exist +-- try removing an oid column, should succeed (as it's nonexistent) +alter table atacc1 SET WITHOUT OIDS; +-- try adding an oid column, should fail (not supported) +alter table atacc1 SET WITH OIDS; +ERROR: syntax error at or near "WITH" +LINE 1: alter table atacc1 SET WITH OIDS; + ^ +-- try dropping the xmin column, should fail +alter table atacc1 drop xmin; +ERROR: cannot drop system column "xmin" +-- try creating a view and altering that, should fail +create view myview as select * from atacc1; +select * from myview; + b | c | d +---+---+--- +(0 rows) + +alter table myview drop d; +ERROR: "myview" is not a table, composite type, or foreign table +drop view myview; +-- test some commands to make sure they fail on the dropped column +analyze atacc1(a); +ERROR: column "a" of relation "atacc1" does not exist +analyze atacc1("........pg.dropped.1........"); +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +vacuum analyze atacc1(a); +ERROR: column "a" of relation "atacc1" does not exist +vacuum analyze atacc1("........pg.dropped.1........"); +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +comment on column atacc1.a is 'testing'; +ERROR: column "a" of relation "atacc1" does not exist +comment on column atacc1."........pg.dropped.1........" is 'testing'; +ERROR: column "........pg.dropped.1........" 
of relation "atacc1" does not exist +alter table atacc1 alter a set storage plain; +ERROR: column "a" of relation "atacc1" does not exist +alter table atacc1 alter "........pg.dropped.1........" set storage plain; +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +alter table atacc1 alter a set statistics 0; +ERROR: column "a" of relation "atacc1" does not exist +alter table atacc1 alter "........pg.dropped.1........" set statistics 0; +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +alter table atacc1 alter a set default 3; +ERROR: column "a" of relation "atacc1" does not exist +alter table atacc1 alter "........pg.dropped.1........" set default 3; +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +alter table atacc1 alter a drop default; +ERROR: column "a" of relation "atacc1" does not exist +alter table atacc1 alter "........pg.dropped.1........" drop default; +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +alter table atacc1 alter a set not null; +ERROR: column "a" of relation "atacc1" does not exist +alter table atacc1 alter "........pg.dropped.1........" set not null; +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +alter table atacc1 alter a drop not null; +ERROR: column "a" of relation "atacc1" does not exist +alter table atacc1 alter "........pg.dropped.1........" drop not null; +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +alter table atacc1 rename a to x; +ERROR: column "a" does not exist +alter table atacc1 rename "........pg.dropped.1........" to x; +ERROR: column "........pg.dropped.1........" does not exist +alter table atacc1 add primary key(a); +ERROR: column "a" of relation "atacc1" does not exist +alter table atacc1 add primary key("........pg.dropped.1........"); +ERROR: column "........pg.dropped.1........" 
of relation "atacc1" does not exist +alter table atacc1 add unique(a); +ERROR: column "a" named in key does not exist +alter table atacc1 add unique("........pg.dropped.1........"); +ERROR: column "........pg.dropped.1........" named in key does not exist +alter table atacc1 add check (a > 3); +ERROR: column "a" does not exist +alter table atacc1 add check ("........pg.dropped.1........" > 3); +ERROR: column "........pg.dropped.1........" does not exist +create table atacc2 (id int4 unique); +alter table atacc1 add foreign key (a) references atacc2(id); +ERROR: column "a" referenced in foreign key constraint does not exist +alter table atacc1 add foreign key ("........pg.dropped.1........") references atacc2(id); +ERROR: column "........pg.dropped.1........" referenced in foreign key constraint does not exist +alter table atacc2 add foreign key (id) references atacc1(a); +ERROR: column "a" referenced in foreign key constraint does not exist +alter table atacc2 add foreign key (id) references atacc1("........pg.dropped.1........"); +ERROR: column "........pg.dropped.1........" referenced in foreign key constraint does not exist +drop table atacc2; +create index "testing_idx" on atacc1(a); +ERROR: column "a" does not exist +create index "testing_idx" on atacc1("........pg.dropped.1........"); +ERROR: column "........pg.dropped.1........" 
does not exist +-- test create as and select into +insert into atacc1 values (21, 22, 23); +create table attest1 as select * from atacc1; +select * from attest1; + b | c | d +----+----+---- + 21 | 22 | 23 +(1 row) + +drop table attest1; +select * into attest2 from atacc1; +select * from attest2; + b | c | d +----+----+---- + 21 | 22 | 23 +(1 row) + +drop table attest2; +-- try dropping all columns +alter table atacc1 drop c; +alter table atacc1 drop d; +alter table atacc1 drop b; +select * from atacc1; +-- +(1 row) + +drop table atacc1; +-- test constraint error reporting in presence of dropped columns +create table atacc1 (id serial primary key, value int check (value < 10)); +insert into atacc1(value) values (100); +ERROR: new row for relation "atacc1" violates check constraint "atacc1_value_check" +DETAIL: Failing row contains (1, 100). +alter table atacc1 drop column value; +alter table atacc1 add column value int check (value < 10); +insert into atacc1(value) values (100); +ERROR: new row for relation "atacc1" violates check constraint "atacc1_value_check" +DETAIL: Failing row contains (2, 100). +insert into atacc1(id, value) values (null, 0); +ERROR: null value in column "id" of relation "atacc1" violates not-null constraint +DETAIL: Failing row contains (null, 0). 
+drop table atacc1; +-- test inheritance +create table parent (a int, b int, c int); +insert into parent values (1, 2, 3); +alter table parent drop a; +create table child (d varchar(255)) inherits (parent); +insert into child values (12, 13, 'testing'); +select * from parent; + b | c +----+---- + 2 | 3 + 12 | 13 +(2 rows) + +select * from child; + b | c | d +----+----+--------- + 12 | 13 | testing +(1 row) + +alter table parent drop c; +select * from parent; + b +---- + 2 + 12 +(2 rows) + +select * from child; + b | d +----+--------- + 12 | testing +(1 row) + +drop table child; +drop table parent; +-- check error cases for inheritance column merging +create table parent (a float8, b numeric(10,4), c text collate "C"); +create table child (a float4) inherits (parent); -- fail +NOTICE: merging column "a" with inherited definition +ERROR: column "a" has a type conflict +DETAIL: double precision versus real +create table child (b decimal(10,7)) inherits (parent); -- fail +NOTICE: moving and merging column "b" with inherited definition +DETAIL: User-specified column moved to the position of the inherited column. +ERROR: column "b" has a type conflict +DETAIL: numeric(10,4) versus numeric(10,7) +create table child (c text collate "POSIX") inherits (parent); -- fail +NOTICE: moving and merging column "c" with inherited definition +DETAIL: User-specified column moved to the position of the inherited column. 
+ERROR: column "c" has a collation conflict +DETAIL: "C" versus "POSIX" +create table child (a double precision, b decimal(10,4)) inherits (parent); +NOTICE: merging column "a" with inherited definition +NOTICE: merging column "b" with inherited definition +drop table child; +drop table parent; +-- test copy in/out +create table attest (a int4, b int4, c int4); +insert into attest values (1,2,3); +alter table attest drop a; +copy attest to stdout; +2 3 +copy attest(a) to stdout; +ERROR: column "a" of relation "attest" does not exist +copy attest("........pg.dropped.1........") to stdout; +ERROR: column "........pg.dropped.1........" of relation "attest" does not exist +copy attest from stdin; +ERROR: extra data after last expected column +CONTEXT: COPY attest, line 1: "10 11 12" +select * from attest; + b | c +---+--- + 2 | 3 +(1 row) + +copy attest from stdin; +select * from attest; + b | c +----+---- + 2 | 3 + 21 | 22 +(2 rows) + +copy attest(a) from stdin; +ERROR: column "a" of relation "attest" does not exist +copy attest("........pg.dropped.1........") from stdin; +ERROR: column "........pg.dropped.1........" 
of relation "attest" does not exist +copy attest(b,c) from stdin; +select * from attest; + b | c +----+---- + 2 | 3 + 21 | 22 + 31 | 32 +(3 rows) + +drop table attest; +-- test inheritance +create table dropColumn (a int, b int, e int); +create table dropColumnChild (c int) inherits (dropColumn); +create table dropColumnAnother (d int) inherits (dropColumnChild); +-- these two should fail +alter table dropColumnchild drop column a; +ERROR: cannot drop inherited column "a" +alter table only dropColumnChild drop column b; +ERROR: cannot drop inherited column "b" +-- these three should work +alter table only dropColumn drop column e; +alter table dropColumnChild drop column c; +alter table dropColumn drop column a; +create table renameColumn (a int); +create table renameColumnChild (b int) inherits (renameColumn); +create table renameColumnAnother (c int) inherits (renameColumnChild); +-- these three should fail +alter table renameColumnChild rename column a to d; +ERROR: cannot rename inherited column "a" +alter table only renameColumnChild rename column a to d; +ERROR: inherited column "a" must be renamed in child tables too +alter table only renameColumn rename column a to d; +ERROR: inherited column "a" must be renamed in child tables too +-- these should work +alter table renameColumn rename column a to d; +alter table renameColumnChild rename column b to a; +-- these should work +alter table if exists doesnt_exist_tab rename column a to d; +NOTICE: relation "doesnt_exist_tab" does not exist, skipping +alter table if exists doesnt_exist_tab rename column b to a; +NOTICE: relation "doesnt_exist_tab" does not exist, skipping +-- this should work +alter table renameColumn add column w int; +-- this should fail +alter table only renameColumn add column x int; +ERROR: column must be added to child tables too +-- Test corner cases in dropping of inherited columns +create table p1 (f1 int, f2 int); +create table c1 (f1 int not null) inherits(p1); +NOTICE: merging column 
"f1" with inherited definition +-- should be rejected since c1.f1 is inherited +alter table c1 drop column f1; +ERROR: cannot drop inherited column "f1" +-- should work +alter table p1 drop column f1; +-- c1.f1 is still there, but no longer inherited +select f1 from c1; + f1 +---- +(0 rows) + +alter table c1 drop column f1; +select f1 from c1; +ERROR: column "f1" does not exist +LINE 1: select f1 from c1; + ^ +HINT: Perhaps you meant to reference the column "c1.f2". +drop table p1 cascade; +NOTICE: drop cascades to table c1 +create table p1 (f1 int, f2 int); +create table c1 () inherits(p1); +-- should be rejected since c1.f1 is inherited +alter table c1 drop column f1; +ERROR: cannot drop inherited column "f1" +alter table p1 drop column f1; +-- c1.f1 is dropped now, since there is no local definition for it +select f1 from c1; +ERROR: column "f1" does not exist +LINE 1: select f1 from c1; + ^ +HINT: Perhaps you meant to reference the column "c1.f2". +drop table p1 cascade; +NOTICE: drop cascades to table c1 +create table p1 (f1 int, f2 int); +create table c1 () inherits(p1); +-- should be rejected since c1.f1 is inherited +alter table c1 drop column f1; +ERROR: cannot drop inherited column "f1" +alter table only p1 drop column f1; +-- c1.f1 is NOT dropped, but must now be considered non-inherited +alter table c1 drop column f1; +drop table p1 cascade; +NOTICE: drop cascades to table c1 +create table p1 (f1 int, f2 int); +create table c1 (f1 int not null) inherits(p1); +NOTICE: merging column "f1" with inherited definition +-- should be rejected since c1.f1 is inherited +alter table c1 drop column f1; +ERROR: cannot drop inherited column "f1" +alter table only p1 drop column f1; +-- c1.f1 is still there, but no longer inherited +alter table c1 drop column f1; +drop table p1 cascade; +NOTICE: drop cascades to table c1 +create table p1(id int, name text); +create table p2(id2 int, name text, height int); +create table c1(age int) inherits(p1,p2); +NOTICE: merging 
multiple inherited definitions of column "name" +create table gc1() inherits (c1); +select relname, attname, attinhcount, attislocal +from pg_class join pg_attribute on (pg_class.oid = pg_attribute.attrelid) +where relname in ('p1','p2','c1','gc1') and attnum > 0 and not attisdropped +order by relname, attnum; + relname | attname | attinhcount | attislocal +---------+---------+-------------+------------ + c1 | id | 1 | f + c1 | name | 2 | f + c1 | id2 | 1 | f + c1 | height | 1 | f + c1 | age | 0 | t + gc1 | id | 1 | f + gc1 | name | 1 | f + gc1 | id2 | 1 | f + gc1 | height | 1 | f + gc1 | age | 1 | f + p1 | id | 0 | t + p1 | name | 0 | t + p2 | id2 | 0 | t + p2 | name | 0 | t + p2 | height | 0 | t +(15 rows) + +-- should work +alter table only p1 drop column name; +-- should work. Now c1.name is local and inhcount is 0. +alter table p2 drop column name; +-- should be rejected since its inherited +alter table gc1 drop column name; +ERROR: cannot drop inherited column "name" +-- should work, and drop gc1.name along +alter table c1 drop column name; +-- should fail: column does not exist +alter table gc1 drop column name; +ERROR: column "name" of relation "gc1" does not exist +-- should work and drop the attribute in all tables +alter table p2 drop column height; +-- IF EXISTS test +create table dropColumnExists (); +alter table dropColumnExists drop column non_existing; --fail +ERROR: column "non_existing" of relation "dropcolumnexists" does not exist +alter table dropColumnExists drop column if exists non_existing; --succeed +NOTICE: column "non_existing" of relation "dropcolumnexists" does not exist, skipping +select relname, attname, attinhcount, attislocal +from pg_class join pg_attribute on (pg_class.oid = pg_attribute.attrelid) +where relname in ('p1','p2','c1','gc1') and attnum > 0 and not attisdropped +order by relname, attnum; + relname | attname | attinhcount | attislocal +---------+---------+-------------+------------ + c1 | id | 1 | f + c1 | id2 | 1 | f + 
c1 | age | 0 | t + gc1 | id | 1 | f + gc1 | id2 | 1 | f + gc1 | age | 1 | f + p1 | id | 0 | t + p2 | id2 | 0 | t +(8 rows) + +drop table p1, p2 cascade; +NOTICE: drop cascades to 2 other objects +DETAIL: drop cascades to table c1 +drop cascades to table gc1 +-- test attinhcount tracking with merged columns +create table depth0(); +create table depth1(c text) inherits (depth0); +create table depth2() inherits (depth1); +alter table depth0 add c text; +NOTICE: merging definition of column "c" for child "depth1" +select attrelid::regclass, attname, attinhcount, attislocal +from pg_attribute +where attnum > 0 and attrelid::regclass in ('depth0', 'depth1', 'depth2') +order by attrelid::regclass::text, attnum; + attrelid | attname | attinhcount | attislocal +----------+---------+-------------+------------ + depth0 | c | 0 | t + depth1 | c | 1 | t + depth2 | c | 1 | f +(3 rows) + +-- test renumbering of child-table columns in inherited operations +create table p1 (f1 int); +create table c1 (f2 text, f3 int) inherits (p1); +alter table p1 add column a1 int check (a1 > 0); +alter table p1 add column f2 text; +NOTICE: merging definition of column "f2" for child "c1" +insert into p1 values (1,2,'abc'); +insert into c1 values(11,'xyz',33,0); -- should fail +ERROR: new row for relation "c1" violates check constraint "p1_a1_check" +DETAIL: Failing row contains (11, xyz, 33, 0). 
+insert into c1 values(11,'xyz',33,22); +select * from p1; + f1 | a1 | f2 +----+----+----- + 1 | 2 | abc + 11 | 22 | xyz +(2 rows) + +update p1 set a1 = a1 + 1, f2 = upper(f2); +select * from p1; + f1 | a1 | f2 +----+----+----- + 1 | 3 | ABC + 11 | 23 | XYZ +(2 rows) + +drop table p1 cascade; +NOTICE: drop cascades to table c1 +-- test that operations with a dropped column do not try to reference +-- its datatype +create domain mytype as text; +create temp table foo (f1 text, f2 mytype, f3 text); +insert into foo values('bb','cc','dd'); +select * from foo; + f1 | f2 | f3 +----+----+---- + bb | cc | dd +(1 row) + +drop domain mytype cascade; +NOTICE: drop cascades to column f2 of table foo +select * from foo; + f1 | f3 +----+---- + bb | dd +(1 row) + +insert into foo values('qq','rr'); +select * from foo; + f1 | f3 +----+---- + bb | dd + qq | rr +(2 rows) + +update foo set f3 = 'zz'; +select * from foo; + f1 | f3 +----+---- + bb | zz + qq | zz +(2 rows) + +select f3,max(f1) from foo group by f3; + f3 | max +----+----- + zz | qq +(1 row) + +-- Simple tests for alter table column type +alter table foo alter f1 TYPE integer; -- fails +ERROR: column "f1" cannot be cast automatically to type integer +HINT: You might need to specify "USING f1::integer". +alter table foo alter f1 TYPE varchar(10); +create table anothertab (atcol1 serial8, atcol2 boolean, + constraint anothertab_chk check (atcol1 <= 3)); +insert into anothertab (atcol1, atcol2) values (default, true); +insert into anothertab (atcol1, atcol2) values (default, false); +select * from anothertab; + atcol1 | atcol2 +--------+-------- + 1 | t + 2 | f +(2 rows) + +alter table anothertab alter column atcol1 type boolean; -- fails +ERROR: column "atcol1" cannot be cast automatically to type boolean +HINT: You might need to specify "USING atcol1::boolean". 
+alter table anothertab alter column atcol1 type boolean using atcol1::int; -- fails +ERROR: result of USING clause for column "atcol1" cannot be cast automatically to type boolean +HINT: You might need to add an explicit cast. +alter table anothertab alter column atcol1 type integer; +select * from anothertab; + atcol1 | atcol2 +--------+-------- + 1 | t + 2 | f +(2 rows) + +insert into anothertab (atcol1, atcol2) values (45, null); -- fails +ERROR: new row for relation "anothertab" violates check constraint "anothertab_chk" +DETAIL: Failing row contains (45, null). +insert into anothertab (atcol1, atcol2) values (default, null); +select * from anothertab; + atcol1 | atcol2 +--------+-------- + 1 | t + 2 | f + 3 | +(3 rows) + +alter table anothertab alter column atcol2 type text + using case when atcol2 is true then 'IT WAS TRUE' + when atcol2 is false then 'IT WAS FALSE' + else 'IT WAS NULL!' end; +select * from anothertab; + atcol1 | atcol2 +--------+-------------- + 1 | IT WAS TRUE + 2 | IT WAS FALSE + 3 | IT WAS NULL! +(3 rows) + +alter table anothertab alter column atcol1 type boolean + using case when atcol1 % 2 = 0 then true else false end; -- fails +ERROR: default for column "atcol1" cannot be cast automatically to type boolean +alter table anothertab alter column atcol1 drop default; +alter table anothertab alter column atcol1 type boolean + using case when atcol1 % 2 = 0 then true else false end; -- fails +ERROR: operator does not exist: boolean <= integer +HINT: No operator matches the given name and argument types. You might need to add explicit type casts. 
+alter table anothertab drop constraint anothertab_chk; +alter table anothertab drop constraint anothertab_chk; -- fails +ERROR: constraint "anothertab_chk" of relation "anothertab" does not exist +alter table anothertab drop constraint IF EXISTS anothertab_chk; -- succeeds +NOTICE: constraint "anothertab_chk" of relation "anothertab" does not exist, skipping +alter table anothertab alter column atcol1 type boolean + using case when atcol1 % 2 = 0 then true else false end; +select * from anothertab; + atcol1 | atcol2 +--------+-------------- + f | IT WAS TRUE + t | IT WAS FALSE + f | IT WAS NULL! +(3 rows) + +drop table anothertab; +-- Test index handling in alter table column type (cf. bugs #15835, #15865) +create table anothertab(f1 int primary key, f2 int unique, + f3 int, f4 int, f5 int); +alter table anothertab + add exclude using btree (f3 with =); +alter table anothertab + add exclude using btree (f4 with =) where (f4 is not null); +alter table anothertab + add exclude using btree (f4 with =) where (f5 > 0); +alter table anothertab + add unique(f1,f4); +create index on anothertab(f2,f3); +create unique index on anothertab(f4); +\d anothertab + Table "public.anothertab" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + f1 | integer | | not null | + f2 | integer | | | + f3 | integer | | | + f4 | integer | | | + f5 | integer | | | +Indexes: + "anothertab_pkey" PRIMARY KEY, btree (f1) + "anothertab_f1_f4_key" UNIQUE CONSTRAINT, btree (f1, f4) + "anothertab_f2_f3_idx" btree (f2, f3) + "anothertab_f2_key" UNIQUE CONSTRAINT, btree (f2) + "anothertab_f3_excl" EXCLUDE USING btree (f3 WITH =) + "anothertab_f4_excl" EXCLUDE USING btree (f4 WITH =) WHERE (f4 IS NOT NULL) + "anothertab_f4_excl1" EXCLUDE USING btree (f4 WITH =) WHERE (f5 > 0) + "anothertab_f4_idx" UNIQUE, btree (f4) + +alter table anothertab alter column f1 type bigint; +alter table anothertab + alter column f2 type bigint, + alter column f3 type 
bigint, + alter column f4 type bigint; +alter table anothertab alter column f5 type bigint; +\d anothertab + Table "public.anothertab" + Column | Type | Collation | Nullable | Default +--------+--------+-----------+----------+--------- + f1 | bigint | | not null | + f2 | bigint | | | + f3 | bigint | | | + f4 | bigint | | | + f5 | bigint | | | +Indexes: + "anothertab_pkey" PRIMARY KEY, btree (f1) + "anothertab_f1_f4_key" UNIQUE CONSTRAINT, btree (f1, f4) + "anothertab_f2_f3_idx" btree (f2, f3) + "anothertab_f2_key" UNIQUE CONSTRAINT, btree (f2) + "anothertab_f3_excl" EXCLUDE USING btree (f3 WITH =) + "anothertab_f4_excl" EXCLUDE USING btree (f4 WITH =) WHERE (f4 IS NOT NULL) + "anothertab_f4_excl1" EXCLUDE USING btree (f4 WITH =) WHERE (f5 > 0) + "anothertab_f4_idx" UNIQUE, btree (f4) + +drop table anothertab; +-- test that USING expressions are parsed before column alter type / drop steps +create table another (f1 int, f2 text, f3 text); +insert into another values(1, 'one', 'uno'); +insert into another values(2, 'two', 'due'); +insert into another values(3, 'three', 'tre'); +select * from another; + f1 | f2 | f3 +----+-------+----- + 1 | one | uno + 2 | two | due + 3 | three | tre +(3 rows) + +alter table another + alter f1 type text using f2 || ' and ' || f3 || ' more', + alter f2 type bigint using f1 * 10, + drop column f3; +select * from another; + f1 | f2 +--------------------+---- + one and uno more | 10 + two and due more | 20 + three and tre more | 30 +(3 rows) + +drop table another; +-- Create an index that skips WAL, then perform a SET DATA TYPE that skips +-- rewriting the index. 
+begin; +create table skip_wal_skip_rewrite_index (c varchar(10) primary key); +alter table skip_wal_skip_rewrite_index alter c type varchar(20); +commit; +-- table's row type +create table tab1 (a int, b text); +create table tab2 (x int, y tab1); +alter table tab1 alter column b type varchar; -- fails +ERROR: cannot alter table "tab1" because column "tab2.y" uses its row type +-- Alter column type that's part of a partitioned index +create table at_partitioned (a int, b text) partition by range (a); +create table at_part_1 partition of at_partitioned for values from (0) to (1000); +insert into at_partitioned values (512, '0.123'); +create table at_part_2 (b text, a int); +insert into at_part_2 values ('1.234', 1024); +create index on at_partitioned (b); +create index on at_partitioned (a); +\d at_part_1 + Table "public.at_part_1" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | text | | | +Partition of: at_partitioned FOR VALUES FROM (0) TO (1000) +Indexes: + "at_part_1_a_idx" btree (a) + "at_part_1_b_idx" btree (b) + +\d at_part_2 + Table "public.at_part_2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + b | text | | | + a | integer | | | + +alter table at_partitioned attach partition at_part_2 for values from (1000) to (2000); +\d at_part_2 + Table "public.at_part_2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + b | text | | | + a | integer | | | +Partition of: at_partitioned FOR VALUES FROM (1000) TO (2000) +Indexes: + "at_part_2_a_idx" btree (a) + "at_part_2_b_idx" btree (b) + +alter table at_partitioned alter column b type numeric using b::numeric; +\d at_part_1 + Table "public.at_part_1" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | numeric | | | +Partition of: at_partitioned FOR VALUES 
FROM (0) TO (1000) +Indexes: + "at_part_1_a_idx" btree (a) + "at_part_1_b_idx" btree (b) + +\d at_part_2 + Table "public.at_part_2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + b | numeric | | | + a | integer | | | +Partition of: at_partitioned FOR VALUES FROM (1000) TO (2000) +Indexes: + "at_part_2_a_idx" btree (a) + "at_part_2_b_idx" btree (b) + +drop table at_partitioned; +-- Alter column type when no table rewrite is required +-- Also check that comments are preserved +create table at_partitioned(id int, name varchar(64), unique (id, name)) + partition by hash(id); +comment on constraint at_partitioned_id_name_key on at_partitioned is 'parent constraint'; +comment on index at_partitioned_id_name_key is 'parent index'; +create table at_partitioned_0 partition of at_partitioned + for values with (modulus 2, remainder 0); +comment on constraint at_partitioned_0_id_name_key on at_partitioned_0 is 'child 0 constraint'; +comment on index at_partitioned_0_id_name_key is 'child 0 index'; +create table at_partitioned_1 partition of at_partitioned + for values with (modulus 2, remainder 1); +comment on constraint at_partitioned_1_id_name_key on at_partitioned_1 is 'child 1 constraint'; +comment on index at_partitioned_1_id_name_key is 'child 1 index'; +insert into at_partitioned values(1, 'foo'); +insert into at_partitioned values(3, 'bar'); +create temp table old_oids as + select relname, oid as oldoid, relfilenode as oldfilenode + from pg_class where relname like 'at_partitioned%'; +select relname, + c.oid = oldoid as orig_oid, + case relfilenode + when 0 then 'none' + when c.oid then 'own' + when oldfilenode then 'orig' + else 'OTHER' + end as storage, + obj_description(c.oid, 'pg_class') as desc + from pg_class c left join old_oids using (relname) + where relname like 'at_partitioned%' + order by relname; + relname | orig_oid | storage | desc 
+------------------------------+----------+---------+--------------- + at_partitioned | t | none | + at_partitioned_0 | t | own | + at_partitioned_0_id_name_key | t | own | child 0 index + at_partitioned_1 | t | own | + at_partitioned_1_id_name_key | t | own | child 1 index + at_partitioned_id_name_key | t | none | parent index +(6 rows) + +select conname, obj_description(oid, 'pg_constraint') as desc + from pg_constraint where conname like 'at_partitioned%' + order by conname; + conname | desc +------------------------------+-------------------- + at_partitioned_0_id_name_key | child 0 constraint + at_partitioned_1_id_name_key | child 1 constraint + at_partitioned_id_name_key | parent constraint +(3 rows) + +alter table at_partitioned alter column name type varchar(127); +-- Note: these tests currently show the wrong behavior for comments :-( +select relname, + c.oid = oldoid as orig_oid, + case relfilenode + when 0 then 'none' + when c.oid then 'own' + when oldfilenode then 'orig' + else 'OTHER' + end as storage, + obj_description(c.oid, 'pg_class') as desc + from pg_class c left join old_oids using (relname) + where relname like 'at_partitioned%' + order by relname; + relname | orig_oid | storage | desc +------------------------------+----------+---------+-------------- + at_partitioned | t | none | + at_partitioned_0 | t | own | + at_partitioned_0_id_name_key | f | own | parent index + at_partitioned_1 | t | own | + at_partitioned_1_id_name_key | f | own | parent index + at_partitioned_id_name_key | f | none | parent index +(6 rows) + +select conname, obj_description(oid, 'pg_constraint') as desc + from pg_constraint where conname like 'at_partitioned%' + order by conname; + conname | desc +------------------------------+------------------- + at_partitioned_0_id_name_key | + at_partitioned_1_id_name_key | + at_partitioned_id_name_key | parent constraint +(3 rows) + +-- Don't remove this DROP, it exposes bug #15672 +drop table at_partitioned; +-- disallow 
recursive containment of row types +create temp table recur1 (f1 int); +alter table recur1 add column f2 recur1; -- fails +ERROR: composite type recur1 cannot be made a member of itself +alter table recur1 add column f2 recur1[]; -- fails +ERROR: composite type recur1 cannot be made a member of itself +create domain array_of_recur1 as recur1[]; +alter table recur1 add column f2 array_of_recur1; -- fails +ERROR: composite type recur1 cannot be made a member of itself +create temp table recur2 (f1 int, f2 recur1); +alter table recur1 add column f2 recur2; -- fails +ERROR: composite type recur1 cannot be made a member of itself +alter table recur1 add column f2 int; +alter table recur1 alter column f2 type recur2; -- fails +ERROR: composite type recur1 cannot be made a member of itself +-- SET STORAGE may need to add a TOAST table +create table test_storage (a text); +alter table test_storage alter a set storage plain; +alter table test_storage add b int default 0; -- rewrite table to remove its TOAST table +alter table test_storage alter a set storage extended; -- re-add TOAST table +select reltoastrelid <> 0 as has_toast_table +from pg_class +where oid = 'test_storage'::regclass; + has_toast_table +----------------- + t +(1 row) + +-- test that SET STORAGE propagates to index correctly +create index test_storage_idx on test_storage (b, a); +alter table test_storage alter column a set storage external; +\d+ test_storage + Table "public.test_storage" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+----------+--------------+------------- + a | text | | | | external | | + b | integer | | | 0 | plain | | +Indexes: + "test_storage_idx" btree (b, a) + +\d+ test_storage_idx + Index "public.test_storage_idx" + Column | Type | Key? 
| Definition | Storage | Stats target +--------+---------+------+------------+----------+-------------- + b | integer | yes | b | plain | + a | text | yes | a | external | +btree, for table "public.test_storage" + +-- ALTER COLUMN TYPE with a check constraint and a child table (bug #13779) +CREATE TABLE test_inh_check (a float check (a > 10.2), b float); +CREATE TABLE test_inh_check_child() INHERITS(test_inh_check); +\d test_inh_check + Table "public.test_inh_check" + Column | Type | Collation | Nullable | Default +--------+------------------+-----------+----------+--------- + a | double precision | | | + b | double precision | | | +Check constraints: + "test_inh_check_a_check" CHECK (a > 10.2::double precision) +Number of child tables: 1 (Use \d+ to list them.) + +\d test_inh_check_child + Table "public.test_inh_check_child" + Column | Type | Collation | Nullable | Default +--------+------------------+-----------+----------+--------- + a | double precision | | | + b | double precision | | | +Check constraints: + "test_inh_check_a_check" CHECK (a > 10.2::double precision) +Inherits: test_inh_check + +select relname, conname, coninhcount, conislocal, connoinherit + from pg_constraint c, pg_class r + where relname like 'test_inh_check%' and c.conrelid = r.oid + order by 1, 2; + relname | conname | coninhcount | conislocal | connoinherit +----------------------+------------------------+-------------+------------+-------------- + test_inh_check | test_inh_check_a_check | 0 | t | f + test_inh_check_child | test_inh_check_a_check | 1 | f | f +(2 rows) + +ALTER TABLE test_inh_check ALTER COLUMN a TYPE numeric; +\d test_inh_check + Table "public.test_inh_check" + Column | Type | Collation | Nullable | Default +--------+------------------+-----------+----------+--------- + a | numeric | | | + b | double precision | | | +Check constraints: + "test_inh_check_a_check" CHECK (a::double precision > 10.2::double precision) +Number of child tables: 1 (Use \d+ to list them.) 
+ +\d test_inh_check_child + Table "public.test_inh_check_child" + Column | Type | Collation | Nullable | Default +--------+------------------+-----------+----------+--------- + a | numeric | | | + b | double precision | | | +Check constraints: + "test_inh_check_a_check" CHECK (a::double precision > 10.2::double precision) +Inherits: test_inh_check + +select relname, conname, coninhcount, conislocal, connoinherit + from pg_constraint c, pg_class r + where relname like 'test_inh_check%' and c.conrelid = r.oid + order by 1, 2; + relname | conname | coninhcount | conislocal | connoinherit +----------------------+------------------------+-------------+------------+-------------- + test_inh_check | test_inh_check_a_check | 0 | t | f + test_inh_check_child | test_inh_check_a_check | 1 | f | f +(2 rows) + +-- also try noinherit, local, and local+inherited cases +ALTER TABLE test_inh_check ADD CONSTRAINT bnoinherit CHECK (b > 100) NO INHERIT; +ALTER TABLE test_inh_check_child ADD CONSTRAINT blocal CHECK (b < 1000); +ALTER TABLE test_inh_check_child ADD CONSTRAINT bmerged CHECK (b > 1); +ALTER TABLE test_inh_check ADD CONSTRAINT bmerged CHECK (b > 1); +NOTICE: merging constraint "bmerged" with inherited definition +\d test_inh_check + Table "public.test_inh_check" + Column | Type | Collation | Nullable | Default +--------+------------------+-----------+----------+--------- + a | numeric | | | + b | double precision | | | +Check constraints: + "bmerged" CHECK (b > 1::double precision) + "bnoinherit" CHECK (b > 100::double precision) NO INHERIT + "test_inh_check_a_check" CHECK (a::double precision > 10.2::double precision) +Number of child tables: 1 (Use \d+ to list them.) 
+ +\d test_inh_check_child + Table "public.test_inh_check_child" + Column | Type | Collation | Nullable | Default +--------+------------------+-----------+----------+--------- + a | numeric | | | + b | double precision | | | +Check constraints: + "blocal" CHECK (b < 1000::double precision) + "bmerged" CHECK (b > 1::double precision) + "test_inh_check_a_check" CHECK (a::double precision > 10.2::double precision) +Inherits: test_inh_check + +select relname, conname, coninhcount, conislocal, connoinherit + from pg_constraint c, pg_class r + where relname like 'test_inh_check%' and c.conrelid = r.oid + order by 1, 2; + relname | conname | coninhcount | conislocal | connoinherit +----------------------+------------------------+-------------+------------+-------------- + test_inh_check | bmerged | 0 | t | f + test_inh_check | bnoinherit | 0 | t | t + test_inh_check | test_inh_check_a_check | 0 | t | f + test_inh_check_child | blocal | 0 | t | f + test_inh_check_child | bmerged | 1 | t | f + test_inh_check_child | test_inh_check_a_check | 1 | f | f +(6 rows) + +ALTER TABLE test_inh_check ALTER COLUMN b TYPE numeric; +NOTICE: merging constraint "bmerged" with inherited definition +\d test_inh_check + Table "public.test_inh_check" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | numeric | | | + b | numeric | | | +Check constraints: + "bmerged" CHECK (b::double precision > 1::double precision) + "bnoinherit" CHECK (b::double precision > 100::double precision) NO INHERIT + "test_inh_check_a_check" CHECK (a::double precision > 10.2::double precision) +Number of child tables: 1 (Use \d+ to list them.) 
+ +\d test_inh_check_child + Table "public.test_inh_check_child" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | numeric | | | + b | numeric | | | +Check constraints: + "blocal" CHECK (b::double precision < 1000::double precision) + "bmerged" CHECK (b::double precision > 1::double precision) + "test_inh_check_a_check" CHECK (a::double precision > 10.2::double precision) +Inherits: test_inh_check + +select relname, conname, coninhcount, conislocal, connoinherit + from pg_constraint c, pg_class r + where relname like 'test_inh_check%' and c.conrelid = r.oid + order by 1, 2; + relname | conname | coninhcount | conislocal | connoinherit +----------------------+------------------------+-------------+------------+-------------- + test_inh_check | bmerged | 0 | t | f + test_inh_check | bnoinherit | 0 | t | t + test_inh_check | test_inh_check_a_check | 0 | t | f + test_inh_check_child | blocal | 0 | t | f + test_inh_check_child | bmerged | 1 | t | f + test_inh_check_child | test_inh_check_a_check | 1 | f | f +(6 rows) + +-- ALTER COLUMN TYPE with different schema in children +-- Bug at https://postgr.es/m/20170102225618.GA10071@telsasoft.com +CREATE TABLE test_type_diff (f1 int); +CREATE TABLE test_type_diff_c (extra smallint) INHERITS (test_type_diff); +ALTER TABLE test_type_diff ADD COLUMN f2 int; +INSERT INTO test_type_diff_c VALUES (1, 2, 3); +ALTER TABLE test_type_diff ALTER COLUMN f2 TYPE bigint USING f2::bigint; +CREATE TABLE test_type_diff2 (int_two int2, int_four int4, int_eight int8); +CREATE TABLE test_type_diff2_c1 (int_four int4, int_eight int8, int_two int2); +CREATE TABLE test_type_diff2_c2 (int_eight int8, int_two int2, int_four int4); +CREATE TABLE test_type_diff2_c3 (int_two int2, int_four int4, int_eight int8); +ALTER TABLE test_type_diff2_c1 INHERIT test_type_diff2; +ALTER TABLE test_type_diff2_c2 INHERIT test_type_diff2; +ALTER TABLE test_type_diff2_c3 INHERIT test_type_diff2; +INSERT INTO 
test_type_diff2_c1 VALUES (1, 2, 3); +INSERT INTO test_type_diff2_c2 VALUES (4, 5, 6); +INSERT INTO test_type_diff2_c3 VALUES (7, 8, 9); +ALTER TABLE test_type_diff2 ALTER COLUMN int_four TYPE int8 USING int_four::int8; +-- whole-row references are disallowed +ALTER TABLE test_type_diff2 ALTER COLUMN int_four TYPE int4 USING (pg_column_size(test_type_diff2)); +ERROR: cannot convert whole-row table reference +DETAIL: USING expression contains a whole-row table reference. +-- check for rollback of ANALYZE corrupting table property flags (bug #11638) +CREATE TABLE check_fk_presence_1 (id int PRIMARY KEY, t text); +CREATE TABLE check_fk_presence_2 (id int REFERENCES check_fk_presence_1, t text); +BEGIN; +ALTER TABLE check_fk_presence_2 DROP CONSTRAINT check_fk_presence_2_id_fkey; +ANALYZE check_fk_presence_2; +ROLLBACK; +\d check_fk_presence_2 + Table "public.check_fk_presence_2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + id | integer | | | + t | text | | | +Foreign-key constraints: + "check_fk_presence_2_id_fkey" FOREIGN KEY (id) REFERENCES check_fk_presence_1(id) + +DROP TABLE check_fk_presence_1, check_fk_presence_2; +-- check column addition within a view (bug #14876) +create table at_base_table(id int, stuff text); +insert into at_base_table values (23, 'skidoo'); +create view at_view_1 as select * from at_base_table bt; +create view at_view_2 as select *, to_json(v1) as j from at_view_1 v1; +\d+ at_view_1 + View "public.at_view_1" + Column | Type | Collation | Nullable | Default | Storage | Description +--------+---------+-----------+----------+---------+----------+------------- + id | integer | | | | plain | + stuff | text | | | | extended | +View definition: + SELECT bt.id, + bt.stuff + FROM at_base_table bt; + +\d+ at_view_2 + View "public.at_view_2" + Column | Type | Collation | Nullable | Default | Storage | Description +--------+---------+-----------+----------+---------+----------+------------- + 
id | integer | | | | plain | + stuff | text | | | | extended | + j | json | | | | extended | +View definition: + SELECT v1.id, + v1.stuff, + to_json(v1.*) AS j + FROM at_view_1 v1; + +explain (verbose, costs off) select * from at_view_2; + QUERY PLAN +---------------------------------------------------------- + Seq Scan on public.at_base_table bt + Output: bt.id, bt.stuff, to_json(ROW(bt.id, bt.stuff)) +(2 rows) + +select * from at_view_2; + id | stuff | j +----+--------+---------------------------- + 23 | skidoo | {"id":23,"stuff":"skidoo"} +(1 row) + +create or replace view at_view_1 as select *, 2+2 as more from at_base_table bt; +\d+ at_view_1 + View "public.at_view_1" + Column | Type | Collation | Nullable | Default | Storage | Description +--------+---------+-----------+----------+---------+----------+------------- + id | integer | | | | plain | + stuff | text | | | | extended | + more | integer | | | | plain | +View definition: + SELECT bt.id, + bt.stuff, + 2 + 2 AS more + FROM at_base_table bt; + +\d+ at_view_2 + View "public.at_view_2" + Column | Type | Collation | Nullable | Default | Storage | Description +--------+---------+-----------+----------+---------+----------+------------- + id | integer | | | | plain | + stuff | text | | | | extended | + j | json | | | | extended | +View definition: + SELECT v1.id, + v1.stuff, + to_json(v1.*) AS j + FROM at_view_1 v1; + +explain (verbose, costs off) select * from at_view_2; + QUERY PLAN +---------------------------------------------------------------- + Seq Scan on public.at_base_table bt + Output: bt.id, bt.stuff, to_json(ROW(bt.id, bt.stuff, NULL)) +(2 rows) + +select * from at_view_2; + id | stuff | j +----+--------+---------------------------------------- + 23 | skidoo | {"id":23,"stuff":"skidoo","more":null} +(1 row) + +drop view at_view_2; +drop view at_view_1; +drop table at_base_table; +-- check adding a column not iself requiring a rewrite, together with +-- a column requiring a default (bug #16038) 
+-- ensure that rewrites aren't silently optimized away, removing the +-- value of the test +CREATE FUNCTION check_ddl_rewrite(p_tablename regclass, p_ddl text) +RETURNS boolean +LANGUAGE plpgsql AS $$ +DECLARE + v_relfilenode oid; +BEGIN + v_relfilenode := relfilenode FROM pg_class WHERE oid = p_tablename; + + EXECUTE p_ddl; + + RETURN v_relfilenode <> (SELECT relfilenode FROM pg_class WHERE oid = p_tablename); +END; +$$; +CREATE TABLE rewrite_test(col text); +INSERT INTO rewrite_test VALUES ('something'); +INSERT INTO rewrite_test VALUES (NULL); +-- empty[12] don't need rewrite, but notempty[12]_rewrite will force one +SELECT check_ddl_rewrite('rewrite_test', $$ + ALTER TABLE rewrite_test + ADD COLUMN empty1 text, + ADD COLUMN notempty1_rewrite serial; +$$); + check_ddl_rewrite +------------------- + t +(1 row) + +SELECT check_ddl_rewrite('rewrite_test', $$ + ALTER TABLE rewrite_test + ADD COLUMN notempty2_rewrite serial, + ADD COLUMN empty2 text; +$$); + check_ddl_rewrite +------------------- + t +(1 row) + +-- also check that fast defaults cause no problem, first without rewrite +SELECT check_ddl_rewrite('rewrite_test', $$ + ALTER TABLE rewrite_test + ADD COLUMN empty3 text, + ADD COLUMN notempty3_norewrite int default 42; +$$); + check_ddl_rewrite +------------------- + f +(1 row) + +SELECT check_ddl_rewrite('rewrite_test', $$ + ALTER TABLE rewrite_test + ADD COLUMN notempty4_norewrite int default 42, + ADD COLUMN empty4 text; +$$); + check_ddl_rewrite +------------------- + f +(1 row) + +-- then with rewrite +SELECT check_ddl_rewrite('rewrite_test', $$ + ALTER TABLE rewrite_test + ADD COLUMN empty5 text, + ADD COLUMN notempty5_norewrite int default 42, + ADD COLUMN notempty5_rewrite serial; +$$); + check_ddl_rewrite +------------------- + t +(1 row) + +SELECT check_ddl_rewrite('rewrite_test', $$ + ALTER TABLE rewrite_test + ADD COLUMN notempty6_rewrite serial, + ADD COLUMN empty6 text, + ADD COLUMN notempty6_norewrite int default 42; +$$); + check_ddl_rewrite 
+------------------- + t +(1 row) + +-- cleanup +DROP FUNCTION check_ddl_rewrite(regclass, text); +DROP TABLE rewrite_test; +-- +-- lock levels +-- +drop type lockmodes; +ERROR: type "lockmodes" does not exist +create type lockmodes as enum ( + 'SIReadLock' +,'AccessShareLock' +,'RowShareLock' +,'RowExclusiveLock' +,'ShareUpdateExclusiveLock' +,'ShareLock' +,'ShareRowExclusiveLock' +,'ExclusiveLock' +,'AccessExclusiveLock' +); +drop view my_locks; +ERROR: view "my_locks" does not exist +create or replace view my_locks as +select case when c.relname like 'pg_toast%' then 'pg_toast' else c.relname end, max(mode::lockmodes) as max_lockmode +from pg_locks l join pg_class c on l.relation = c.oid +where virtualtransaction = ( + select virtualtransaction + from pg_locks + where transactionid = pg_current_xact_id()::xid) +and locktype = 'relation' +and relnamespace != (select oid from pg_namespace where nspname = 'pg_catalog') +and c.relname != 'my_locks' +group by c.relname; +create table alterlock (f1 int primary key, f2 text); +insert into alterlock values (1, 'foo'); +create table alterlock2 (f3 int primary key, f1 int); +insert into alterlock2 values (1, 1); +begin; alter table alterlock alter column f2 set statistics 150; +select * from my_locks order by 1; + relname | max_lockmode +-----------+-------------------------- + alterlock | ShareUpdateExclusiveLock +(1 row) + +rollback; +begin; alter table alterlock cluster on alterlock_pkey; +select * from my_locks order by 1; + relname | max_lockmode +----------------+-------------------------- + alterlock | ShareUpdateExclusiveLock + alterlock_pkey | ShareUpdateExclusiveLock +(2 rows) + +commit; +begin; alter table alterlock set without cluster; +select * from my_locks order by 1; + relname | max_lockmode +-----------+-------------------------- + alterlock | ShareUpdateExclusiveLock +(1 row) + +commit; +begin; alter table alterlock set (fillfactor = 100); +select * from my_locks order by 1; + relname | max_lockmode 
+-----------+-------------------------- + alterlock | ShareUpdateExclusiveLock + pg_toast | ShareUpdateExclusiveLock +(2 rows) + +commit; +begin; alter table alterlock reset (fillfactor); +select * from my_locks order by 1; + relname | max_lockmode +-----------+-------------------------- + alterlock | ShareUpdateExclusiveLock + pg_toast | ShareUpdateExclusiveLock +(2 rows) + +commit; +begin; alter table alterlock set (toast.autovacuum_enabled = off); +select * from my_locks order by 1; + relname | max_lockmode +-----------+-------------------------- + alterlock | ShareUpdateExclusiveLock + pg_toast | ShareUpdateExclusiveLock +(2 rows) + +commit; +begin; alter table alterlock set (autovacuum_enabled = off); +select * from my_locks order by 1; + relname | max_lockmode +-----------+-------------------------- + alterlock | ShareUpdateExclusiveLock + pg_toast | ShareUpdateExclusiveLock +(2 rows) + +commit; +begin; alter table alterlock alter column f2 set (n_distinct = 1); +select * from my_locks order by 1; + relname | max_lockmode +-----------+-------------------------- + alterlock | ShareUpdateExclusiveLock +(1 row) + +rollback; +-- test that mixing options with different lock levels works as expected +begin; alter table alterlock set (autovacuum_enabled = off, fillfactor = 80); +select * from my_locks order by 1; + relname | max_lockmode +-----------+-------------------------- + alterlock | ShareUpdateExclusiveLock + pg_toast | ShareUpdateExclusiveLock +(2 rows) + +commit; +begin; alter table alterlock alter column f2 set storage extended; +select * from my_locks order by 1; + relname | max_lockmode +-----------+--------------------- + alterlock | AccessExclusiveLock +(1 row) + +rollback; +begin; alter table alterlock alter column f2 set default 'x'; +select * from my_locks order by 1; + relname | max_lockmode +-----------+--------------------- + alterlock | AccessExclusiveLock +(1 row) + +rollback; +begin; +create trigger ttdummy + before delete or update on 
alterlock + for each row + execute procedure + ttdummy (1, 1); +select * from my_locks order by 1; + relname | max_lockmode +-----------+----------------------- + alterlock | ShareRowExclusiveLock +(1 row) + +rollback; +begin; +select * from my_locks order by 1; + relname | max_lockmode +---------+-------------- +(0 rows) + +alter table alterlock2 add foreign key (f1) references alterlock (f1); +select * from my_locks order by 1; + relname | max_lockmode +-----------------+----------------------- + alterlock | ShareRowExclusiveLock + alterlock2 | ShareRowExclusiveLock + alterlock2_pkey | AccessShareLock + alterlock_pkey | AccessShareLock +(4 rows) + +rollback; +begin; +alter table alterlock2 +add constraint alterlock2nv foreign key (f1) references alterlock (f1) NOT VALID; +select * from my_locks order by 1; + relname | max_lockmode +------------+----------------------- + alterlock | ShareRowExclusiveLock + alterlock2 | ShareRowExclusiveLock +(2 rows) + +commit; +begin; +alter table alterlock2 validate constraint alterlock2nv; +select * from my_locks order by 1; + relname | max_lockmode +-----------------+-------------------------- + alterlock | RowShareLock + alterlock2 | ShareUpdateExclusiveLock + alterlock2_pkey | AccessShareLock + alterlock_pkey | AccessShareLock +(4 rows) + +rollback; +create or replace view my_locks as +select case when c.relname like 'pg_toast%' then 'pg_toast' else c.relname end, max(mode::lockmodes) as max_lockmode +from pg_locks l join pg_class c on l.relation = c.oid +where virtualtransaction = ( + select virtualtransaction + from pg_locks + where transactionid = pg_current_xact_id()::xid) +and locktype = 'relation' +and relnamespace != (select oid from pg_namespace where nspname = 'pg_catalog') +and c.relname = 'my_locks' +group by c.relname; +-- raise exception +alter table my_locks set (autovacuum_enabled = false); +ERROR: unrecognized parameter "autovacuum_enabled" +alter view my_locks set (autovacuum_enabled = false); +ERROR: 
unrecognized parameter "autovacuum_enabled" +alter table my_locks reset (autovacuum_enabled); +alter view my_locks reset (autovacuum_enabled); +begin; +alter view my_locks set (security_barrier=off); +select * from my_locks order by 1; + relname | max_lockmode +----------+--------------------- + my_locks | AccessExclusiveLock +(1 row) + +alter view my_locks reset (security_barrier); +rollback; +-- this test intentionally applies the ALTER TABLE command against a view, but +-- uses a view option so we expect this to succeed. This form of SQL is +-- accepted for historical reasons, as shown in the docs for ALTER VIEW +begin; +alter table my_locks set (security_barrier=off); +select * from my_locks order by 1; + relname | max_lockmode +----------+--------------------- + my_locks | AccessExclusiveLock +(1 row) + +alter table my_locks reset (security_barrier); +rollback; +-- cleanup +drop table alterlock2; +drop table alterlock; +drop view my_locks; +drop type lockmodes; +-- +-- alter function +-- +create function test_strict(text) returns text as + 'select coalesce($1, ''got passed a null'');' + language sql returns null on null input; +select test_strict(NULL); + test_strict +------------- + +(1 row) + +alter function test_strict(text) called on null input; +select test_strict(NULL); + test_strict +------------------- + got passed a null +(1 row) + +create function non_strict(text) returns text as + 'select coalesce($1, ''got passed a null'');' + language sql called on null input; +select non_strict(NULL); + non_strict +------------------- + got passed a null +(1 row) + +alter function non_strict(text) returns null on null input; +select non_strict(NULL); + non_strict +------------ + +(1 row) + +-- +-- alter object set schema +-- +create schema alter1; +create schema alter2; +create table alter1.t1(f1 serial primary key, f2 int check (f2 > 0)); +create view alter1.v1 as select * from alter1.t1; +create function alter1.plus1(int) returns int as 'select $1+1' language 
sql; +create domain alter1.posint integer check (value > 0); +create type alter1.ctype as (f1 int, f2 text); +create function alter1.same(alter1.ctype, alter1.ctype) returns boolean language sql +as 'select $1.f1 is not distinct from $2.f1 and $1.f2 is not distinct from $2.f2'; +create operator alter1.=(procedure = alter1.same, leftarg = alter1.ctype, rightarg = alter1.ctype); +create operator class alter1.ctype_hash_ops default for type alter1.ctype using hash as + operator 1 alter1.=(alter1.ctype, alter1.ctype); +create conversion alter1.latin1_to_utf8 for 'latin1' to 'utf8' from iso8859_1_to_utf8; +create text search parser alter1.prs(start = prsd_start, gettoken = prsd_nexttoken, end = prsd_end, lextypes = prsd_lextype); +create text search configuration alter1.cfg(parser = alter1.prs); +create text search template alter1.tmpl(init = dsimple_init, lexize = dsimple_lexize); +create text search dictionary alter1.dict(template = alter1.tmpl); +insert into alter1.t1(f2) values(11); +insert into alter1.t1(f2) values(12); +alter table alter1.t1 set schema alter1; -- no-op, same schema +alter table alter1.t1 set schema alter2; +alter table alter1.v1 set schema alter2; +alter function alter1.plus1(int) set schema alter2; +alter domain alter1.posint set schema alter2; +alter operator class alter1.ctype_hash_ops using hash set schema alter2; +alter operator family alter1.ctype_hash_ops using hash set schema alter2; +alter operator alter1.=(alter1.ctype, alter1.ctype) set schema alter2; +alter function alter1.same(alter1.ctype, alter1.ctype) set schema alter2; +alter type alter1.ctype set schema alter1; -- no-op, same schema +alter type alter1.ctype set schema alter2; +alter conversion alter1.latin1_to_utf8 set schema alter2; +alter text search parser alter1.prs set schema alter2; +alter text search configuration alter1.cfg set schema alter2; +alter text search template alter1.tmpl set schema alter2; +alter text search dictionary alter1.dict set schema alter2; +-- this 
should succeed because nothing is left in alter1 +drop schema alter1; +insert into alter2.t1(f2) values(13); +insert into alter2.t1(f2) values(14); +select * from alter2.t1; + f1 | f2 +----+---- + 1 | 11 + 2 | 12 + 3 | 13 + 4 | 14 +(4 rows) + +select * from alter2.v1; + f1 | f2 +----+---- + 1 | 11 + 2 | 12 + 3 | 13 + 4 | 14 +(4 rows) + +select alter2.plus1(41); + plus1 +------- + 42 +(1 row) + +-- clean up +drop schema alter2 cascade; +NOTICE: drop cascades to 13 other objects +DETAIL: drop cascades to table alter2.t1 +drop cascades to view alter2.v1 +drop cascades to function alter2.plus1(integer) +drop cascades to type alter2.posint +drop cascades to type alter2.ctype +drop cascades to function alter2.same(alter2.ctype,alter2.ctype) +drop cascades to operator alter2.=(alter2.ctype,alter2.ctype) +drop cascades to operator family alter2.ctype_hash_ops for access method hash +drop cascades to conversion alter2.latin1_to_utf8 +drop cascades to text search parser alter2.prs +drop cascades to text search configuration alter2.cfg +drop cascades to text search template alter2.tmpl +drop cascades to text search dictionary alter2.dict +-- +-- composite types +-- +CREATE TYPE test_type AS (a int); +\d test_type + Composite type "public.test_type" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + +ALTER TYPE nosuchtype ADD ATTRIBUTE b text; -- fails +ERROR: relation "nosuchtype" does not exist +ALTER TYPE test_type ADD ATTRIBUTE b text; +\d test_type + Composite type "public.test_type" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | text | | | + +ALTER TYPE test_type ADD ATTRIBUTE b text; -- fails +ERROR: column "b" of relation "test_type" already exists +ALTER TYPE test_type ALTER ATTRIBUTE b SET DATA TYPE varchar; +\d test_type + Composite type "public.test_type" + Column | Type | Collation | Nullable | Default 
+--------+-------------------+-----------+----------+--------- + a | integer | | | + b | character varying | | | + +ALTER TYPE test_type ALTER ATTRIBUTE b SET DATA TYPE integer; +\d test_type + Composite type "public.test_type" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + +ALTER TYPE test_type DROP ATTRIBUTE b; +\d test_type + Composite type "public.test_type" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + +ALTER TYPE test_type DROP ATTRIBUTE c; -- fails +ERROR: column "c" of relation "test_type" does not exist +ALTER TYPE test_type DROP ATTRIBUTE IF EXISTS c; +NOTICE: column "c" of relation "test_type" does not exist, skipping +ALTER TYPE test_type DROP ATTRIBUTE a, ADD ATTRIBUTE d boolean; +\d test_type + Composite type "public.test_type" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + d | boolean | | | + +ALTER TYPE test_type RENAME ATTRIBUTE a TO aa; +ERROR: column "a" does not exist +ALTER TYPE test_type RENAME ATTRIBUTE d TO dd; +\d test_type + Composite type "public.test_type" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + dd | boolean | | | + +DROP TYPE test_type; +CREATE TYPE test_type1 AS (a int, b text); +CREATE TABLE test_tbl1 (x int, y test_type1); +ALTER TYPE test_type1 ALTER ATTRIBUTE b TYPE varchar; -- fails +ERROR: cannot alter type "test_type1" because column "test_tbl1.y" uses it +CREATE TYPE test_type2 AS (a int, b text); +CREATE TABLE test_tbl2 OF test_type2; +CREATE TABLE test_tbl2_subclass () INHERITS (test_tbl2); +\d test_type2 + Composite type "public.test_type2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | text | | | + +\d test_tbl2 + Table "public.test_tbl2" + Column | Type 
| Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | text | | | +Number of child tables: 1 (Use \d+ to list them.) +Typed table of type: test_type2 + +ALTER TYPE test_type2 ADD ATTRIBUTE c text; -- fails +ERROR: cannot alter type "test_type2" because it is the type of a typed table +HINT: Use ALTER ... CASCADE to alter the typed tables too. +ALTER TYPE test_type2 ADD ATTRIBUTE c text CASCADE; +\d test_type2 + Composite type "public.test_type2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | text | | | + c | text | | | + +\d test_tbl2 + Table "public.test_tbl2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | text | | | + c | text | | | +Number of child tables: 1 (Use \d+ to list them.) +Typed table of type: test_type2 + +ALTER TYPE test_type2 ALTER ATTRIBUTE b TYPE varchar; -- fails +ERROR: cannot alter type "test_type2" because it is the type of a typed table +HINT: Use ALTER ... CASCADE to alter the typed tables too. +ALTER TYPE test_type2 ALTER ATTRIBUTE b TYPE varchar CASCADE; +\d test_type2 + Composite type "public.test_type2" + Column | Type | Collation | Nullable | Default +--------+-------------------+-----------+----------+--------- + a | integer | | | + b | character varying | | | + c | text | | | + +\d test_tbl2 + Table "public.test_tbl2" + Column | Type | Collation | Nullable | Default +--------+-------------------+-----------+----------+--------- + a | integer | | | + b | character varying | | | + c | text | | | +Number of child tables: 1 (Use \d+ to list them.) +Typed table of type: test_type2 + +ALTER TYPE test_type2 DROP ATTRIBUTE b; -- fails +ERROR: cannot alter type "test_type2" because it is the type of a typed table +HINT: Use ALTER ... CASCADE to alter the typed tables too. 
+ALTER TYPE test_type2 DROP ATTRIBUTE b CASCADE; +\d test_type2 + Composite type "public.test_type2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + c | text | | | + +\d test_tbl2 + Table "public.test_tbl2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + c | text | | | +Number of child tables: 1 (Use \d+ to list them.) +Typed table of type: test_type2 + +ALTER TYPE test_type2 RENAME ATTRIBUTE a TO aa; -- fails +ERROR: cannot alter type "test_type2" because it is the type of a typed table +HINT: Use ALTER ... CASCADE to alter the typed tables too. +ALTER TYPE test_type2 RENAME ATTRIBUTE a TO aa CASCADE; +\d test_type2 + Composite type "public.test_type2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + aa | integer | | | + c | text | | | + +\d test_tbl2 + Table "public.test_tbl2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + aa | integer | | | + c | text | | | +Number of child tables: 1 (Use \d+ to list them.) +Typed table of type: test_type2 + +\d test_tbl2_subclass + Table "public.test_tbl2_subclass" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + aa | integer | | | + c | text | | | +Inherits: test_tbl2 + +DROP TABLE test_tbl2_subclass; +CREATE TYPE test_typex AS (a int, b text); +CREATE TABLE test_tblx (x int, y test_typex check ((y).a > 0)); +ALTER TYPE test_typex DROP ATTRIBUTE a; -- fails +ERROR: cannot drop column a of composite type test_typex because other objects depend on it +DETAIL: constraint test_tblx_y_check on table test_tblx depends on column a of composite type test_typex +HINT: Use DROP ... CASCADE to drop the dependent objects too. 
+ALTER TYPE test_typex DROP ATTRIBUTE a CASCADE; +NOTICE: drop cascades to constraint test_tblx_y_check on table test_tblx +\d test_tblx + Table "public.test_tblx" + Column | Type | Collation | Nullable | Default +--------+------------+-----------+----------+--------- + x | integer | | | + y | test_typex | | | + +DROP TABLE test_tblx; +DROP TYPE test_typex; +-- This test isn't that interesting on its own, but the purpose is to leave +-- behind a table to test pg_upgrade with. The table has a composite type +-- column in it, and the composite type has a dropped attribute. +CREATE TYPE test_type3 AS (a int); +CREATE TABLE test_tbl3 (c) AS SELECT '(1)'::test_type3; +ALTER TYPE test_type3 DROP ATTRIBUTE a, ADD ATTRIBUTE b int; +CREATE TYPE test_type_empty AS (); +DROP TYPE test_type_empty; +-- +-- typed tables: OF / NOT OF +-- +CREATE TYPE tt_t0 AS (z inet, x int, y numeric(8,2)); +ALTER TYPE tt_t0 DROP ATTRIBUTE z; +CREATE TABLE tt0 (x int NOT NULL, y numeric(8,2)); -- OK +CREATE TABLE tt1 (x int, y bigint); -- wrong base type +CREATE TABLE tt2 (x int, y numeric(9,2)); -- wrong typmod +CREATE TABLE tt3 (y numeric(8,2), x int); -- wrong column order +CREATE TABLE tt4 (x int); -- too few columns +CREATE TABLE tt5 (x int, y numeric(8,2), z int); -- too few columns +CREATE TABLE tt6 () INHERITS (tt0); -- can't have a parent +CREATE TABLE tt7 (x int, q text, y numeric(8,2)); +ALTER TABLE tt7 DROP q; -- OK +ALTER TABLE tt0 OF tt_t0; +ALTER TABLE tt1 OF tt_t0; +ERROR: table "tt1" has different type for column "y" +ALTER TABLE tt2 OF tt_t0; +ERROR: table "tt2" has different type for column "y" +ALTER TABLE tt3 OF tt_t0; +ERROR: table has column "y" where type requires "x" +ALTER TABLE tt4 OF tt_t0; +ERROR: table is missing column "y" +ALTER TABLE tt5 OF tt_t0; +ERROR: table has extra column "z" +ALTER TABLE tt6 OF tt_t0; +ERROR: typed tables cannot inherit +ALTER TABLE tt7 OF tt_t0; +CREATE TYPE tt_t1 AS (x int, y numeric(8,2)); +ALTER TABLE tt7 OF tt_t1; -- reassign an 
already-typed table +ALTER TABLE tt7 NOT OF; +\d tt7 + Table "public.tt7" + Column | Type | Collation | Nullable | Default +--------+--------------+-----------+----------+--------- + x | integer | | | + y | numeric(8,2) | | | + +-- make sure we can drop a constraint on the parent but it remains on the child +CREATE TABLE test_drop_constr_parent (c text CHECK (c IS NOT NULL)); +CREATE TABLE test_drop_constr_child () INHERITS (test_drop_constr_parent); +ALTER TABLE ONLY test_drop_constr_parent DROP CONSTRAINT "test_drop_constr_parent_c_check"; +-- should fail +INSERT INTO test_drop_constr_child (c) VALUES (NULL); +ERROR: new row for relation "test_drop_constr_child" violates check constraint "test_drop_constr_parent_c_check" +DETAIL: Failing row contains (null). +DROP TABLE test_drop_constr_parent CASCADE; +NOTICE: drop cascades to table test_drop_constr_child +-- +-- IF EXISTS test +-- +ALTER TABLE IF EXISTS tt8 ADD COLUMN f int; +NOTICE: relation "tt8" does not exist, skipping +ALTER TABLE IF EXISTS tt8 ADD CONSTRAINT xxx PRIMARY KEY(f); +NOTICE: relation "tt8" does not exist, skipping +ALTER TABLE IF EXISTS tt8 ADD CHECK (f BETWEEN 0 AND 10); +NOTICE: relation "tt8" does not exist, skipping +ALTER TABLE IF EXISTS tt8 ALTER COLUMN f SET DEFAULT 0; +NOTICE: relation "tt8" does not exist, skipping +ALTER TABLE IF EXISTS tt8 RENAME COLUMN f TO f1; +NOTICE: relation "tt8" does not exist, skipping +ALTER TABLE IF EXISTS tt8 SET SCHEMA alter2; +NOTICE: relation "tt8" does not exist, skipping +CREATE TABLE tt8(a int); +CREATE SCHEMA alter2; +ALTER TABLE IF EXISTS tt8 ADD COLUMN f int; +ALTER TABLE IF EXISTS tt8 ADD CONSTRAINT xxx PRIMARY KEY(f); +ALTER TABLE IF EXISTS tt8 ADD CHECK (f BETWEEN 0 AND 10); +ALTER TABLE IF EXISTS tt8 ALTER COLUMN f SET DEFAULT 0; +ALTER TABLE IF EXISTS tt8 RENAME COLUMN f TO f1; +ALTER TABLE IF EXISTS tt8 SET SCHEMA alter2; +\d alter2.tt8 + Table "alter2.tt8" + Column | Type | Collation | Nullable | Default 
+--------+---------+-----------+----------+--------- + a | integer | | | + f1 | integer | | not null | 0 +Indexes: + "xxx" PRIMARY KEY, btree (f1) +Check constraints: + "tt8_f_check" CHECK (f1 >= 0 AND f1 <= 10) + +DROP TABLE alter2.tt8; +DROP SCHEMA alter2; +-- +-- Check conflicts between index and CHECK constraint names +-- +CREATE TABLE tt9(c integer); +ALTER TABLE tt9 ADD CHECK(c > 1); +ALTER TABLE tt9 ADD CHECK(c > 2); -- picks nonconflicting name +ALTER TABLE tt9 ADD CONSTRAINT foo CHECK(c > 3); +ALTER TABLE tt9 ADD CONSTRAINT foo CHECK(c > 4); -- fail, dup name +ERROR: constraint "foo" for relation "tt9" already exists +ALTER TABLE tt9 ADD UNIQUE(c); +ALTER TABLE tt9 ADD UNIQUE(c); -- picks nonconflicting name +ALTER TABLE tt9 ADD CONSTRAINT tt9_c_key UNIQUE(c); -- fail, dup name +ERROR: relation "tt9_c_key" already exists +ALTER TABLE tt9 ADD CONSTRAINT foo UNIQUE(c); -- fail, dup name +ERROR: constraint "foo" for relation "tt9" already exists +ALTER TABLE tt9 ADD CONSTRAINT tt9_c_key CHECK(c > 5); -- fail, dup name +ERROR: constraint "tt9_c_key" for relation "tt9" already exists +ALTER TABLE tt9 ADD CONSTRAINT tt9_c_key2 CHECK(c > 6); +ALTER TABLE tt9 ADD UNIQUE(c); -- picks nonconflicting name +\d tt9 + Table "public.tt9" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c | integer | | | +Indexes: + "tt9_c_key" UNIQUE CONSTRAINT, btree (c) + "tt9_c_key1" UNIQUE CONSTRAINT, btree (c) + "tt9_c_key3" UNIQUE CONSTRAINT, btree (c) +Check constraints: + "foo" CHECK (c > 3) + "tt9_c_check" CHECK (c > 1) + "tt9_c_check1" CHECK (c > 2) + "tt9_c_key2" CHECK (c > 6) + +DROP TABLE tt9; +-- Check that comments on constraints and indexes are not lost at ALTER TABLE. 
+CREATE TABLE comment_test ( + id int, + positive_col int CHECK (positive_col > 0), + indexed_col int, + CONSTRAINT comment_test_pk PRIMARY KEY (id)); +CREATE INDEX comment_test_index ON comment_test(indexed_col); +COMMENT ON COLUMN comment_test.id IS 'Column ''id'' on comment_test'; +COMMENT ON INDEX comment_test_index IS 'Simple index on comment_test'; +COMMENT ON CONSTRAINT comment_test_positive_col_check ON comment_test IS 'CHECK constraint on comment_test.positive_col'; +COMMENT ON CONSTRAINT comment_test_pk ON comment_test IS 'PRIMARY KEY constraint of comment_test'; +COMMENT ON INDEX comment_test_pk IS 'Index backing the PRIMARY KEY of comment_test'; +SELECT col_description('comment_test'::regclass, 1) as comment; + comment +----------------------------- + Column 'id' on comment_test +(1 row) + +SELECT indexrelid::regclass::text as index, obj_description(indexrelid, 'pg_class') as comment FROM pg_index where indrelid = 'comment_test'::regclass ORDER BY 1, 2; + index | comment +--------------------+----------------------------------------------- + comment_test_index | Simple index on comment_test + comment_test_pk | Index backing the PRIMARY KEY of comment_test +(2 rows) + +SELECT conname as constraint, obj_description(oid, 'pg_constraint') as comment FROM pg_constraint where conrelid = 'comment_test'::regclass ORDER BY 1, 2; + constraint | comment +---------------------------------+----------------------------------------------- + comment_test_pk | PRIMARY KEY constraint of comment_test + comment_test_positive_col_check | CHECK constraint on comment_test.positive_col +(2 rows) + +-- Change the datatype of all the columns. ALTER TABLE is optimized to not +-- rebuild an index if the new data type is binary compatible with the old +-- one. Check do a dummy ALTER TABLE that doesn't change the datatype +-- first, to test that no-op codepath, and another one that does. 
+ALTER TABLE comment_test ALTER COLUMN indexed_col SET DATA TYPE int; +ALTER TABLE comment_test ALTER COLUMN indexed_col SET DATA TYPE text; +ALTER TABLE comment_test ALTER COLUMN id SET DATA TYPE int; +ALTER TABLE comment_test ALTER COLUMN id SET DATA TYPE text; +ALTER TABLE comment_test ALTER COLUMN positive_col SET DATA TYPE int; +ALTER TABLE comment_test ALTER COLUMN positive_col SET DATA TYPE bigint; +-- Check that the comments are intact. +SELECT col_description('comment_test'::regclass, 1) as comment; + comment +----------------------------- + Column 'id' on comment_test +(1 row) + +SELECT indexrelid::regclass::text as index, obj_description(indexrelid, 'pg_class') as comment FROM pg_index where indrelid = 'comment_test'::regclass ORDER BY 1, 2; + index | comment +--------------------+----------------------------------------------- + comment_test_index | Simple index on comment_test + comment_test_pk | Index backing the PRIMARY KEY of comment_test +(2 rows) + +SELECT conname as constraint, obj_description(oid, 'pg_constraint') as comment FROM pg_constraint where conrelid = 'comment_test'::regclass ORDER BY 1, 2; + constraint | comment +---------------------------------+----------------------------------------------- + comment_test_pk | PRIMARY KEY constraint of comment_test + comment_test_positive_col_check | CHECK constraint on comment_test.positive_col +(2 rows) + +-- Check compatibility for foreign keys and comments. This is done +-- separately as rebuilding the column type of the parent leads +-- to an error and would reduce the test scope. 
+CREATE TABLE comment_test_child ( + id text CONSTRAINT comment_test_child_fk REFERENCES comment_test); +CREATE INDEX comment_test_child_fk ON comment_test_child(id); +COMMENT ON COLUMN comment_test_child.id IS 'Column ''id'' on comment_test_child'; +COMMENT ON INDEX comment_test_child_fk IS 'Index backing the FOREIGN KEY of comment_test_child'; +COMMENT ON CONSTRAINT comment_test_child_fk ON comment_test_child IS 'FOREIGN KEY constraint of comment_test_child'; +-- Change column type of parent +ALTER TABLE comment_test ALTER COLUMN id SET DATA TYPE text; +ALTER TABLE comment_test ALTER COLUMN id SET DATA TYPE int USING id::integer; +ERROR: foreign key constraint "comment_test_child_fk" cannot be implemented +DETAIL: Key columns "id" and "id" are of incompatible types: text and integer. +-- Comments should be intact +SELECT col_description('comment_test_child'::regclass, 1) as comment; + comment +----------------------------------- + Column 'id' on comment_test_child +(1 row) + +SELECT indexrelid::regclass::text as index, obj_description(indexrelid, 'pg_class') as comment FROM pg_index where indrelid = 'comment_test_child'::regclass ORDER BY 1, 2; + index | comment +-----------------------+----------------------------------------------------- + comment_test_child_fk | Index backing the FOREIGN KEY of comment_test_child +(1 row) + +SELECT conname as constraint, obj_description(oid, 'pg_constraint') as comment FROM pg_constraint where conrelid = 'comment_test_child'::regclass ORDER BY 1, 2; + constraint | comment +-----------------------+---------------------------------------------- + comment_test_child_fk | FOREIGN KEY constraint of comment_test_child +(1 row) + +-- Check that we map relation oids to filenodes and back correctly. Only +-- display bad mappings so the test output doesn't change all the time. 
A +-- filenode function call can return NULL for a relation dropped concurrently +-- with the call's surrounding query, so ignore a NULL mapped_oid for +-- relations that no longer exist after all calls finish. +CREATE TEMP TABLE filenode_mapping AS +SELECT + oid, mapped_oid, reltablespace, relfilenode, relname +FROM pg_class, + pg_filenode_relation(reltablespace, pg_relation_filenode(oid)) AS mapped_oid +WHERE relkind IN ('r', 'i', 'S', 't', 'm') AND mapped_oid IS DISTINCT FROM oid; +SELECT m.* FROM filenode_mapping m LEFT JOIN pg_class c ON c.oid = m.oid +WHERE c.oid IS NOT NULL OR m.mapped_oid IS NOT NULL; + oid | mapped_oid | reltablespace | relfilenode | relname +-----+------------+---------------+-------------+--------- +(0 rows) + +-- Checks on creating and manipulation of user defined relations in +-- pg_catalog. +SHOW allow_system_table_mods; + allow_system_table_mods +------------------------- + off +(1 row) + +-- disallowed because of search_path issues with pg_dump +CREATE TABLE pg_catalog.new_system_table(); +ERROR: permission denied to create "pg_catalog.new_system_table" +DETAIL: System catalog modifications are currently disallowed. 
+-- instead create in public first, move to catalog +CREATE TABLE new_system_table(id serial primary key, othercol text); +ALTER TABLE new_system_table SET SCHEMA pg_catalog; +ALTER TABLE new_system_table SET SCHEMA public; +ALTER TABLE new_system_table SET SCHEMA pg_catalog; +-- will be ignored -- already there: +ALTER TABLE new_system_table SET SCHEMA pg_catalog; +ALTER TABLE new_system_table RENAME TO old_system_table; +CREATE INDEX old_system_table__othercol ON old_system_table (othercol); +INSERT INTO old_system_table(othercol) VALUES ('somedata'), ('otherdata'); +UPDATE old_system_table SET id = -id; +DELETE FROM old_system_table WHERE othercol = 'somedata'; +TRUNCATE old_system_table; +ALTER TABLE old_system_table DROP CONSTRAINT new_system_table_pkey; +ALTER TABLE old_system_table DROP COLUMN othercol; +DROP TABLE old_system_table; +-- set logged +CREATE UNLOGGED TABLE unlogged1(f1 SERIAL PRIMARY KEY, f2 TEXT); +-- check relpersistence of an unlogged table +SELECT relname, relkind, relpersistence FROM pg_class WHERE relname ~ '^unlogged1' +UNION ALL +SELECT 'toast table', t.relkind, t.relpersistence FROM pg_class r JOIN pg_class t ON t.oid = r.reltoastrelid WHERE r.relname ~ '^unlogged1' +UNION ALL +SELECT 'toast index', ri.relkind, ri.relpersistence FROM pg_class r join pg_class t ON t.oid = r.reltoastrelid JOIN pg_index i ON i.indrelid = t.oid JOIN pg_class ri ON ri.oid = i.indexrelid WHERE r.relname ~ '^unlogged1' +ORDER BY relname; + relname | relkind | relpersistence +------------------+---------+---------------- + toast index | i | p + toast table | t | p + unlogged1 | r | p + unlogged1_f1_seq | S | p + unlogged1_pkey | i | p +(5 rows) + +CREATE UNLOGGED TABLE unlogged2(f1 SERIAL PRIMARY KEY, f2 INTEGER REFERENCES unlogged1); -- foreign key +CREATE UNLOGGED TABLE unlogged3(f1 SERIAL PRIMARY KEY, f2 INTEGER REFERENCES unlogged3); -- self-referencing foreign key +ALTER TABLE unlogged3 SET LOGGED; -- skip self-referencing foreign key +ALTER TABLE 
unlogged2 SET LOGGED; -- fails because a foreign key to an unlogged table exists +ALTER TABLE unlogged1 SET LOGGED; +-- check relpersistence of an unlogged table after changing to permanent +SELECT relname, relkind, relpersistence FROM pg_class WHERE relname ~ '^unlogged1' +UNION ALL +SELECT 'toast table', t.relkind, t.relpersistence FROM pg_class r JOIN pg_class t ON t.oid = r.reltoastrelid WHERE r.relname ~ '^unlogged1' +UNION ALL +SELECT 'toast index', ri.relkind, ri.relpersistence FROM pg_class r join pg_class t ON t.oid = r.reltoastrelid JOIN pg_index i ON i.indrelid = t.oid JOIN pg_class ri ON ri.oid = i.indexrelid WHERE r.relname ~ '^unlogged1' +ORDER BY relname; + relname | relkind | relpersistence +------------------+---------+---------------- + toast index | i | p + toast table | t | p + unlogged1 | r | p + unlogged1_f1_seq | S | p + unlogged1_pkey | i | p +(5 rows) + +ALTER TABLE unlogged1 SET LOGGED; -- silently do nothing +DROP TABLE unlogged3; +DROP TABLE unlogged2; +DROP TABLE unlogged1; +-- set unlogged +CREATE TABLE logged1(f1 SERIAL PRIMARY KEY, f2 TEXT); +-- check relpersistence of a permanent table +SELECT relname, relkind, relpersistence FROM pg_class WHERE relname ~ '^logged1' +UNION ALL +SELECT 'toast table', t.relkind, t.relpersistence FROM pg_class r JOIN pg_class t ON t.oid = r.reltoastrelid WHERE r.relname ~ '^logged1' +UNION ALL +SELECT 'toast index', ri.relkind, ri.relpersistence FROM pg_class r join pg_class t ON t.oid = r.reltoastrelid JOIN pg_index i ON i.indrelid = t.oid JOIN pg_class ri ON ri.oid = i.indexrelid WHERE r.relname ~ '^logged1' +ORDER BY relname; + relname | relkind | relpersistence +----------------+---------+---------------- + logged1 | r | p + logged1_f1_seq | S | p + logged1_pkey | i | p + toast index | i | p + toast table | t | p +(5 rows) + +CREATE TABLE logged2(f1 SERIAL PRIMARY KEY, f2 INTEGER REFERENCES logged1); -- foreign key +CREATE TABLE logged3(f1 SERIAL PRIMARY KEY, f2 INTEGER REFERENCES logged3); -- 
self-referencing foreign key +ALTER TABLE logged1 SET UNLOGGED; -- fails because a foreign key from a permanent table exists +ERROR: could not change table "logged1" to unlogged because it references logged table "logged2" +ALTER TABLE logged3 SET UNLOGGED; -- skip self-referencing foreign key +ALTER TABLE logged2 SET UNLOGGED; +ALTER TABLE logged1 SET UNLOGGED; +-- check relpersistence of a permanent table after changing to unlogged +SELECT relname, relkind, relpersistence FROM pg_class WHERE relname ~ '^logged1' +UNION ALL +SELECT 'toast table', t.relkind, t.relpersistence FROM pg_class r JOIN pg_class t ON t.oid = r.reltoastrelid WHERE r.relname ~ '^logged1' +UNION ALL +SELECT 'toast index', ri.relkind, ri.relpersistence FROM pg_class r join pg_class t ON t.oid = r.reltoastrelid JOIN pg_index i ON i.indrelid = t.oid JOIN pg_class ri ON ri.oid = i.indexrelid WHERE r.relname ~ '^logged1' +ORDER BY relname; + relname | relkind | relpersistence +----------------+---------+---------------- + logged1 | r | u + logged1_f1_seq | S | p + logged1_pkey | i | u + toast index | i | u + toast table | t | u +(5 rows) + +ALTER TABLE logged1 SET UNLOGGED; -- silently do nothing +DROP TABLE logged3; +DROP TABLE logged2; +DROP TABLE logged1; +-- test ADD COLUMN IF NOT EXISTS +CREATE TABLE test_add_column(c1 integer); +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + +ALTER TABLE test_add_column + ADD COLUMN c2 integer; +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + c2 | integer | | | + +ALTER TABLE test_add_column + ADD COLUMN c2 integer; -- fail because c2 already exists +ERROR: column "c2" of relation "test_add_column" already exists +ALTER TABLE ONLY test_add_column + ADD COLUMN c2 integer; -- fail because c2 already exists 
+ERROR: column "c2" of relation "test_add_column" already exists +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + c2 | integer | | | + +ALTER TABLE test_add_column + ADD COLUMN IF NOT EXISTS c2 integer; -- skipping because c2 already exists +NOTICE: column "c2" of relation "test_add_column" already exists, skipping +ALTER TABLE ONLY test_add_column + ADD COLUMN IF NOT EXISTS c2 integer; -- skipping because c2 already exists +NOTICE: column "c2" of relation "test_add_column" already exists, skipping +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + c2 | integer | | | + +ALTER TABLE test_add_column + ADD COLUMN c2 integer, -- fail because c2 already exists + ADD COLUMN c3 integer primary key; +ERROR: column "c2" of relation "test_add_column" already exists +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + c2 | integer | | | + +ALTER TABLE test_add_column + ADD COLUMN IF NOT EXISTS c2 integer, -- skipping because c2 already exists + ADD COLUMN c3 integer primary key; +NOTICE: column "c2" of relation "test_add_column" already exists, skipping +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + c2 | integer | | | + c3 | integer | | not null | +Indexes: + "test_add_column_pkey" PRIMARY KEY, btree (c3) + +ALTER TABLE test_add_column + ADD COLUMN IF NOT EXISTS c2 integer, -- skipping because c2 already exists + ADD COLUMN IF NOT EXISTS c3 integer primary key; -- skipping because c3 already exists +NOTICE: column "c2" of relation "test_add_column" already exists, skipping 
+NOTICE: column "c3" of relation "test_add_column" already exists, skipping +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + c2 | integer | | | + c3 | integer | | not null | +Indexes: + "test_add_column_pkey" PRIMARY KEY, btree (c3) + +ALTER TABLE test_add_column + ADD COLUMN IF NOT EXISTS c2 integer, -- skipping because c2 already exists + ADD COLUMN IF NOT EXISTS c3 integer, -- skipping because c3 already exists + ADD COLUMN c4 integer REFERENCES test_add_column; +NOTICE: column "c2" of relation "test_add_column" already exists, skipping +NOTICE: column "c3" of relation "test_add_column" already exists, skipping +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + c2 | integer | | | + c3 | integer | | not null | + c4 | integer | | | +Indexes: + "test_add_column_pkey" PRIMARY KEY, btree (c3) +Foreign-key constraints: + "test_add_column_c4_fkey" FOREIGN KEY (c4) REFERENCES test_add_column(c3) +Referenced by: + TABLE "test_add_column" CONSTRAINT "test_add_column_c4_fkey" FOREIGN KEY (c4) REFERENCES test_add_column(c3) + +ALTER TABLE test_add_column + ADD COLUMN IF NOT EXISTS c4 integer REFERENCES test_add_column; +NOTICE: column "c4" of relation "test_add_column" already exists, skipping +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + c2 | integer | | | + c3 | integer | | not null | + c4 | integer | | | +Indexes: + "test_add_column_pkey" PRIMARY KEY, btree (c3) +Foreign-key constraints: + "test_add_column_c4_fkey" FOREIGN KEY (c4) REFERENCES test_add_column(c3) +Referenced by: + TABLE "test_add_column" CONSTRAINT "test_add_column_c4_fkey" FOREIGN KEY (c4) REFERENCES test_add_column(c3) + 
+ALTER TABLE test_add_column + ADD COLUMN IF NOT EXISTS c5 SERIAL CHECK (c5 > 8); +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------------------------------------------- + c1 | integer | | | + c2 | integer | | | + c3 | integer | | not null | + c4 | integer | | | + c5 | integer | | not null | nextval('test_add_column_c5_seq'::regclass) +Indexes: + "test_add_column_pkey" PRIMARY KEY, btree (c3) +Check constraints: + "test_add_column_c5_check" CHECK (c5 > 8) +Foreign-key constraints: + "test_add_column_c4_fkey" FOREIGN KEY (c4) REFERENCES test_add_column(c3) +Referenced by: + TABLE "test_add_column" CONSTRAINT "test_add_column_c4_fkey" FOREIGN KEY (c4) REFERENCES test_add_column(c3) + +ALTER TABLE test_add_column + ADD COLUMN IF NOT EXISTS c5 SERIAL CHECK (c5 > 10); +NOTICE: column "c5" of relation "test_add_column" already exists, skipping +\d test_add_column* + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------------------------------------------- + c1 | integer | | | + c2 | integer | | | + c3 | integer | | not null | + c4 | integer | | | + c5 | integer | | not null | nextval('test_add_column_c5_seq'::regclass) +Indexes: + "test_add_column_pkey" PRIMARY KEY, btree (c3) +Check constraints: + "test_add_column_c5_check" CHECK (c5 > 8) +Foreign-key constraints: + "test_add_column_c4_fkey" FOREIGN KEY (c4) REFERENCES test_add_column(c3) +Referenced by: + TABLE "test_add_column" CONSTRAINT "test_add_column_c4_fkey" FOREIGN KEY (c4) REFERENCES test_add_column(c3) + + Sequence "public.test_add_column_c5_seq" + Type | Start | Minimum | Maximum | Increment | Cycles? | Cache +---------+-------+---------+------------+-----------+---------+------- + integer | 1 | 1 | 2147483647 | 1 | no | 1 +Owned by: public.test_add_column.c5 + + Index "public.test_add_column_pkey" + Column | Type | Key? 
| Definition +--------+---------+------+------------ + c3 | integer | yes | c3 +primary key, btree, for table "public.test_add_column" + +DROP TABLE test_add_column; +\d test_add_column* +-- assorted cases with multiple ALTER TABLE steps +CREATE TABLE ataddindex(f1 INT); +INSERT INTO ataddindex VALUES (42), (43); +CREATE UNIQUE INDEX ataddindexi0 ON ataddindex(f1); +ALTER TABLE ataddindex + ADD PRIMARY KEY USING INDEX ataddindexi0, + ALTER f1 TYPE BIGINT; +\d ataddindex + Table "public.ataddindex" + Column | Type | Collation | Nullable | Default +--------+--------+-----------+----------+--------- + f1 | bigint | | not null | +Indexes: + "ataddindexi0" PRIMARY KEY, btree (f1) + +DROP TABLE ataddindex; +CREATE TABLE ataddindex(f1 VARCHAR(10)); +INSERT INTO ataddindex(f1) VALUES ('foo'), ('a'); +ALTER TABLE ataddindex + ALTER f1 SET DATA TYPE TEXT, + ADD EXCLUDE ((f1 LIKE 'a') WITH =); +\d ataddindex + Table "public.ataddindex" + Column | Type | Collation | Nullable | Default +--------+------+-----------+----------+--------- + f1 | text | | | +Indexes: + "ataddindex_expr_excl" EXCLUDE USING btree ((f1 ~~ 'a'::text) WITH =) + +DROP TABLE ataddindex; +CREATE TABLE ataddindex(id int, ref_id int); +ALTER TABLE ataddindex + ADD PRIMARY KEY (id), + ADD FOREIGN KEY (ref_id) REFERENCES ataddindex; +\d ataddindex + Table "public.ataddindex" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + id | integer | | not null | + ref_id | integer | | | +Indexes: + "ataddindex_pkey" PRIMARY KEY, btree (id) +Foreign-key constraints: + "ataddindex_ref_id_fkey" FOREIGN KEY (ref_id) REFERENCES ataddindex(id) +Referenced by: + TABLE "ataddindex" CONSTRAINT "ataddindex_ref_id_fkey" FOREIGN KEY (ref_id) REFERENCES ataddindex(id) + +DROP TABLE ataddindex; +CREATE TABLE ataddindex(id int, ref_id int); +ALTER TABLE ataddindex + ADD UNIQUE (id), + ADD FOREIGN KEY (ref_id) REFERENCES ataddindex (id); +\d ataddindex + Table "public.ataddindex" + 
Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + id | integer | | | + ref_id | integer | | | +Indexes: + "ataddindex_id_key" UNIQUE CONSTRAINT, btree (id) +Foreign-key constraints: + "ataddindex_ref_id_fkey" FOREIGN KEY (ref_id) REFERENCES ataddindex(id) +Referenced by: + TABLE "ataddindex" CONSTRAINT "ataddindex_ref_id_fkey" FOREIGN KEY (ref_id) REFERENCES ataddindex(id) + +DROP TABLE ataddindex; +-- unsupported constraint types for partitioned tables +CREATE TABLE partitioned ( + a int, + b int +) PARTITION BY RANGE (a, (a+b+1)); +ALTER TABLE partitioned ADD EXCLUDE USING gist (a WITH &&); +ERROR: exclusion constraints are not supported on partitioned tables +LINE 1: ALTER TABLE partitioned ADD EXCLUDE USING gist (a WITH &&); + ^ +-- cannot drop column that is part of the partition key +ALTER TABLE partitioned DROP COLUMN a; +ERROR: cannot drop column "a" because it is part of the partition key of relation "partitioned" +ALTER TABLE partitioned ALTER COLUMN a TYPE char(5); +ERROR: cannot alter column "a" because it is part of the partition key of relation "partitioned" +ALTER TABLE partitioned DROP COLUMN b; +ERROR: cannot drop column "b" because it is part of the partition key of relation "partitioned" +ALTER TABLE partitioned ALTER COLUMN b TYPE char(5); +ERROR: cannot alter column "b" because it is part of the partition key of relation "partitioned" +-- partitioned table cannot participate in regular inheritance +CREATE TABLE nonpartitioned ( + a int, + b int +); +ALTER TABLE partitioned INHERIT nonpartitioned; +ERROR: cannot change inheritance of partitioned table +ALTER TABLE nonpartitioned INHERIT partitioned; +ERROR: cannot inherit from partitioned table "partitioned" +-- cannot add NO INHERIT constraint to partitioned tables +ALTER TABLE partitioned ADD CONSTRAINT chk_a CHECK (a > 0) NO INHERIT; +ERROR: cannot add NO INHERIT constraint to partitioned table "partitioned" +DROP TABLE partitioned, 
nonpartitioned; +-- +-- ATTACH PARTITION +-- +-- check that target table is partitioned +CREATE TABLE unparted ( + a int +); +CREATE TABLE fail_part (like unparted); +ALTER TABLE unparted ATTACH PARTITION fail_part FOR VALUES IN ('a'); +ERROR: table "unparted" is not partitioned +DROP TABLE unparted, fail_part; +-- check that partition bound is compatible +CREATE TABLE list_parted ( + a int NOT NULL, + b char(2) COLLATE "C", + CONSTRAINT check_a CHECK (a > 0) +) PARTITION BY LIST (a); +CREATE TABLE fail_part (LIKE list_parted); +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES FROM (1) TO (10); +ERROR: invalid bound specification for a list partition +LINE 1: ...list_parted ATTACH PARTITION fail_part FOR VALUES FROM (1) T... + ^ +DROP TABLE fail_part; +-- check that the table being attached exists +ALTER TABLE list_parted ATTACH PARTITION nonexistent FOR VALUES IN (1); +ERROR: relation "nonexistent" does not exist +-- check ownership of the source table +CREATE ROLE regress_test_me; +CREATE ROLE regress_test_not_me; +CREATE TABLE not_owned_by_me (LIKE list_parted); +ALTER TABLE not_owned_by_me OWNER TO regress_test_not_me; +SET SESSION AUTHORIZATION regress_test_me; +CREATE TABLE owned_by_me ( + a int +) PARTITION BY LIST (a); +ALTER TABLE owned_by_me ATTACH PARTITION not_owned_by_me FOR VALUES IN (1); +ERROR: must be owner of table not_owned_by_me +RESET SESSION AUTHORIZATION; +DROP TABLE owned_by_me, not_owned_by_me; +DROP ROLE regress_test_not_me; +DROP ROLE regress_test_me; +-- check that the table being attached is not part of regular inheritance +CREATE TABLE parent (LIKE list_parted); +CREATE TABLE child () INHERITS (parent); +ALTER TABLE list_parted ATTACH PARTITION child FOR VALUES IN (1); +ERROR: cannot attach inheritance child as partition +ALTER TABLE list_parted ATTACH PARTITION parent FOR VALUES IN (1); +ERROR: cannot attach inheritance parent as partition +DROP TABLE parent CASCADE; +NOTICE: drop cascades to table child +-- check any 
TEMP-ness +CREATE TEMP TABLE temp_parted (a int) PARTITION BY LIST (a); +CREATE TABLE perm_part (a int); +ALTER TABLE temp_parted ATTACH PARTITION perm_part FOR VALUES IN (1); +ERROR: cannot attach a permanent relation as partition of temporary relation "temp_parted" +DROP TABLE temp_parted, perm_part; +-- check that the table being attached is not a typed table +CREATE TYPE mytype AS (a int); +CREATE TABLE fail_part OF mytype; +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); +ERROR: cannot attach a typed table as partition +DROP TYPE mytype CASCADE; +NOTICE: drop cascades to table fail_part +-- check that the table being attached has only columns present in the parent +CREATE TABLE fail_part (like list_parted, c int); +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); +ERROR: table "fail_part" contains column "c" not found in parent "list_parted" +DETAIL: The new partition may contain only the columns present in parent. +DROP TABLE fail_part; +-- check that the table being attached has every column of the parent +CREATE TABLE fail_part (a int NOT NULL); +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); +ERROR: child table is missing column "b" +DROP TABLE fail_part; +-- check that columns match in type, collation and NOT NULL status +CREATE TABLE fail_part ( + b char(3), + a int NOT NULL +); +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); +ERROR: child table "fail_part" has different type for column "b" +ALTER TABLE fail_part ALTER b TYPE char (2) COLLATE "POSIX"; +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); +ERROR: child table "fail_part" has different collation for column "b" +DROP TABLE fail_part; +-- check that the table being attached has all constraints of the parent +CREATE TABLE fail_part ( + b char(2) COLLATE "C", + a int NOT NULL +); +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); +ERROR: child table is missing constraint 
"check_a" +-- check that the constraint matches in definition with parent's constraint +ALTER TABLE fail_part ADD CONSTRAINT check_a CHECK (a >= 0); +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); +ERROR: child table "fail_part" has different definition for check constraint "check_a" +DROP TABLE fail_part; +-- check the attributes and constraints after partition is attached +CREATE TABLE part_1 ( + a int NOT NULL, + b char(2) COLLATE "C", + CONSTRAINT check_a CHECK (a > 0) +); +ALTER TABLE list_parted ATTACH PARTITION part_1 FOR VALUES IN (1); +-- attislocal and conislocal are always false for merged attributes and constraints respectively. +SELECT attislocal, attinhcount FROM pg_attribute WHERE attrelid = 'part_1'::regclass AND attnum > 0; + attislocal | attinhcount +------------+------------- + f | 1 + f | 1 +(2 rows) + +SELECT conislocal, coninhcount FROM pg_constraint WHERE conrelid = 'part_1'::regclass AND conname = 'check_a'; + conislocal | coninhcount +------------+------------- + f | 1 +(1 row) + +-- check that the new partition won't overlap with an existing partition +CREATE TABLE fail_part (LIKE part_1 INCLUDING CONSTRAINTS); +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); +ERROR: partition "fail_part" would overlap partition "part_1" +LINE 1: ...LE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); + ^ +DROP TABLE fail_part; +-- check that an existing table can be attached as a default partition +CREATE TABLE def_part (LIKE list_parted INCLUDING CONSTRAINTS); +ALTER TABLE list_parted ATTACH PARTITION def_part DEFAULT; +-- check attaching default partition fails if a default partition already +-- exists +CREATE TABLE fail_def_part (LIKE part_1 INCLUDING CONSTRAINTS); +ALTER TABLE list_parted ATTACH PARTITION fail_def_part DEFAULT; +ERROR: partition "fail_def_part" conflicts with existing default partition "def_part" +LINE 1: ...ER TABLE list_parted ATTACH PARTITION fail_def_part DEFAULT; + ^ +-- check 
validation when attaching list partitions +CREATE TABLE list_parted2 ( + a int, + b char +) PARTITION BY LIST (a); +-- check that violating rows are correctly reported +CREATE TABLE part_2 (LIKE list_parted2); +INSERT INTO part_2 VALUES (3, 'a'); +ALTER TABLE list_parted2 ATTACH PARTITION part_2 FOR VALUES IN (2); +ERROR: partition constraint of relation "part_2" is violated by some row +-- should be ok after deleting the bad row +DELETE FROM part_2; +ALTER TABLE list_parted2 ATTACH PARTITION part_2 FOR VALUES IN (2); +-- check partition cannot be attached if default has some row for its values +CREATE TABLE list_parted2_def PARTITION OF list_parted2 DEFAULT; +INSERT INTO list_parted2_def VALUES (11, 'z'); +CREATE TABLE part_3 (LIKE list_parted2); +ALTER TABLE list_parted2 ATTACH PARTITION part_3 FOR VALUES IN (11); +ERROR: updated partition constraint for default partition "list_parted2_def" would be violated by some row +-- should be ok after deleting the bad row +DELETE FROM list_parted2_def WHERE a = 11; +ALTER TABLE list_parted2 ATTACH PARTITION part_3 FOR VALUES IN (11); +-- adding constraints that describe the desired partition constraint +-- (or more restrictive) will help skip the validation scan +CREATE TABLE part_3_4 ( + LIKE list_parted2, + CONSTRAINT check_a CHECK (a IN (3)) +); +-- however, if a list partition does not accept nulls, there should be +-- an explicit NOT NULL constraint on the partition key column for the +-- validation scan to be skipped; +ALTER TABLE list_parted2 ATTACH PARTITION part_3_4 FOR VALUES IN (3, 4); +-- adding a NOT NULL constraint will cause the scan to be skipped +ALTER TABLE list_parted2 DETACH PARTITION part_3_4; +ALTER TABLE part_3_4 ALTER a SET NOT NULL; +ALTER TABLE list_parted2 ATTACH PARTITION part_3_4 FOR VALUES IN (3, 4); +-- check if default partition scan skipped +ALTER TABLE list_parted2_def ADD CONSTRAINT check_a CHECK (a IN (5, 6)); +CREATE TABLE part_55_66 PARTITION OF list_parted2 FOR VALUES IN (55, 66); 
+-- check validation when attaching range partitions +CREATE TABLE range_parted ( + a int, + b int +) PARTITION BY RANGE (a, b); +-- check that violating rows are correctly reported +CREATE TABLE part1 ( + a int NOT NULL CHECK (a = 1), + b int NOT NULL CHECK (b >= 1 AND b <= 10) +); +INSERT INTO part1 VALUES (1, 10); +-- Remember the TO bound is exclusive +ALTER TABLE range_parted ATTACH PARTITION part1 FOR VALUES FROM (1, 1) TO (1, 10); +ERROR: partition constraint of relation "part1" is violated by some row +-- should be ok after deleting the bad row +DELETE FROM part1; +ALTER TABLE range_parted ATTACH PARTITION part1 FOR VALUES FROM (1, 1) TO (1, 10); +-- adding constraints that describe the desired partition constraint +-- (or more restrictive) will help skip the validation scan +CREATE TABLE part2 ( + a int NOT NULL CHECK (a = 1), + b int NOT NULL CHECK (b >= 10 AND b < 18) +); +ALTER TABLE range_parted ATTACH PARTITION part2 FOR VALUES FROM (1, 10) TO (1, 20); +-- Create default partition +CREATE TABLE partr_def1 PARTITION OF range_parted DEFAULT; +-- Only one default partition is allowed, hence, following should give error +CREATE TABLE partr_def2 (LIKE part1 INCLUDING CONSTRAINTS); +ALTER TABLE range_parted ATTACH PARTITION partr_def2 DEFAULT; +ERROR: partition "partr_def2" conflicts with existing default partition "partr_def1" +LINE 1: ...LTER TABLE range_parted ATTACH PARTITION partr_def2 DEFAULT; + ^ +-- Overlapping partitions cannot be attached, hence, following should give error +INSERT INTO partr_def1 VALUES (2, 10); +CREATE TABLE part3 (LIKE range_parted); +ALTER TABLE range_parted ATTACH partition part3 FOR VALUES FROM (2, 10) TO (2, 20); +ERROR: updated partition constraint for default partition "partr_def1" would be violated by some row +-- Attaching partitions should be successful when there are no overlapping rows +ALTER TABLE range_parted ATTACH partition part3 FOR VALUES FROM (3, 10) TO (3, 20); +-- check that leaf partitions are scanned when 
attaching a partitioned +-- table +CREATE TABLE part_5 ( + LIKE list_parted2 +) PARTITION BY LIST (b); +-- check that violating rows are correctly reported +CREATE TABLE part_5_a PARTITION OF part_5 FOR VALUES IN ('a'); +INSERT INTO part_5_a (a, b) VALUES (6, 'a'); +ALTER TABLE list_parted2 ATTACH PARTITION part_5 FOR VALUES IN (5); +ERROR: partition constraint of relation "part_5_a" is violated by some row +-- delete the faulting row and also add a constraint to skip the scan +DELETE FROM part_5_a WHERE a NOT IN (3); +ALTER TABLE part_5 ADD CONSTRAINT check_a CHECK (a IS NOT NULL AND a = 5); +ALTER TABLE list_parted2 ATTACH PARTITION part_5 FOR VALUES IN (5); +ALTER TABLE list_parted2 DETACH PARTITION part_5; +ALTER TABLE part_5 DROP CONSTRAINT check_a; +-- scan should again be skipped, even though NOT NULL is now a column property +ALTER TABLE part_5 ADD CONSTRAINT check_a CHECK (a IN (5)), ALTER a SET NOT NULL; +ALTER TABLE list_parted2 ATTACH PARTITION part_5 FOR VALUES IN (5); +-- Check the case where attnos of the partitioning columns in the table being +-- attached differs from the parent. It should not affect the constraint- +-- checking logic that allows to skip the scan. +CREATE TABLE part_6 ( + c int, + LIKE list_parted2, + CONSTRAINT check_a CHECK (a IS NOT NULL AND a = 6) +); +ALTER TABLE part_6 DROP c; +ALTER TABLE list_parted2 ATTACH PARTITION part_6 FOR VALUES IN (6); +-- Similar to above, but the table being attached is a partitioned table +-- whose partition has still different attnos for the root partitioning +-- columns. 
+CREATE TABLE part_7 ( + LIKE list_parted2, + CONSTRAINT check_a CHECK (a IS NOT NULL AND a = 7) +) PARTITION BY LIST (b); +CREATE TABLE part_7_a_null ( + c int, + d int, + e int, + LIKE list_parted2, -- 'a' will have attnum = 4 + CONSTRAINT check_b CHECK (b IS NULL OR b = 'a'), + CONSTRAINT check_a CHECK (a IS NOT NULL AND a = 7) +); +ALTER TABLE part_7_a_null DROP c, DROP d, DROP e; +ALTER TABLE part_7 ATTACH PARTITION part_7_a_null FOR VALUES IN ('a', null); +ALTER TABLE list_parted2 ATTACH PARTITION part_7 FOR VALUES IN (7); +-- Same example, but check this time that the constraint correctly detects +-- violating rows +ALTER TABLE list_parted2 DETACH PARTITION part_7; +ALTER TABLE part_7 DROP CONSTRAINT check_a; -- thusly, scan won't be skipped +INSERT INTO part_7 (a, b) VALUES (8, null), (9, 'a'); +SELECT tableoid::regclass, a, b FROM part_7 order by a; + tableoid | a | b +---------------+---+--- + part_7_a_null | 8 | + part_7_a_null | 9 | a +(2 rows) + +ALTER TABLE list_parted2 ATTACH PARTITION part_7 FOR VALUES IN (7); +ERROR: partition constraint of relation "part_7_a_null" is violated by some row +-- check that leaf partitions of default partition are scanned when +-- attaching a partitioned table. 
+ALTER TABLE part_5 DROP CONSTRAINT check_a; +CREATE TABLE part5_def PARTITION OF part_5 DEFAULT PARTITION BY LIST(a); +CREATE TABLE part5_def_p1 PARTITION OF part5_def FOR VALUES IN (5); +INSERT INTO part5_def_p1 VALUES (5, 'y'); +CREATE TABLE part5_p1 (LIKE part_5); +ALTER TABLE part_5 ATTACH PARTITION part5_p1 FOR VALUES IN ('y'); +ERROR: updated partition constraint for default partition "part5_def_p1" would be violated by some row +-- should be ok after deleting the bad row +DELETE FROM part5_def_p1 WHERE b = 'y'; +ALTER TABLE part_5 ATTACH PARTITION part5_p1 FOR VALUES IN ('y'); +-- check that the table being attached is not already a partition +ALTER TABLE list_parted2 ATTACH PARTITION part_2 FOR VALUES IN (2); +ERROR: "part_2" is already a partition +-- check that circular inheritance is not allowed +ALTER TABLE part_5 ATTACH PARTITION list_parted2 FOR VALUES IN ('b'); +ERROR: circular inheritance not allowed +DETAIL: "part_5" is already a child of "list_parted2". +ALTER TABLE list_parted2 ATTACH PARTITION list_parted2 FOR VALUES IN (0); +ERROR: circular inheritance not allowed +DETAIL: "list_parted2" is already a child of "list_parted2". +-- If a partitioned table being created or an existing table being attached +-- as a partition does not have a constraint that would allow validation scan +-- to be skipped, but an individual partition does, then the partition's +-- validation scan is skipped. +CREATE TABLE quuux (a int, b text) PARTITION BY LIST (a); +CREATE TABLE quuux_default PARTITION OF quuux DEFAULT PARTITION BY LIST (b); +CREATE TABLE quuux_default1 PARTITION OF quuux_default ( + CONSTRAINT check_1 CHECK (a IS NOT NULL AND a = 1) +) FOR VALUES IN ('b'); +CREATE TABLE quuux1 (a int, b text); +ALTER TABLE quuux ATTACH PARTITION quuux1 FOR VALUES IN (1); -- validate! 
+CREATE TABLE quuux2 (a int, b text); +ALTER TABLE quuux ATTACH PARTITION quuux2 FOR VALUES IN (2); -- skip validation +DROP TABLE quuux1, quuux2; +-- should validate for quuux1, but not for quuux2 +CREATE TABLE quuux1 PARTITION OF quuux FOR VALUES IN (1); +CREATE TABLE quuux2 PARTITION OF quuux FOR VALUES IN (2); +DROP TABLE quuux; +-- check validation when attaching hash partitions +-- Use hand-rolled hash functions and operator class to get predictable result +-- on different machines. part_test_int4_ops is defined in insert.sql. +-- check that the new partition won't overlap with an existing partition +CREATE TABLE hash_parted ( + a int, + b int +) PARTITION BY HASH (a part_test_int4_ops); +CREATE TABLE hpart_1 PARTITION OF hash_parted FOR VALUES WITH (MODULUS 4, REMAINDER 0); +CREATE TABLE fail_part (LIKE hpart_1); +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 8, REMAINDER 4); +ERROR: partition "fail_part" would overlap partition "hpart_1" +LINE 1: ...hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODU... + ^ +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 8, REMAINDER 0); +ERROR: partition "fail_part" would overlap partition "hpart_1" +LINE 1: ...hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODU... 
+ ^ +DROP TABLE fail_part; +-- check validation when attaching hash partitions +-- check that violating rows are correctly reported +CREATE TABLE hpart_2 (LIKE hash_parted); +INSERT INTO hpart_2 VALUES (3, 0); +ALTER TABLE hash_parted ATTACH PARTITION hpart_2 FOR VALUES WITH (MODULUS 4, REMAINDER 1); +ERROR: partition constraint of relation "hpart_2" is violated by some row +-- should be ok after deleting the bad row +DELETE FROM hpart_2; +ALTER TABLE hash_parted ATTACH PARTITION hpart_2 FOR VALUES WITH (MODULUS 4, REMAINDER 1); +-- check that leaf partitions are scanned when attaching a partitioned +-- table +CREATE TABLE hpart_5 ( + LIKE hash_parted +) PARTITION BY LIST (b); +-- check that violating rows are correctly reported +CREATE TABLE hpart_5_a PARTITION OF hpart_5 FOR VALUES IN ('1', '2', '3'); +INSERT INTO hpart_5_a (a, b) VALUES (7, 1); +ALTER TABLE hash_parted ATTACH PARTITION hpart_5 FOR VALUES WITH (MODULUS 4, REMAINDER 2); +ERROR: partition constraint of relation "hpart_5_a" is violated by some row +-- should be ok after deleting the bad row +DELETE FROM hpart_5_a; +ALTER TABLE hash_parted ATTACH PARTITION hpart_5 FOR VALUES WITH (MODULUS 4, REMAINDER 2); +-- check that the table being attach is with valid modulus and remainder value +CREATE TABLE fail_part(LIKE hash_parted); +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 0, REMAINDER 1); +ERROR: modulus for hash partition must be a positive integer +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 8, REMAINDER 8); +ERROR: remainder for hash partition must be less than modulus +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 3, REMAINDER 2); +ERROR: every hash partition modulus must be a factor of the next larger modulus +DETAIL: The new modulus 3 is not a factor of 4, the modulus of existing partition "hpart_1". 
+DROP TABLE fail_part; +-- +-- DETACH PARTITION +-- +-- check that the table is partitioned at all +CREATE TABLE regular_table (a int); +ALTER TABLE regular_table DETACH PARTITION any_name; +ERROR: table "regular_table" is not partitioned +DROP TABLE regular_table; +-- check that the partition being detached exists at all +ALTER TABLE list_parted2 DETACH PARTITION part_4; +ERROR: relation "part_4" does not exist +ALTER TABLE hash_parted DETACH PARTITION hpart_4; +ERROR: relation "hpart_4" does not exist +-- check that the partition being detached is actually a partition of the parent +CREATE TABLE not_a_part (a int); +ALTER TABLE list_parted2 DETACH PARTITION not_a_part; +ERROR: relation "not_a_part" is not a partition of relation "list_parted2" +ALTER TABLE list_parted2 DETACH PARTITION part_1; +ERROR: relation "part_1" is not a partition of relation "list_parted2" +ALTER TABLE hash_parted DETACH PARTITION not_a_part; +ERROR: relation "not_a_part" is not a partition of relation "hash_parted" +DROP TABLE not_a_part; +-- check that, after being detached, attinhcount/coninhcount is dropped to 0 and +-- attislocal/conislocal is set to true +ALTER TABLE list_parted2 DETACH PARTITION part_3_4; +SELECT attinhcount, attislocal FROM pg_attribute WHERE attrelid = 'part_3_4'::regclass AND attnum > 0; + attinhcount | attislocal +-------------+------------ + 0 | t + 0 | t +(2 rows) + +SELECT coninhcount, conislocal FROM pg_constraint WHERE conrelid = 'part_3_4'::regclass AND conname = 'check_a'; + coninhcount | conislocal +-------------+------------ + 0 | t +(1 row) + +DROP TABLE part_3_4; +-- check that a detached partition is not dropped on dropping a partitioned table +CREATE TABLE range_parted2 ( + a int +) PARTITION BY RANGE(a); +CREATE TABLE part_rp PARTITION OF range_parted2 FOR VALUES FROM (0) to (100); +ALTER TABLE range_parted2 DETACH PARTITION part_rp; +DROP TABLE range_parted2; +SELECT * from part_rp; + a +--- +(0 rows) + +DROP TABLE part_rp; +-- concurrent detach 
+CREATE TABLE range_parted2 ( + a int +) PARTITION BY RANGE(a); +CREATE TABLE part_rp PARTITION OF range_parted2 FOR VALUES FROM (0) to (100); +BEGIN; +-- doesn't work in a partition block +ALTER TABLE range_parted2 DETACH PARTITION part_rp CONCURRENTLY; +ERROR: ALTER TABLE ... DETACH CONCURRENTLY cannot run inside a transaction block +COMMIT; +CREATE TABLE part_rpd PARTITION OF range_parted2 DEFAULT; +-- doesn't work if there's a default partition +ALTER TABLE range_parted2 DETACH PARTITION part_rp CONCURRENTLY; +ERROR: cannot detach partitions concurrently when a default partition exists +-- doesn't work for the default partition +ALTER TABLE range_parted2 DETACH PARTITION part_rpd CONCURRENTLY; +ERROR: cannot detach partitions concurrently when a default partition exists +DROP TABLE part_rpd; +-- works fine +ALTER TABLE range_parted2 DETACH PARTITION part_rp CONCURRENTLY; +\d+ range_parted2 + Partitioned table "public.range_parted2" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | integer | | | | plain | | +Partition key: RANGE (a) +Number of partitions: 0 + +-- constraint should be created +\d part_rp + Table "public.part_rp" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | +Check constraints: + "part_rp_a_check" CHECK (a IS NOT NULL AND a >= 0 AND a < 100) + +CREATE TABLE part_rp100 PARTITION OF range_parted2 (CHECK (a>=123 AND a<133 AND a IS NOT NULL)) FOR VALUES FROM (100) to (200); +ALTER TABLE range_parted2 DETACH PARTITION part_rp100 CONCURRENTLY; +-- redundant constraint should not be created +\d part_rp100 + Table "public.part_rp100" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | +Check constraints: + "part_rp100_a_check" CHECK (a >= 123 AND a < 133 AND a IS NOT NULL) + 
+DROP TABLE range_parted2; +-- Check ALTER TABLE commands for partitioned tables and partitions +-- cannot add/drop column to/from *only* the parent +ALTER TABLE ONLY list_parted2 ADD COLUMN c int; +ERROR: column must be added to child tables too +ALTER TABLE ONLY list_parted2 DROP COLUMN b; +ERROR: cannot drop column from only the partitioned table when partitions exist +HINT: Do not specify the ONLY keyword. +-- cannot add a column to partition or drop an inherited one +ALTER TABLE part_2 ADD COLUMN c text; +ERROR: cannot add column to a partition +ALTER TABLE part_2 DROP COLUMN b; +ERROR: cannot drop inherited column "b" +-- Nor rename, alter type +ALTER TABLE part_2 RENAME COLUMN b to c; +ERROR: cannot rename inherited column "b" +ALTER TABLE part_2 ALTER COLUMN b TYPE text; +ERROR: cannot alter inherited column "b" +-- cannot add/drop NOT NULL or check constraints to *only* the parent, when +-- partitions exist +ALTER TABLE ONLY list_parted2 ALTER b SET NOT NULL; +ERROR: constraint must be added to child tables too +DETAIL: Column "b" of relation "part_2" is not already NOT NULL. +HINT: Do not specify the ONLY keyword. +ALTER TABLE ONLY list_parted2 ADD CONSTRAINT check_b CHECK (b <> 'zz'); +ERROR: constraint must be added to child tables too +ALTER TABLE list_parted2 ALTER b SET NOT NULL; +ALTER TABLE ONLY list_parted2 ALTER b DROP NOT NULL; +ERROR: cannot remove constraint from only the partitioned table when partitions exist +HINT: Do not specify the ONLY keyword. +ALTER TABLE list_parted2 ADD CONSTRAINT check_b CHECK (b <> 'zz'); +ALTER TABLE ONLY list_parted2 DROP CONSTRAINT check_b; +ERROR: cannot remove constraint from only the partitioned table when partitions exist +HINT: Do not specify the ONLY keyword. 
+-- It's alright though, if no partitions are yet created +CREATE TABLE parted_no_parts (a int) PARTITION BY LIST (a); +ALTER TABLE ONLY parted_no_parts ALTER a SET NOT NULL; +ALTER TABLE ONLY parted_no_parts ADD CONSTRAINT check_a CHECK (a > 0); +ALTER TABLE ONLY parted_no_parts ALTER a DROP NOT NULL; +ALTER TABLE ONLY parted_no_parts DROP CONSTRAINT check_a; +DROP TABLE parted_no_parts; +-- cannot drop inherited NOT NULL or check constraints from partition +ALTER TABLE list_parted2 ALTER b SET NOT NULL, ADD CONSTRAINT check_a2 CHECK (a > 0); +ALTER TABLE part_2 ALTER b DROP NOT NULL; +ERROR: column "b" is marked NOT NULL in parent table +ALTER TABLE part_2 DROP CONSTRAINT check_a2; +ERROR: cannot drop inherited constraint "check_a2" of relation "part_2" +-- Doesn't make sense to add NO INHERIT constraints on partitioned tables +ALTER TABLE list_parted2 add constraint check_b2 check (b <> 'zz') NO INHERIT; +ERROR: cannot add NO INHERIT constraint to partitioned table "list_parted2" +-- check that a partition cannot participate in regular inheritance +CREATE TABLE inh_test () INHERITS (part_2); +ERROR: cannot inherit from partition "part_2" +CREATE TABLE inh_test (LIKE part_2); +ALTER TABLE inh_test INHERIT part_2; +ERROR: cannot inherit from a partition +ALTER TABLE part_2 INHERIT inh_test; +ERROR: cannot change inheritance of a partition +-- cannot drop or alter type of partition key columns of lower level +-- partitioned tables; for example, part_5, which is list_parted2's +-- partition, is partitioned on b; +ALTER TABLE list_parted2 DROP COLUMN b; +ERROR: cannot drop column "b" because it is part of the partition key of relation "part_5" +ALTER TABLE list_parted2 ALTER COLUMN b TYPE text; +ERROR: cannot alter column "b" because it is part of the partition key of relation "part_5" +-- dropping non-partition key columns should be allowed on the parent table. 
+ALTER TABLE list_parted DROP COLUMN b; +SELECT * FROM list_parted; + a +--- +(0 rows) + +-- cleanup +DROP TABLE list_parted, list_parted2, range_parted; +DROP TABLE fail_def_part; +DROP TABLE hash_parted; +-- more tests for certain multi-level partitioning scenarios +create table p (a int, b int) partition by range (a, b); +create table p1 (b int, a int not null) partition by range (b); +create table p11 (like p1); +alter table p11 drop a; +alter table p11 add a int; +alter table p11 drop a; +alter table p11 add a int not null; +-- attnum for key attribute 'a' is different in p, p1, and p11 +select attrelid::regclass, attname, attnum +from pg_attribute +where attname = 'a' + and (attrelid = 'p'::regclass + or attrelid = 'p1'::regclass + or attrelid = 'p11'::regclass) +order by attrelid::regclass::text; + attrelid | attname | attnum +----------+---------+-------- + p | a | 1 + p1 | a | 2 + p11 | a | 4 +(3 rows) + +alter table p1 attach partition p11 for values from (2) to (5); +insert into p1 (a, b) values (2, 3); +-- check that partition validation scan correctly detects violating rows +alter table p attach partition p1 for values from (1, 2) to (1, 10); +ERROR: partition constraint of relation "p11" is violated by some row +-- cleanup +drop table p; +drop table p1; +-- validate constraint on partitioned tables should only scan leaf partitions +create table parted_validate_test (a int) partition by list (a); +create table parted_validate_test_1 partition of parted_validate_test for values in (0, 1); +alter table parted_validate_test add constraint parted_validate_test_chka check (a > 0) not valid; +alter table parted_validate_test validate constraint parted_validate_test_chka; +drop table parted_validate_test; +-- test alter column options +CREATE TABLE attmp(i integer); +INSERT INTO attmp VALUES (1); +ALTER TABLE attmp ALTER COLUMN i SET (n_distinct = 1, n_distinct_inherited = 2); +ALTER TABLE attmp ALTER COLUMN i RESET (n_distinct_inherited); +ANALYZE attmp; 
+DROP TABLE attmp; +DROP USER regress_alter_table_user1; +-- check that violating rows are correctly reported when attaching as the +-- default partition +create table defpart_attach_test (a int) partition by list (a); +create table defpart_attach_test1 partition of defpart_attach_test for values in (1); +create table defpart_attach_test_d (b int, a int); +alter table defpart_attach_test_d drop b; +insert into defpart_attach_test_d values (1), (2); +-- error because its constraint as the default partition would be violated +-- by the row containing 1 +alter table defpart_attach_test attach partition defpart_attach_test_d default; +ERROR: partition constraint of relation "defpart_attach_test_d" is violated by some row +delete from defpart_attach_test_d where a = 1; +alter table defpart_attach_test_d add check (a > 1); +-- should be attached successfully and without needing to be scanned +alter table defpart_attach_test attach partition defpart_attach_test_d default; +-- check that attaching a partition correctly reports any rows in the default +-- partition that should not be there for the new partition to be attached +-- successfully +create table defpart_attach_test_2 (like defpart_attach_test_d); +alter table defpart_attach_test attach partition defpart_attach_test_2 for values in (2); +ERROR: updated partition constraint for default partition "defpart_attach_test_d" would be violated by some row +drop table defpart_attach_test; +-- check combinations of temporary and permanent relations when attaching +-- partitions. 
+create table perm_part_parent (a int) partition by list (a); +create temp table temp_part_parent (a int) partition by list (a); +create table perm_part_child (a int); +create temp table temp_part_child (a int); +alter table temp_part_parent attach partition perm_part_child default; -- error +ERROR: cannot attach a permanent relation as partition of temporary relation "temp_part_parent" +alter table perm_part_parent attach partition temp_part_child default; -- error +ERROR: cannot attach a temporary relation as partition of permanent relation "perm_part_parent" +alter table temp_part_parent attach partition temp_part_child default; -- ok +drop table perm_part_parent cascade; +drop table temp_part_parent cascade; +-- check that attaching partitions to a table while it is being used is +-- prevented +create table tab_part_attach (a int) partition by list (a); +create or replace function func_part_attach() returns trigger + language plpgsql as $$ + begin + execute 'create table tab_part_attach_1 (a int)'; + execute 'alter table tab_part_attach attach partition tab_part_attach_1 for values in (1)'; + return null; + end $$; +create trigger trig_part_attach before insert on tab_part_attach + for each statement execute procedure func_part_attach(); +insert into tab_part_attach values (1); +ERROR: cannot ALTER TABLE "tab_part_attach" because it is being used by active queries in this session +CONTEXT: SQL statement "alter table tab_part_attach attach partition tab_part_attach_1 for values in (1)" +PL/pgSQL function func_part_attach() line 4 at EXECUTE +drop table tab_part_attach; +drop function func_part_attach(); +-- test case where the partitioning operator is a SQL function whose +-- evaluation results in the table's relcache being rebuilt partway through +-- the execution of an ATTACH PARTITION command +create function at_test_sql_partop (int4, int4) returns int language sql +as $$ select case when $1 = $2 then 0 when $1 > $2 then 1 else -1 end; $$; +create operator 
class at_test_sql_partop for type int4 using btree as + operator 1 < (int4, int4), operator 2 <= (int4, int4), + operator 3 = (int4, int4), operator 4 >= (int4, int4), + operator 5 > (int4, int4), function 1 at_test_sql_partop(int4, int4); +create table at_test_sql_partop (a int) partition by range (a at_test_sql_partop); +create table at_test_sql_partop_1 (a int); +alter table at_test_sql_partop attach partition at_test_sql_partop_1 for values from (0) to (10); +drop table at_test_sql_partop; +drop operator class at_test_sql_partop using btree; +drop function at_test_sql_partop; +/* Test case for bug #16242 */ +-- We create a parent and child where the child has missing +-- non-null attribute values, and arrange to pass them through +-- tuple conversion from the child to the parent tupdesc +create table bar1 (a integer, b integer not null default 1) + partition by range (a); +create table bar2 (a integer); +insert into bar2 values (1); +alter table bar2 add column b integer not null default 1; +-- (at this point bar2 contains tuple with natts=1) +alter table bar1 attach partition bar2 default; +-- this works: +select * from bar1; + a | b +---+--- + 1 | 1 +(1 row) + +-- this exercises tuple conversion: +create function xtrig() + returns trigger language plpgsql +as $$ + declare + r record; + begin + for r in select * from old loop + raise info 'a=%, b=%', r.a, r.b; + end loop; + return NULL; + end; +$$; +create trigger xtrig + after update on bar1 + referencing old table as old + for each statement execute procedure xtrig(); +update bar1 set a = a + 1; +INFO: a=1, b=1 +/* End test case for bug #16242 */ +-- Test that ALTER TABLE rewrite preserves a clustered index +-- for normal indexes and indexes on constraints. +create table alttype_cluster (a int); +alter table alttype_cluster add primary key (a); +create index alttype_cluster_ind on alttype_cluster (a); +alter table alttype_cluster cluster on alttype_cluster_ind; +-- Normal index remains clustered. 
+select indexrelid::regclass, indisclustered from pg_index + where indrelid = 'alttype_cluster'::regclass + order by indexrelid::regclass::text; + indexrelid | indisclustered +----------------------+---------------- + alttype_cluster_ind | t + alttype_cluster_pkey | f +(2 rows) + +alter table alttype_cluster alter a type bigint; +select indexrelid::regclass, indisclustered from pg_index + where indrelid = 'alttype_cluster'::regclass + order by indexrelid::regclass::text; + indexrelid | indisclustered +----------------------+---------------- + alttype_cluster_ind | t + alttype_cluster_pkey | f +(2 rows) + +-- Constraint index remains clustered. +alter table alttype_cluster cluster on alttype_cluster_pkey; +select indexrelid::regclass, indisclustered from pg_index + where indrelid = 'alttype_cluster'::regclass + order by indexrelid::regclass::text; + indexrelid | indisclustered +----------------------+---------------- + alttype_cluster_ind | f + alttype_cluster_pkey | t +(2 rows) + +alter table alttype_cluster alter a type int; +select indexrelid::regclass, indisclustered from pg_index + where indrelid = 'alttype_cluster'::regclass + order by indexrelid::regclass::text; + indexrelid | indisclustered +----------------------+---------------- + alttype_cluster_ind | f + alttype_cluster_pkey | t +(2 rows) + +drop table alttype_cluster; diff --git a/src/test/regress/expected/create_table_1.out b/src/test/regress/expected/create_table_1.out new file mode 100644 index 00000000000..4ec5f297a34 --- /dev/null +++ b/src/test/regress/expected/create_table_1.out @@ -0,0 +1,1315 @@ +-- +-- CREATE_TABLE +-- +-- +-- CLASS DEFINITIONS +-- +CREATE TABLE hobbies_r ( + name text, + person text +); +CREATE TABLE equipment_r ( + name text, + hobby text +); +CREATE TABLE onek ( + unique1 int4, + unique2 int4, + two int4, + four int4, + ten int4, + twenty int4, + hundred int4, + thousand int4, + twothousand int4, + fivethous int4, + tenthous int4, + odd int4, + even int4, + stringu1 name, + 
stringu2 name, + string4 name +); +CREATE TABLE tenk1 ( + unique1 int4, + unique2 int4, + two int4, + four int4, + ten int4, + twenty int4, + hundred int4, + thousand int4, + twothousand int4, + fivethous int4, + tenthous int4, + odd int4, + even int4, + stringu1 name, + stringu2 name, + string4 name +); +CREATE TABLE tenk2 ( + unique1 int4, + unique2 int4, + two int4, + four int4, + ten int4, + twenty int4, + hundred int4, + thousand int4, + twothousand int4, + fivethous int4, + tenthous int4, + odd int4, + even int4, + stringu1 name, + stringu2 name, + string4 name +); +CREATE TABLE person ( + name text, + age int4, + location point +); +CREATE TABLE emp ( + salary int4, + manager name +) INHERITS (person); +CREATE TABLE student ( + gpa float8 +) INHERITS (person); +CREATE TABLE stud_emp ( + percent int4 +) INHERITS (emp, student); +NOTICE: merging multiple inherited definitions of column "name" +NOTICE: merging multiple inherited definitions of column "age" +NOTICE: merging multiple inherited definitions of column "location" +CREATE TABLE city ( + name name, + location box, + budget city_budget +); +CREATE TABLE dept ( + dname name, + mgrname text +); +CREATE TABLE slow_emp4000 ( + home_base box +); +CREATE TABLE fast_emp4000 ( + home_base box +); +CREATE TABLE road ( + name text, + thepath path +); +CREATE TABLE ihighway () INHERITS (road); +CREATE TABLE shighway ( + surface text +) INHERITS (road); +CREATE TABLE real_city ( + pop int4, + cname text, + outline path +); +-- +-- test the "star" operators a bit more thoroughly -- this time, +-- throw in lots of NULL fields... 
+-- +-- a is the type root +-- b and c inherit from a (one-level single inheritance) +-- d inherits from b and c (two-level multiple inheritance) +-- e inherits from c (two-level single inheritance) +-- f inherits from e (three-level single inheritance) +-- +CREATE TABLE a_star ( + class char, + a int4 +); +CREATE TABLE b_star ( + b text +) INHERITS (a_star); +CREATE TABLE c_star ( + c name +) INHERITS (a_star); +CREATE TABLE d_star ( + d float8 +) INHERITS (b_star, c_star); +NOTICE: merging multiple inherited definitions of column "class" +NOTICE: merging multiple inherited definitions of column "a" +CREATE TABLE e_star ( + e int2 +) INHERITS (c_star); +CREATE TABLE f_star ( + f polygon +) INHERITS (e_star); +CREATE TABLE aggtest ( + a int2, + b float4 +); +CREATE TABLE hash_i4_heap ( + seqno int4, + random int4 +); +CREATE TABLE hash_name_heap ( + seqno int4, + random name +); +CREATE TABLE hash_txt_heap ( + seqno int4, + random text +); +CREATE TABLE hash_f8_heap ( + seqno int4, + random float8 +); +-- don't include the hash_ovfl_heap stuff in the distribution +-- the data set is too large for what it's worth +-- +-- CREATE TABLE hash_ovfl_heap ( +-- x int4, +-- y int4 +-- ); +CREATE TABLE bt_i4_heap ( + seqno int4, + random int4 +); +CREATE TABLE bt_name_heap ( + seqno name, + random int4 +); +CREATE TABLE bt_txt_heap ( + seqno text, + random int4 +); +CREATE TABLE bt_f8_heap ( + seqno float8, + random int4 +); +CREATE TABLE array_op_test ( + seqno int4, + i int4[], + t text[] +); +CREATE TABLE array_index_op_test ( + seqno int4, + i int4[], + t text[] +); +CREATE TABLE testjsonb ( + j jsonb +); +CREATE TABLE unknowntab ( + u unknown -- fail +); +ERROR: column "u" has pseudo-type unknown +CREATE TYPE unknown_comptype AS ( + u unknown -- fail +); +ERROR: column "u" has pseudo-type unknown +CREATE TABLE IF NOT EXISTS test_tsvector( + t text, + a tsvector +); +CREATE TABLE IF NOT EXISTS test_tsvector( + t text +); +NOTICE: relation "test_tsvector" already exists, 
skipping +-- invalid: non-lowercase quoted reloptions identifiers +CREATE TABLE tas_case WITH ("Fillfactor" = 10) AS SELECT 1 a; +ERROR: unrecognized parameter "Fillfactor" +CREATE UNLOGGED TABLE unlogged1 (a int primary key); -- OK +CREATE TEMPORARY TABLE unlogged2 (a int primary key); -- OK +SELECT relname, relkind, relpersistence FROM pg_class WHERE relname ~ '^unlogged\d' ORDER BY relname; + relname | relkind | relpersistence +----------------+---------+---------------- + unlogged1 | r | p + unlogged1_pkey | i | p + unlogged2 | r | t + unlogged2_pkey | i | t +(4 rows) + +REINDEX INDEX unlogged1_pkey; +REINDEX INDEX unlogged2_pkey; +SELECT relname, relkind, relpersistence FROM pg_class WHERE relname ~ '^unlogged\d' ORDER BY relname; + relname | relkind | relpersistence +----------------+---------+---------------- + unlogged1 | r | p + unlogged1_pkey | i | p + unlogged2 | r | t + unlogged2_pkey | i | t +(4 rows) + +DROP TABLE unlogged2; +INSERT INTO unlogged1 VALUES (42); +CREATE UNLOGGED TABLE public.unlogged2 (a int primary key); -- also OK +CREATE UNLOGGED TABLE pg_temp.unlogged3 (a int primary key); -- not OK +ERROR: only temporary relations may be created in temporary schemas +LINE 1: CREATE UNLOGGED TABLE pg_temp.unlogged3 (a int primary key); + ^ +CREATE TABLE pg_temp.implicitly_temp (a int primary key); -- OK +CREATE TEMP TABLE explicitly_temp (a int primary key); -- also OK +CREATE TEMP TABLE pg_temp.doubly_temp (a int primary key); -- also OK +CREATE TEMP TABLE public.temp_to_perm (a int primary key); -- not OK +ERROR: cannot create temporary relation in non-temporary schema +LINE 1: CREATE TEMP TABLE public.temp_to_perm (a int primary key); + ^ +DROP TABLE unlogged1, public.unlogged2; +CREATE TABLE as_select1 AS SELECT * FROM pg_class WHERE relkind = 'r'; +CREATE TABLE as_select1 AS SELECT * FROM pg_class WHERE relkind = 'r'; +ERROR: relation "as_select1" already exists +CREATE TABLE IF NOT EXISTS as_select1 AS SELECT * FROM pg_class WHERE relkind = 
'r'; +NOTICE: relation "as_select1" already exists, skipping +DROP TABLE as_select1; +PREPARE select1 AS SELECT 1 as a; +CREATE TABLE as_select1 AS EXECUTE select1; +CREATE TABLE as_select1 AS EXECUTE select1; +ERROR: relation "as_select1" already exists +SELECT * FROM as_select1; + a +--- + 1 +(1 row) + +CREATE TABLE IF NOT EXISTS as_select1 AS EXECUTE select1; +NOTICE: relation "as_select1" already exists, skipping +DROP TABLE as_select1; +DEALLOCATE select1; +-- create an extra wide table to test for issues related to that +-- (temporarily hide query, to avoid the long CREATE TABLE stmt) +\set ECHO none +INSERT INTO extra_wide_table(firstc, lastc) VALUES('first col', 'last col'); +SELECT firstc, lastc FROM extra_wide_table; + firstc | lastc +-----------+---------- + first col | last col +(1 row) + +-- check that tables with oids cannot be created anymore +CREATE TABLE withoid() WITH OIDS; +ERROR: syntax error at or near "OIDS" +LINE 1: CREATE TABLE withoid() WITH OIDS; + ^ +CREATE TABLE withoid() WITH (oids); +ERROR: tables declared WITH OIDS are not supported +CREATE TABLE withoid() WITH (oids = true); +ERROR: tables declared WITH OIDS are not supported +-- but explicitly not adding oids is still supported +CREATE TEMP TABLE withoutoid() WITHOUT OIDS; DROP TABLE withoutoid; +CREATE TEMP TABLE withoutoid() WITH (oids = false); DROP TABLE withoutoid; +-- check restriction with default expressions +-- invalid use of column reference in default expressions +CREATE TABLE default_expr_column (id int DEFAULT (id)); +ERROR: cannot use column reference in DEFAULT expression +LINE 1: CREATE TABLE default_expr_column (id int DEFAULT (id)); + ^ +CREATE TABLE default_expr_column (id int DEFAULT (bar.id)); +ERROR: cannot use column reference in DEFAULT expression +LINE 1: CREATE TABLE default_expr_column (id int DEFAULT (bar.id)); + ^ +CREATE TABLE default_expr_agg_column (id int DEFAULT (avg(id))); +ERROR: cannot use column reference in DEFAULT expression +LINE 1: ...TE 
TABLE default_expr_agg_column (id int DEFAULT (avg(id))); + ^ +-- invalid column definition +CREATE TABLE default_expr_non_column (a int DEFAULT (avg(non_existent))); +ERROR: cannot use column reference in DEFAULT expression +LINE 1: ...TABLE default_expr_non_column (a int DEFAULT (avg(non_existe... + ^ +-- invalid use of aggregate +CREATE TABLE default_expr_agg (a int DEFAULT (avg(1))); +ERROR: aggregate functions are not allowed in DEFAULT expressions +LINE 1: CREATE TABLE default_expr_agg (a int DEFAULT (avg(1))); + ^ +-- invalid use of subquery +CREATE TABLE default_expr_agg (a int DEFAULT (select 1)); +ERROR: cannot use subquery in DEFAULT expression +LINE 1: CREATE TABLE default_expr_agg (a int DEFAULT (select 1)); + ^ +-- invalid use of set-returning function +CREATE TABLE default_expr_agg (a int DEFAULT (generate_series(1,3))); +ERROR: set-returning functions are not allowed in DEFAULT expressions +LINE 1: CREATE TABLE default_expr_agg (a int DEFAULT (generate_serie... + ^ +-- Verify that subtransaction rollback restores rd_createSubid. +BEGIN; +CREATE TABLE remember_create_subid (c int); +SAVEPOINT q; DROP TABLE remember_create_subid; ROLLBACK TO q; +COMMIT; +DROP TABLE remember_create_subid; +-- Verify that subtransaction rollback restores rd_firstRelfilenodeSubid. 
+CREATE TABLE remember_node_subid (c int); +BEGIN; +ALTER TABLE remember_node_subid ALTER c TYPE bigint; +SAVEPOINT q; DROP TABLE remember_node_subid; ROLLBACK TO q; +COMMIT; +DROP TABLE remember_node_subid; +-- +-- Partitioned tables +-- +-- cannot combine INHERITS and PARTITION BY (although grammar allows) +CREATE TABLE partitioned ( + a int +) INHERITS (some_table) PARTITION BY LIST (a); +ERROR: cannot create partitioned table as inheritance child +-- cannot use more than 1 column as partition key for list partitioned table +CREATE TABLE partitioned ( + a1 int, + a2 int +) PARTITION BY LIST (a1, a2); -- fail +ERROR: cannot use "list" partition strategy with more than one column +-- unsupported constraint type for partitioned tables +CREATE TABLE partitioned ( + a int, + EXCLUDE USING gist (a WITH &&) +) PARTITION BY RANGE (a); +ERROR: exclusion constraints are not supported on partitioned tables +LINE 3: EXCLUDE USING gist (a WITH &&) + ^ +-- prevent using prohibited expressions in the key +CREATE FUNCTION retset (a int) RETURNS SETOF int AS $$ SELECT 1; $$ LANGUAGE SQL IMMUTABLE; +CREATE TABLE partitioned ( + a int +) PARTITION BY RANGE (retset(a)); +ERROR: set-returning functions are not allowed in partition key expressions +DROP FUNCTION retset(int); +CREATE TABLE partitioned ( + a int +) PARTITION BY RANGE ((avg(a))); +ERROR: aggregate functions are not allowed in partition key expressions +CREATE TABLE partitioned ( + a int, + b int +) PARTITION BY RANGE ((avg(a) OVER (PARTITION BY b))); +ERROR: window functions are not allowed in partition key expressions +CREATE TABLE partitioned ( + a int +) PARTITION BY LIST ((a LIKE (SELECT 1))); +ERROR: cannot use subquery in partition key expression +CREATE TABLE partitioned ( + a int +) PARTITION BY RANGE ((42)); +ERROR: cannot use constant expression as partition key +CREATE FUNCTION const_func () RETURNS int AS $$ SELECT 1; $$ LANGUAGE SQL IMMUTABLE; +CREATE TABLE partitioned ( + a int +) PARTITION BY RANGE 
(const_func()); +ERROR: cannot use constant expression as partition key +DROP FUNCTION const_func(); +-- only accept valid partitioning strategy +CREATE TABLE partitioned ( + a int +) PARTITION BY MAGIC (a); +ERROR: unrecognized partitioning strategy "magic" +-- specified column must be present in the table +CREATE TABLE partitioned ( + a int +) PARTITION BY RANGE (b); +ERROR: column "b" named in partition key does not exist +LINE 3: ) PARTITION BY RANGE (b); + ^ +-- cannot use system columns in partition key +CREATE TABLE partitioned ( + a int +) PARTITION BY RANGE (xmin); +ERROR: cannot use system column "xmin" in partition key +LINE 3: ) PARTITION BY RANGE (xmin); + ^ +-- cannot use pseudotypes +CREATE TABLE partitioned ( + a int, + b int +) PARTITION BY RANGE (((a, b))); +ERROR: partition key column 1 has pseudo-type record +CREATE TABLE partitioned ( + a int, + b int +) PARTITION BY RANGE (a, ('unknown')); +ERROR: partition key column 2 has pseudo-type unknown +-- functions in key must be immutable +CREATE FUNCTION immut_func (a int) RETURNS int AS $$ SELECT a + random()::int; $$ LANGUAGE SQL; +CREATE TABLE partitioned ( + a int +) PARTITION BY RANGE (immut_func(a)); +ERROR: functions in partition key expression must be marked IMMUTABLE +DROP FUNCTION immut_func(int); +-- prevent using columns of unsupported types in key (type must have a btree operator class) +CREATE TABLE partitioned ( + a point +) PARTITION BY LIST (a); +ERROR: data type point has no default operator class for access method "btree" +HINT: You must specify a btree operator class or define a default btree operator class for the data type. 
+CREATE TABLE partitioned ( + a point +) PARTITION BY LIST (a point_ops); +ERROR: operator class "point_ops" does not exist for access method "btree" +CREATE TABLE partitioned ( + a point +) PARTITION BY RANGE (a); +ERROR: data type point has no default operator class for access method "btree" +HINT: You must specify a btree operator class or define a default btree operator class for the data type. +CREATE TABLE partitioned ( + a point +) PARTITION BY RANGE (a point_ops); +ERROR: operator class "point_ops" does not exist for access method "btree" +-- cannot add NO INHERIT constraints to partitioned tables +CREATE TABLE partitioned ( + a int, + CONSTRAINT check_a CHECK (a > 0) NO INHERIT +) PARTITION BY RANGE (a); +ERROR: cannot add NO INHERIT constraint to partitioned table "partitioned" +-- some checks after successful creation of a partitioned table +CREATE FUNCTION plusone(a int) RETURNS INT AS $$ SELECT a+1; $$ LANGUAGE SQL; +CREATE TABLE partitioned ( + a int, + b int, + c text, + d text +) PARTITION BY RANGE (a oid_ops, plusone(b), c collate "default", d collate "C"); +-- check relkind +SELECT relkind FROM pg_class WHERE relname = 'partitioned'; + relkind +--------- + p +(1 row) + +-- prevent a function referenced in partition key from being dropped +DROP FUNCTION plusone(int); +ERROR: cannot drop function plusone(integer) because other objects depend on it +DETAIL: table partitioned depends on function plusone(integer) +HINT: Use DROP ... CASCADE to drop the dependent objects too. 
+-- partitioned table cannot participate in regular inheritance +CREATE TABLE partitioned2 ( + a int, + b text +) PARTITION BY RANGE ((a+1), substr(b, 1, 5)); +CREATE TABLE fail () INHERITS (partitioned2); +ERROR: cannot inherit from partitioned table "partitioned2" +-- Partition key in describe output +\d partitioned + Partitioned table "public.partitioned" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | text | | | + d | text | | | +Partition key: RANGE (a oid_ops, plusone(b), c, d COLLATE "C") +Number of partitions: 0 + +\d+ partitioned2 + Partitioned table "public.partitioned2" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+----------+--------------+------------- + a | integer | | | | plain | | + b | text | | | | extended | | +Partition key: RANGE (((a + 1)), substr(b, 1, 5)) +Number of partitions: 0 + +INSERT INTO partitioned2 VALUES (1, 'hello'); +ERROR: no partition of relation "partitioned2" found for row +DETAIL: Partition key of the failing row contains ((a + 1), substr(b, 1, 5)) = (2, hello). 
+CREATE TABLE part2_1 PARTITION OF partitioned2 FOR VALUES FROM (-1, 'aaaaa') TO (100, 'ccccc'); +\d+ part2_1 + Table "public.part2_1" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+----------+--------------+------------- + a | integer | | | | plain | | + b | text | | | | extended | | +Partition of: partitioned2 FOR VALUES FROM ('-1', 'aaaaa') TO (100, 'ccccc') +Partition constraint: (((a + 1) IS NOT NULL) AND (substr(b, 1, 5) IS NOT NULL) AND (((a + 1) > '-1'::integer) OR (((a + 1) = '-1'::integer) AND (substr(b, 1, 5) >= 'aaaaa'::text))) AND (((a + 1) < 100) OR (((a + 1) = 100) AND (substr(b, 1, 5) < 'ccccc'::text)))) + +DROP TABLE partitioned, partitioned2; +-- check reference to partitioned table's rowtype in partition descriptor +create table partitioned (a int, b int) + partition by list ((row(a, b)::partitioned)); +create table partitioned1 + partition of partitioned for values in ('(1,2)'::partitioned); +create table partitioned2 + partition of partitioned for values in ('(2,4)'::partitioned); +explain (costs off) +select * from partitioned where row(a,b)::partitioned = '(1,2)'::partitioned; + QUERY PLAN +----------------------------------------------------------- + Seq Scan on partitioned1 partitioned + Filter: (ROW(a, b)::partitioned = '(1,2)'::partitioned) +(2 rows) + +drop table partitioned; +-- whole-row Var in partition key works too +create table partitioned (a int, b int) + partition by list ((partitioned)); +create table partitioned1 + partition of partitioned for values in ('(1,2)'); +create table partitioned2 + partition of partitioned for values in ('(2,4)'); +explain (costs off) +select * from partitioned where partitioned = '(1,2)'::partitioned; + QUERY PLAN +----------------------------------------------------------------- + Seq Scan on partitioned1 partitioned + Filter: ((partitioned.*)::partitioned = '(1,2)'::partitioned) +(2 rows) + +\d+ 
partitioned1 + Table "public.partitioned1" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | integer | | | | plain | | + b | integer | | | | plain | | +Partition of: partitioned FOR VALUES IN ('(1,2)') +Partition constraint: (((partitioned1.*)::partitioned IS DISTINCT FROM NULL) AND ((partitioned1.*)::partitioned = '(1,2)'::partitioned)) + +drop table partitioned; +-- check that dependencies of partition columns are handled correctly +create domain intdom1 as int; +create table partitioned ( + a intdom1, + b text +) partition by range (a); +alter table partitioned drop column a; -- fail +ERROR: cannot drop column "a" because it is part of the partition key of relation "partitioned" +drop domain intdom1; -- fail, requires cascade +ERROR: cannot drop type intdom1 because other objects depend on it +DETAIL: table partitioned depends on type intdom1 +HINT: Use DROP ... CASCADE to drop the dependent objects too. +drop domain intdom1 cascade; +NOTICE: drop cascades to table partitioned +table partitioned; -- gone +ERROR: relation "partitioned" does not exist +LINE 1: table partitioned; + ^ +-- likewise for columns used in partition expressions +create domain intdom1 as int; +create table partitioned ( + a intdom1, + b text +) partition by range (plusone(a)); +alter table partitioned drop column a; -- fail +ERROR: cannot drop column "a" because it is part of the partition key of relation "partitioned" +drop domain intdom1; -- fail, requires cascade +ERROR: cannot drop type intdom1 because other objects depend on it +DETAIL: table partitioned depends on type intdom1 +HINT: Use DROP ... CASCADE to drop the dependent objects too. 
+drop domain intdom1 cascade; +NOTICE: drop cascades to table partitioned +table partitioned; -- gone +ERROR: relation "partitioned" does not exist +LINE 1: table partitioned; + ^ +-- +-- Partitions +-- +-- check partition bound syntax +CREATE TABLE list_parted ( + a int +) PARTITION BY LIST (a); +CREATE TABLE part_p1 PARTITION OF list_parted FOR VALUES IN ('1'); +CREATE TABLE part_p2 PARTITION OF list_parted FOR VALUES IN (2); +CREATE TABLE part_p3 PARTITION OF list_parted FOR VALUES IN ((2+1)); +CREATE TABLE part_null PARTITION OF list_parted FOR VALUES IN (null); +\d+ list_parted + Partitioned table "public.list_parted" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | integer | | | | plain | | +Partition key: LIST (a) +Partitions: part_null FOR VALUES IN (NULL), + part_p1 FOR VALUES IN (1), + part_p2 FOR VALUES IN (2), + part_p3 FOR VALUES IN (3) + +-- forbidden expressions for partition bound with list partitioned table +CREATE TABLE part_bogus_expr_fail PARTITION OF list_parted FOR VALUES IN (somename); +ERROR: cannot use column reference in partition bound expression +LINE 1: ...expr_fail PARTITION OF list_parted FOR VALUES IN (somename); + ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF list_parted FOR VALUES IN (somename.somename); +ERROR: cannot use column reference in partition bound expression +LINE 1: ...expr_fail PARTITION OF list_parted FOR VALUES IN (somename.s... 
+ ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF list_parted FOR VALUES IN (a); +ERROR: cannot use column reference in partition bound expression +LINE 1: ..._bogus_expr_fail PARTITION OF list_parted FOR VALUES IN (a); + ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF list_parted FOR VALUES IN (sum(a)); +ERROR: cannot use column reference in partition bound expression +LINE 1: ...s_expr_fail PARTITION OF list_parted FOR VALUES IN (sum(a)); + ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF list_parted FOR VALUES IN (sum(somename)); +ERROR: cannot use column reference in partition bound expression +LINE 1: ..._fail PARTITION OF list_parted FOR VALUES IN (sum(somename))... + ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF list_parted FOR VALUES IN (sum(1)); +ERROR: aggregate functions are not allowed in partition bound +LINE 1: ...s_expr_fail PARTITION OF list_parted FOR VALUES IN (sum(1)); + ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF list_parted FOR VALUES IN ((select 1)); +ERROR: cannot use subquery in partition bound +LINE 1: ...expr_fail PARTITION OF list_parted FOR VALUES IN ((select 1)... + ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF list_parted FOR VALUES IN (generate_series(4, 6)); +ERROR: set-returning functions are not allowed in partition bound +LINE 1: ...expr_fail PARTITION OF list_parted FOR VALUES IN (generate_s... + ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF list_parted FOR VALUES IN ((1+1) collate "POSIX"); +ERROR: collations are not supported by type integer +LINE 1: ...ail PARTITION OF list_parted FOR VALUES IN ((1+1) collate "P... 
+ ^ +-- syntax does not allow empty list of values for list partitions +CREATE TABLE fail_part PARTITION OF list_parted FOR VALUES IN (); +ERROR: syntax error at or near ")" +LINE 1: ...E TABLE fail_part PARTITION OF list_parted FOR VALUES IN (); + ^ +-- trying to specify range for list partitioned table +CREATE TABLE fail_part PARTITION OF list_parted FOR VALUES FROM (1) TO (2); +ERROR: invalid bound specification for a list partition +LINE 1: ...BLE fail_part PARTITION OF list_parted FOR VALUES FROM (1) T... + ^ +-- trying to specify modulus and remainder for list partitioned table +CREATE TABLE fail_part PARTITION OF list_parted FOR VALUES WITH (MODULUS 10, REMAINDER 1); +ERROR: invalid bound specification for a list partition +LINE 1: ...BLE fail_part PARTITION OF list_parted FOR VALUES WITH (MODU... + ^ +-- check default partition cannot be created more than once +CREATE TABLE part_default PARTITION OF list_parted DEFAULT; +CREATE TABLE fail_default_part PARTITION OF list_parted DEFAULT; +ERROR: partition "fail_default_part" conflicts with existing default partition "part_default" +LINE 1: ...TE TABLE fail_default_part PARTITION OF list_parted DEFAULT; + ^ +-- specified literal can't be cast to the partition column data type +CREATE TABLE bools ( + a bool +) PARTITION BY LIST (a); +CREATE TABLE bools_true PARTITION OF bools FOR VALUES IN (1); +ERROR: specified value cannot be cast to type boolean for column "a" +LINE 1: ...REATE TABLE bools_true PARTITION OF bools FOR VALUES IN (1); + ^ +DROP TABLE bools; +-- specified literal can be cast, and the cast might not be immutable +CREATE TABLE moneyp ( + a money +) PARTITION BY LIST (a); +CREATE TABLE moneyp_10 PARTITION OF moneyp FOR VALUES IN (10); +CREATE TABLE moneyp_11 PARTITION OF moneyp FOR VALUES IN ('11'); +CREATE TABLE moneyp_12 PARTITION OF moneyp FOR VALUES IN (to_char(12, '99')::int); +DROP TABLE moneyp; +-- cast is immutable +CREATE TABLE bigintp ( + a bigint +) PARTITION BY LIST (a); +CREATE TABLE 
bigintp_10 PARTITION OF bigintp FOR VALUES IN (10); +-- fails due to overlap: +CREATE TABLE bigintp_10_2 PARTITION OF bigintp FOR VALUES IN ('10'); +ERROR: partition "bigintp_10_2" would overlap partition "bigintp_10" +LINE 1: ...ABLE bigintp_10_2 PARTITION OF bigintp FOR VALUES IN ('10'); + ^ +DROP TABLE bigintp; +CREATE TABLE range_parted ( + a date +) PARTITION BY RANGE (a); +-- forbidden expressions for partition bounds with range partitioned table +CREATE TABLE part_bogus_expr_fail PARTITION OF range_parted + FOR VALUES FROM (somename) TO ('2019-01-01'); +ERROR: cannot use column reference in partition bound expression +LINE 2: FOR VALUES FROM (somename) TO ('2019-01-01'); + ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF range_parted + FOR VALUES FROM (somename.somename) TO ('2019-01-01'); +ERROR: cannot use column reference in partition bound expression +LINE 2: FOR VALUES FROM (somename.somename) TO ('2019-01-01'); + ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF range_parted + FOR VALUES FROM (a) TO ('2019-01-01'); +ERROR: cannot use column reference in partition bound expression +LINE 2: FOR VALUES FROM (a) TO ('2019-01-01'); + ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF range_parted + FOR VALUES FROM (max(a)) TO ('2019-01-01'); +ERROR: cannot use column reference in partition bound expression +LINE 2: FOR VALUES FROM (max(a)) TO ('2019-01-01'); + ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF range_parted + FOR VALUES FROM (max(somename)) TO ('2019-01-01'); +ERROR: cannot use column reference in partition bound expression +LINE 2: FOR VALUES FROM (max(somename)) TO ('2019-01-01'); + ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF range_parted + FOR VALUES FROM (max('2019-02-01'::date)) TO ('2019-01-01'); +ERROR: aggregate functions are not allowed in partition bound +LINE 2: FOR VALUES FROM (max('2019-02-01'::date)) TO ('2019-01-01'... 
+ ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF range_parted + FOR VALUES FROM ((select 1)) TO ('2019-01-01'); +ERROR: cannot use subquery in partition bound +LINE 2: FOR VALUES FROM ((select 1)) TO ('2019-01-01'); + ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF range_parted + FOR VALUES FROM (generate_series(1, 3)) TO ('2019-01-01'); +ERROR: set-returning functions are not allowed in partition bound +LINE 2: FOR VALUES FROM (generate_series(1, 3)) TO ('2019-01-01'); + ^ +-- trying to specify list for range partitioned table +CREATE TABLE fail_part PARTITION OF range_parted FOR VALUES IN ('a'); +ERROR: invalid bound specification for a range partition +LINE 1: ...BLE fail_part PARTITION OF range_parted FOR VALUES IN ('a'); + ^ +-- trying to specify modulus and remainder for range partitioned table +CREATE TABLE fail_part PARTITION OF range_parted FOR VALUES WITH (MODULUS 10, REMAINDER 1); +ERROR: invalid bound specification for a range partition +LINE 1: ...LE fail_part PARTITION OF range_parted FOR VALUES WITH (MODU... + ^ +-- each of start and end bounds must have same number of values as the +-- length of the partition key +CREATE TABLE fail_part PARTITION OF range_parted FOR VALUES FROM ('a', 1) TO ('z'); +ERROR: FROM must specify exactly one value per partitioning column +CREATE TABLE fail_part PARTITION OF range_parted FOR VALUES FROM ('a') TO ('z', 1); +ERROR: TO must specify exactly one value per partitioning column +-- cannot specify null values in range bounds +CREATE TABLE fail_part PARTITION OF range_parted FOR VALUES FROM (null) TO (maxvalue); +ERROR: cannot specify NULL in range bound +-- trying to specify modulus and remainder for range partitioned table +CREATE TABLE fail_part PARTITION OF range_parted FOR VALUES WITH (MODULUS 10, REMAINDER 1); +ERROR: invalid bound specification for a range partition +LINE 1: ...LE fail_part PARTITION OF range_parted FOR VALUES WITH (MODU... 
+ ^ +-- check partition bound syntax for the hash partition +CREATE TABLE hash_parted ( + a int +) PARTITION BY HASH (a); +CREATE TABLE hpart_1 PARTITION OF hash_parted FOR VALUES WITH (MODULUS 10, REMAINDER 0); +CREATE TABLE hpart_2 PARTITION OF hash_parted FOR VALUES WITH (MODULUS 50, REMAINDER 1); +CREATE TABLE hpart_3 PARTITION OF hash_parted FOR VALUES WITH (MODULUS 200, REMAINDER 2); +-- modulus 25 is factor of modulus of 50 but 10 is not a factor of 25. +CREATE TABLE fail_part PARTITION OF hash_parted FOR VALUES WITH (MODULUS 25, REMAINDER 3); +ERROR: every hash partition modulus must be a factor of the next larger modulus +DETAIL: The new modulus 25 is not divisible by 10, the modulus of existing partition "hpart_1". +-- previous modulus 50 is factor of 150 but this modulus is not a factor of next modulus 200. +CREATE TABLE fail_part PARTITION OF hash_parted FOR VALUES WITH (MODULUS 150, REMAINDER 3); +ERROR: every hash partition modulus must be a factor of the next larger modulus +DETAIL: The new modulus 150 is not a factor of 200, the modulus of existing partition "hpart_3". +-- trying to specify range for the hash partitioned table +CREATE TABLE fail_part PARTITION OF hash_parted FOR VALUES FROM ('a', 1) TO ('z'); +ERROR: invalid bound specification for a hash partition +LINE 1: ...BLE fail_part PARTITION OF hash_parted FOR VALUES FROM ('a',... 
+ ^ +-- trying to specify list value for the hash partitioned table +CREATE TABLE fail_part PARTITION OF hash_parted FOR VALUES IN (1000); +ERROR: invalid bound specification for a hash partition +LINE 1: ...BLE fail_part PARTITION OF hash_parted FOR VALUES IN (1000); + ^ +-- trying to create default partition for the hash partitioned table +CREATE TABLE fail_default_part PARTITION OF hash_parted DEFAULT; +ERROR: a hash-partitioned table may not have a default partition +-- check if compatible with the specified parent +-- cannot create as partition of a non-partitioned table +CREATE TABLE unparted ( + a int +); +CREATE TABLE fail_part PARTITION OF unparted FOR VALUES IN ('a'); +ERROR: "unparted" is not partitioned +CREATE TABLE fail_part PARTITION OF unparted FOR VALUES WITH (MODULUS 2, REMAINDER 1); +ERROR: "unparted" is not partitioned +DROP TABLE unparted; +-- cannot create a permanent rel as partition of a temp rel +CREATE TEMP TABLE temp_parted ( + a int +) PARTITION BY LIST (a); +CREATE TABLE fail_part PARTITION OF temp_parted FOR VALUES IN ('a'); +ERROR: cannot create a permanent relation as partition of temporary relation "temp_parted" +DROP TABLE temp_parted; +-- check for partition bound overlap and other invalid specifications +CREATE TABLE list_parted2 ( + a varchar +) PARTITION BY LIST (a); +CREATE TABLE part_null_z PARTITION OF list_parted2 FOR VALUES IN (null, 'z'); +CREATE TABLE part_ab PARTITION OF list_parted2 FOR VALUES IN ('a', 'b'); +CREATE TABLE list_parted2_def PARTITION OF list_parted2 DEFAULT; +CREATE TABLE fail_part PARTITION OF list_parted2 FOR VALUES IN (null); +ERROR: partition "fail_part" would overlap partition "part_null_z" +LINE 1: ...LE fail_part PARTITION OF list_parted2 FOR VALUES IN (null); + ^ +CREATE TABLE fail_part PARTITION OF list_parted2 FOR VALUES IN ('b', 'c'); +ERROR: partition "fail_part" would overlap partition "part_ab" +LINE 1: ...ail_part PARTITION OF list_parted2 FOR VALUES IN ('b', 'c'); + ^ +-- check default 
partition overlap +INSERT INTO list_parted2 VALUES('X'); +CREATE TABLE fail_part PARTITION OF list_parted2 FOR VALUES IN ('W', 'X', 'Y'); +ERROR: updated partition constraint for default partition "list_parted2_def" would be violated by some row +CREATE TABLE range_parted2 ( + a int +) PARTITION BY RANGE (a); +-- trying to create range partition with empty range +CREATE TABLE fail_part PARTITION OF range_parted2 FOR VALUES FROM (1) TO (0); +ERROR: empty range bound specified for partition "fail_part" +LINE 1: ..._part PARTITION OF range_parted2 FOR VALUES FROM (1) TO (0); + ^ +DETAIL: Specified lower bound (1) is greater than or equal to upper bound (0). +-- note that the range '[1, 1)' has no elements +CREATE TABLE fail_part PARTITION OF range_parted2 FOR VALUES FROM (1) TO (1); +ERROR: empty range bound specified for partition "fail_part" +LINE 1: ..._part PARTITION OF range_parted2 FOR VALUES FROM (1) TO (1); + ^ +DETAIL: Specified lower bound (1) is greater than or equal to upper bound (1). +CREATE TABLE part0 PARTITION OF range_parted2 FOR VALUES FROM (minvalue) TO (1); +CREATE TABLE fail_part PARTITION OF range_parted2 FOR VALUES FROM (minvalue) TO (2); +ERROR: partition "fail_part" would overlap partition "part0" +LINE 1: ..._part PARTITION OF range_parted2 FOR VALUES FROM (minvalue) ... + ^ +CREATE TABLE part1 PARTITION OF range_parted2 FOR VALUES FROM (1) TO (10); +CREATE TABLE fail_part PARTITION OF range_parted2 FOR VALUES FROM (-1) TO (1); +ERROR: partition "fail_part" would overlap partition "part0" +LINE 1: ..._part PARTITION OF range_parted2 FOR VALUES FROM (-1) TO (1)... + ^ +CREATE TABLE fail_part PARTITION OF range_parted2 FOR VALUES FROM (9) TO (maxvalue); +ERROR: partition "fail_part" would overlap partition "part1" +LINE 1: ..._part PARTITION OF range_parted2 FOR VALUES FROM (9) TO (max... 
+ ^ +CREATE TABLE part2 PARTITION OF range_parted2 FOR VALUES FROM (20) TO (30); +CREATE TABLE part3 PARTITION OF range_parted2 FOR VALUES FROM (30) TO (40); +CREATE TABLE fail_part PARTITION OF range_parted2 FOR VALUES FROM (10) TO (30); +ERROR: partition "fail_part" would overlap partition "part2" +LINE 1: ...art PARTITION OF range_parted2 FOR VALUES FROM (10) TO (30); + ^ +CREATE TABLE fail_part PARTITION OF range_parted2 FOR VALUES FROM (10) TO (50); +ERROR: partition "fail_part" would overlap partition "part2" +LINE 1: ...art PARTITION OF range_parted2 FOR VALUES FROM (10) TO (50); + ^ +-- Create a default partition for range partitioned table +CREATE TABLE range2_default PARTITION OF range_parted2 DEFAULT; +-- More than one default partition is not allowed, so this should give error +CREATE TABLE fail_default_part PARTITION OF range_parted2 DEFAULT; +ERROR: partition "fail_default_part" conflicts with existing default partition "range2_default" +LINE 1: ... TABLE fail_default_part PARTITION OF range_parted2 DEFAULT; + ^ +-- Check if the range for default partitions overlap +INSERT INTO range_parted2 VALUES (85); +CREATE TABLE fail_part PARTITION OF range_parted2 FOR VALUES FROM (80) TO (90); +ERROR: updated partition constraint for default partition "range2_default" would be violated by some row +CREATE TABLE part4 PARTITION OF range_parted2 FOR VALUES FROM (90) TO (100); +-- now check for multi-column range partition key +CREATE TABLE range_parted3 ( + a int, + b int +) PARTITION BY RANGE (a, (b+1)); +CREATE TABLE part00 PARTITION OF range_parted3 FOR VALUES FROM (0, minvalue) TO (0, maxvalue); +CREATE TABLE fail_part PARTITION OF range_parted3 FOR VALUES FROM (0, minvalue) TO (0, 1); +ERROR: partition "fail_part" would overlap partition "part00" +LINE 1: ..._part PARTITION OF range_parted3 FOR VALUES FROM (0, minvalu... 
+ ^ +CREATE TABLE part10 PARTITION OF range_parted3 FOR VALUES FROM (1, minvalue) TO (1, 1); +CREATE TABLE part11 PARTITION OF range_parted3 FOR VALUES FROM (1, 1) TO (1, 10); +CREATE TABLE part12 PARTITION OF range_parted3 FOR VALUES FROM (1, 10) TO (1, maxvalue); +CREATE TABLE fail_part PARTITION OF range_parted3 FOR VALUES FROM (1, 10) TO (1, 20); +ERROR: partition "fail_part" would overlap partition "part12" +LINE 1: ...rt PARTITION OF range_parted3 FOR VALUES FROM (1, 10) TO (1,... + ^ +CREATE TABLE range3_default PARTITION OF range_parted3 DEFAULT; +-- cannot create a partition that says column b is allowed to range +-- from -infinity to +infinity, while there exist partitions that have +-- more specific ranges +CREATE TABLE fail_part PARTITION OF range_parted3 FOR VALUES FROM (1, minvalue) TO (1, maxvalue); +ERROR: partition "fail_part" would overlap partition "part10" +LINE 1: ..._part PARTITION OF range_parted3 FOR VALUES FROM (1, minvalu... + ^ +-- check for partition bound overlap and other invalid specifications for the hash partition +CREATE TABLE hash_parted2 ( + a varchar +) PARTITION BY HASH (a); +CREATE TABLE h2part_1 PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 4, REMAINDER 2); +CREATE TABLE h2part_2 PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 8, REMAINDER 0); +CREATE TABLE h2part_3 PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 8, REMAINDER 4); +CREATE TABLE h2part_4 PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 8, REMAINDER 5); +-- overlap with part_4 +CREATE TABLE fail_part PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 2, REMAINDER 1); +ERROR: partition "fail_part" would overlap partition "h2part_4" +LINE 1: ...LE fail_part PARTITION OF hash_parted2 FOR VALUES WITH (MODU... 
+ ^ +-- modulus must be greater than zero +CREATE TABLE fail_part PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 0, REMAINDER 1); +ERROR: modulus for hash partition must be a positive integer +-- remainder must be greater than or equal to zero and less than modulus +CREATE TABLE fail_part PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 8, REMAINDER 8); +ERROR: remainder for hash partition must be less than modulus +-- check schema propagation from parent +CREATE TABLE parted ( + a text, + b int NOT NULL DEFAULT 0, + CONSTRAINT check_a CHECK (length(a) > 0) +) PARTITION BY LIST (a); +CREATE TABLE part_a PARTITION OF parted FOR VALUES IN ('a'); +-- only inherited attributes (never local ones) +SELECT attname, attislocal, attinhcount FROM pg_attribute + WHERE attrelid = 'part_a'::regclass and attnum > 0 + ORDER BY attnum; + attname | attislocal | attinhcount +---------+------------+------------- + a | f | 1 + b | f | 1 +(2 rows) + +-- able to specify column default, column constraint, and table constraint +-- first check the "column specified more than once" error +CREATE TABLE part_b PARTITION OF parted ( + b NOT NULL, + b DEFAULT 1, + b CHECK (b >= 0), + CONSTRAINT check_a CHECK (length(a) > 0) +) FOR VALUES IN ('b'); +ERROR: column "b" specified more than once +CREATE TABLE part_b PARTITION OF parted ( + b NOT NULL DEFAULT 1, + CONSTRAINT check_a CHECK (length(a) > 0), + CONSTRAINT check_b CHECK (b >= 0) +) FOR VALUES IN ('b'); +NOTICE: merging constraint "check_a" with inherited definition +-- conislocal should be false for any merged constraints, true otherwise +SELECT conislocal, coninhcount FROM pg_constraint WHERE conrelid = 'part_b'::regclass ORDER BY conislocal, coninhcount; + conislocal | coninhcount +------------+------------- + f | 1 + t | 0 +(2 rows) + +-- Once check_b is added to the parent, it should be made non-local for part_b +ALTER TABLE parted ADD CONSTRAINT check_b CHECK (b >= 0); +NOTICE: merging constraint "check_b" with inherited 
definition +SELECT conislocal, coninhcount FROM pg_constraint WHERE conrelid = 'part_b'::regclass; + conislocal | coninhcount +------------+------------- + f | 1 + f | 1 +(2 rows) + +-- Neither check_a nor check_b are droppable from part_b +ALTER TABLE part_b DROP CONSTRAINT check_a; +ERROR: cannot drop inherited constraint "check_a" of relation "part_b" +ALTER TABLE part_b DROP CONSTRAINT check_b; +ERROR: cannot drop inherited constraint "check_b" of relation "part_b" +-- And dropping it from parted should leave no trace of them on part_b, unlike +-- traditional inheritance where they will be left behind, because they would +-- be local constraints. +ALTER TABLE parted DROP CONSTRAINT check_a, DROP CONSTRAINT check_b; +SELECT conislocal, coninhcount FROM pg_constraint WHERE conrelid = 'part_b'::regclass; + conislocal | coninhcount +------------+------------- +(0 rows) + +-- specify PARTITION BY for a partition +CREATE TABLE fail_part_col_not_found PARTITION OF parted FOR VALUES IN ('c') PARTITION BY RANGE (c); +ERROR: column "c" named in partition key does not exist +LINE 1: ...TITION OF parted FOR VALUES IN ('c') PARTITION BY RANGE (c); + ^ +CREATE TABLE part_c PARTITION OF parted (b WITH OPTIONS NOT NULL DEFAULT 0) FOR VALUES IN ('c') PARTITION BY RANGE ((b)); +-- create a level-2 partition +CREATE TABLE part_c_1_10 PARTITION OF part_c FOR VALUES FROM (1) TO (10); +-- check that NOT NULL and default value are inherited correctly +create table parted_notnull_inh_test (a int default 1, b int not null default 0) partition by list (a); +create table parted_notnull_inh_test1 partition of parted_notnull_inh_test (a not null, b default 1) for values in (1); +insert into parted_notnull_inh_test (b) values (null); +ERROR: null value in column "b" of relation "parted_notnull_inh_test1" violates not-null constraint +DETAIL: Failing row contains (1, null). 
+-- note that while b's default is overriden, a's default is preserved +\d parted_notnull_inh_test1 + Table "public.parted_notnull_inh_test1" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | not null | 1 + b | integer | | not null | 1 +Partition of: parted_notnull_inh_test FOR VALUES IN (1) + +drop table parted_notnull_inh_test; +-- check that collations are assigned in partition bound expressions +create table parted_boolean_col (a bool, b text) partition by list(a); +create table parted_boolean_less partition of parted_boolean_col + for values in ('foo' < 'bar'); +create table parted_boolean_greater partition of parted_boolean_col + for values in ('foo' > 'bar'); +drop table parted_boolean_col; +-- check for a conflicting COLLATE clause +create table parted_collate_must_match (a text collate "C", b text collate "C") + partition by range (a); +-- on the partition key +create table parted_collate_must_match1 partition of parted_collate_must_match + (a collate "POSIX") for values from ('a') to ('m'); +-- on another column +create table parted_collate_must_match2 partition of parted_collate_must_match + (b collate "POSIX") for values from ('m') to ('z'); +drop table parted_collate_must_match; +-- check that non-matching collations for partition bound +-- expressions are coerced to the right collation +create table test_part_coll_posix (a text) partition by range (a collate "POSIX"); +-- ok, collation is implicitly coerced +create table test_part_coll partition of test_part_coll_posix for values from ('a' collate "C") to ('g'); +-- ok +create table test_part_coll2 partition of test_part_coll_posix for values from ('g') to ('m'); +-- ok, collation is implicitly coerced +create table test_part_coll_cast partition of test_part_coll_posix for values from (name 'm' collate "C") to ('s'); +-- ok; partition collation silently overrides the default collation of type 'name' +create table 
test_part_coll_cast2 partition of test_part_coll_posix for values from (name 's') to ('z'); +drop table test_part_coll_posix; +-- Partition bound in describe output +\d+ part_b + Table "public.part_b" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+----------+--------------+------------- + a | text | | | | extended | | + b | integer | | not null | 1 | plain | | +Partition of: parted FOR VALUES IN ('b') +Partition constraint: ((a IS NOT NULL) AND (a = 'b'::text)) + +-- Both partition bound and partition key in describe output +\d+ part_c + Partitioned table "public.part_c" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+----------+--------------+------------- + a | text | | | | extended | | + b | integer | | not null | 0 | plain | | +Partition of: parted FOR VALUES IN ('c') +Partition constraint: ((a IS NOT NULL) AND (a = 'c'::text)) +Partition key: RANGE (b) +Partitions: part_c_1_10 FOR VALUES FROM (1) TO (10) + +-- a level-2 partition's constraint will include the parent's expressions +\d+ part_c_1_10 + Table "public.part_c_1_10" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+----------+--------------+------------- + a | text | | | | extended | | + b | integer | | not null | 0 | plain | | +Partition of: part_c FOR VALUES FROM (1) TO (10) +Partition constraint: ((a IS NOT NULL) AND (a = 'c'::text) AND (b IS NOT NULL) AND (b >= 1) AND (b < 10)) + +-- Show partition count in the parent's describe output +-- Tempted to include \d+ output listing partitions with bound info but +-- output could vary depending on the order in which partition oids are +-- returned. 
+\d parted + Partitioned table "public.parted" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | text | | | + b | integer | | not null | 0 +Partition key: LIST (a) +Number of partitions: 3 (Use \d+ to list them.) + +\d hash_parted + Partitioned table "public.hash_parted" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | +Partition key: HASH (a) +Number of partitions: 3 (Use \d+ to list them.) + +-- check that we get the expected partition constraints +CREATE TABLE range_parted4 (a int, b int, c int) PARTITION BY RANGE (abs(a), abs(b), c); +CREATE TABLE unbounded_range_part PARTITION OF range_parted4 FOR VALUES FROM (MINVALUE, MINVALUE, MINVALUE) TO (MAXVALUE, MAXVALUE, MAXVALUE); +\d+ unbounded_range_part + Table "public.unbounded_range_part" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | integer | | | | plain | | + b | integer | | | | plain | | + c | integer | | | | plain | | +Partition of: range_parted4 FOR VALUES FROM (MINVALUE, MINVALUE, MINVALUE) TO (MAXVALUE, MAXVALUE, MAXVALUE) +Partition constraint: ((abs(a) IS NOT NULL) AND (abs(b) IS NOT NULL) AND (c IS NOT NULL)) + +DROP TABLE unbounded_range_part; +CREATE TABLE range_parted4_1 PARTITION OF range_parted4 FOR VALUES FROM (MINVALUE, MINVALUE, MINVALUE) TO (1, MAXVALUE, MAXVALUE); +\d+ range_parted4_1 + Table "public.range_parted4_1" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | integer | | | | plain | | + b | integer | | | | plain | | + c | integer | | | | plain | | +Partition of: range_parted4 FOR VALUES FROM (MINVALUE, MINVALUE, MINVALUE) TO (1, MAXVALUE, MAXVALUE) +Partition constraint: ((abs(a) IS NOT 
NULL) AND (abs(b) IS NOT NULL) AND (c IS NOT NULL) AND (abs(a) <= 1)) + +CREATE TABLE range_parted4_2 PARTITION OF range_parted4 FOR VALUES FROM (3, 4, 5) TO (6, 7, MAXVALUE); +\d+ range_parted4_2 + Table "public.range_parted4_2" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | integer | | | | plain | | + b | integer | | | | plain | | + c | integer | | | | plain | | +Partition of: range_parted4 FOR VALUES FROM (3, 4, 5) TO (6, 7, MAXVALUE) +Partition constraint: ((abs(a) IS NOT NULL) AND (abs(b) IS NOT NULL) AND (c IS NOT NULL) AND ((abs(a) > 3) OR ((abs(a) = 3) AND (abs(b) > 4)) OR ((abs(a) = 3) AND (abs(b) = 4) AND (c >= 5))) AND ((abs(a) < 6) OR ((abs(a) = 6) AND (abs(b) <= 7)))) + +CREATE TABLE range_parted4_3 PARTITION OF range_parted4 FOR VALUES FROM (6, 8, MINVALUE) TO (9, MAXVALUE, MAXVALUE); +\d+ range_parted4_3 + Table "public.range_parted4_3" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | integer | | | | plain | | + b | integer | | | | plain | | + c | integer | | | | plain | | +Partition of: range_parted4 FOR VALUES FROM (6, 8, MINVALUE) TO (9, MAXVALUE, MAXVALUE) +Partition constraint: ((abs(a) IS NOT NULL) AND (abs(b) IS NOT NULL) AND (c IS NOT NULL) AND ((abs(a) > 6) OR ((abs(a) = 6) AND (abs(b) >= 8))) AND (abs(a) <= 9)) + +DROP TABLE range_parted4; +-- user-defined operator class in partition key +CREATE FUNCTION my_int4_sort(int4,int4) RETURNS int LANGUAGE sql + AS $$ SELECT CASE WHEN $1 = $2 THEN 0 WHEN $1 > $2 THEN 1 ELSE -1 END; $$; +CREATE OPERATOR CLASS test_int4_ops FOR TYPE int4 USING btree AS + OPERATOR 1 < (int4,int4), OPERATOR 2 <= (int4,int4), + OPERATOR 3 = (int4,int4), OPERATOR 4 >= (int4,int4), + OPERATOR 5 > (int4,int4), FUNCTION 1 my_int4_sort(int4,int4); 
+CREATE TABLE partkey_t (a int4) PARTITION BY RANGE (a test_int4_ops); +CREATE TABLE partkey_t_1 PARTITION OF partkey_t FOR VALUES FROM (0) TO (1000); +INSERT INTO partkey_t VALUES (100); +INSERT INTO partkey_t VALUES (200); +-- cleanup +DROP TABLE parted, list_parted, range_parted, list_parted2, range_parted2, range_parted3; +DROP TABLE partkey_t, hash_parted, hash_parted2; +DROP OPERATOR CLASS test_int4_ops USING btree; +DROP FUNCTION my_int4_sort(int4,int4); +-- comments on partitioned tables columns +CREATE TABLE parted_col_comment (a int, b text) PARTITION BY LIST (a); +COMMENT ON TABLE parted_col_comment IS 'Am partitioned table'; +COMMENT ON COLUMN parted_col_comment.a IS 'Partition key'; +SELECT obj_description('parted_col_comment'::regclass); + obj_description +---------------------- + Am partitioned table +(1 row) + +\d+ parted_col_comment + Partitioned table "public.parted_col_comment" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+----------+--------------+--------------- + a | integer | | | | plain | | Partition key + b | text | | | | extended | | +Partition key: LIST (a) +Number of partitions: 0 + +DROP TABLE parted_col_comment; +-- list partitioning on array type column +CREATE TABLE arrlp (a int[]) PARTITION BY LIST (a); +CREATE TABLE arrlp12 PARTITION OF arrlp FOR VALUES IN ('{1}', '{2}'); +\d+ arrlp12 + Table "public.arrlp12" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+-----------+-----------+----------+---------+----------+--------------+------------- + a | integer[] | | | | extended | | +Partition of: arrlp FOR VALUES IN ('{1}', '{2}') +Partition constraint: ((a IS NOT NULL) AND ((a = '{1}'::integer[]) OR (a = '{2}'::integer[]))) + +DROP TABLE arrlp; +-- partition on boolean column +create table boolspart (a bool) partition by list (a); +create table boolspart_t partition of boolspart for values in 
(true); +create table boolspart_f partition of boolspart for values in (false); +\d+ boolspart + Partitioned table "public.boolspart" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | boolean | | | | plain | | +Partition key: LIST (a) +Partitions: boolspart_f FOR VALUES IN (false), + boolspart_t FOR VALUES IN (true) + +drop table boolspart; +-- partitions mixing temporary and permanent relations +create table perm_parted (a int) partition by list (a); +create temporary table temp_parted (a int) partition by list (a); +create table perm_part partition of temp_parted default; -- error +ERROR: cannot create a permanent relation as partition of temporary relation "temp_parted" +create temp table temp_part partition of perm_parted default; -- error +ERROR: cannot create a temporary relation as partition of permanent relation "perm_parted" +create temp table temp_part partition of temp_parted default; -- ok +drop table perm_parted cascade; +drop table temp_parted cascade; +-- check that adding partitions to a table while it is being used is prevented +create table tab_part_create (a int) partition by list (a); +create or replace function func_part_create() returns trigger + language plpgsql as $$ + begin + execute 'create table tab_part_create_1 partition of tab_part_create for values in (1)'; + return null; + end $$; +create trigger trig_part_create before insert on tab_part_create + for each statement execute procedure func_part_create(); +insert into tab_part_create values (1); +ERROR: cannot CREATE TABLE .. 
PARTITION OF "tab_part_create" because it is being used by active queries in this session +CONTEXT: SQL statement "create table tab_part_create_1 partition of tab_part_create for values in (1)" +PL/pgSQL function func_part_create() line 3 at EXECUTE +drop table tab_part_create; +drop function func_part_create(); +-- test using a volatile expression as partition bound +create table volatile_partbound_test (partkey timestamp) partition by range (partkey); +create table volatile_partbound_test1 partition of volatile_partbound_test for values from (minvalue) to (current_timestamp); +create table volatile_partbound_test2 partition of volatile_partbound_test for values from (current_timestamp) to (maxvalue); +-- this should go into the partition volatile_partbound_test2 +insert into volatile_partbound_test values (current_timestamp); +select tableoid::regclass from volatile_partbound_test; + tableoid +-------------------------- + volatile_partbound_test2 +(1 row) + +drop table volatile_partbound_test; +-- test the case where a check constraint on default partition allows +-- to avoid scanning it when adding a new partition +create table defcheck (a int, b int) partition by list (b); +create table defcheck_def (a int, c int, b int); +alter table defcheck_def drop c; +alter table defcheck attach partition defcheck_def default; +alter table defcheck_def add check (b <= 0 and b is not null); +create table defcheck_1 partition of defcheck for values in (1, null); +-- test that complex default partition constraints are enforced correctly +insert into defcheck_def values (0, 0); +create table defcheck_0 partition of defcheck for values in (0); +ERROR: updated partition constraint for default partition "defcheck_def" would be violated by some row +drop table defcheck; +-- tests of column drop with partition tables and indexes using +-- predicates and expressions. 
+create table part_column_drop ( + useless_1 int, + id int, + useless_2 int, + d int, + b int, + useless_3 int +) partition by range (id); +alter table part_column_drop drop column useless_1; +alter table part_column_drop drop column useless_2; +alter table part_column_drop drop column useless_3; +create index part_column_drop_b_pred on part_column_drop(b) where b = 1; +create index part_column_drop_b_expr on part_column_drop((b = 1)); +create index part_column_drop_d_pred on part_column_drop(d) where d = 2; +create index part_column_drop_d_expr on part_column_drop((d = 2)); +create table part_column_drop_1_10 partition of + part_column_drop for values from (1) to (10); +\d part_column_drop + Partitioned table "public.part_column_drop" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + id | integer | | | + d | integer | | | + b | integer | | | +Partition key: RANGE (id) +Indexes: + "part_column_drop_b_expr" btree ((b = 1)) + "part_column_drop_b_pred" btree (b) WHERE b = 1 + "part_column_drop_d_expr" btree ((d = 2)) + "part_column_drop_d_pred" btree (d) WHERE d = 2 +Number of partitions: 1 (Use \d+ to list them.) 
+ +\d part_column_drop_1_10 + Table "public.part_column_drop_1_10" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + id | integer | | | + d | integer | | | + b | integer | | | +Partition of: part_column_drop FOR VALUES FROM (1) TO (10) +Indexes: + "part_column_drop_1_10_b_idx" btree (b) WHERE b = 1 + "part_column_drop_1_10_d_idx" btree (d) WHERE d = 2 + "part_column_drop_1_10_expr_idx" btree ((b = 1)) + "part_column_drop_1_10_expr_idx1" btree ((d = 2)) + +drop table part_column_drop; diff --git a/src/test/regress/expected/sequence_1.out b/src/test/regress/expected/sequence_1.out new file mode 100644 index 00000000000..462e3f3caa4 --- /dev/null +++ b/src/test/regress/expected/sequence_1.out @@ -0,0 +1,824 @@ +-- +-- CREATE SEQUENCE +-- +-- various error cases +CREATE UNLOGGED SEQUENCE sequence_testx; +ERROR: unlogged sequences are not supported +CREATE SEQUENCE sequence_testx INCREMENT BY 0; +ERROR: INCREMENT must not be zero +CREATE SEQUENCE sequence_testx INCREMENT BY -1 MINVALUE 20; +ERROR: MINVALUE (20) must be less than MAXVALUE (-1) +CREATE SEQUENCE sequence_testx INCREMENT BY 1 MAXVALUE -20; +ERROR: MINVALUE (1) must be less than MAXVALUE (-20) +CREATE SEQUENCE sequence_testx INCREMENT BY -1 START 10; +ERROR: START value (10) cannot be greater than MAXVALUE (-1) +CREATE SEQUENCE sequence_testx INCREMENT BY 1 START -10; +ERROR: START value (-10) cannot be less than MINVALUE (1) +CREATE SEQUENCE sequence_testx CACHE 0; +ERROR: CACHE (0) must be greater than zero +-- OWNED BY errors +CREATE SEQUENCE sequence_testx OWNED BY nobody; -- nonsense word +ERROR: invalid OWNED BY option +HINT: Specify OWNED BY table.column or OWNED BY NONE. 
+CREATE SEQUENCE sequence_testx OWNED BY pg_class_oid_index.oid; -- not a table +ERROR: referenced relation "pg_class_oid_index" is not a table or foreign table +CREATE SEQUENCE sequence_testx OWNED BY pg_class.relname; -- not same schema +ERROR: sequence must be in same schema as table it is linked to +CREATE TABLE sequence_test_table (a int); +CREATE SEQUENCE sequence_testx OWNED BY sequence_test_table.b; -- wrong column +ERROR: column "b" of relation "sequence_test_table" does not exist +DROP TABLE sequence_test_table; +-- sequence data types +CREATE SEQUENCE sequence_test5 AS integer; +CREATE SEQUENCE sequence_test6 AS smallint; +CREATE SEQUENCE sequence_test7 AS bigint; +CREATE SEQUENCE sequence_test8 AS integer MAXVALUE 100000; +CREATE SEQUENCE sequence_test9 AS integer INCREMENT BY -1; +CREATE SEQUENCE sequence_test10 AS integer MINVALUE -100000 START 1; +CREATE SEQUENCE sequence_test11 AS smallint; +CREATE SEQUENCE sequence_test12 AS smallint INCREMENT -1; +CREATE SEQUENCE sequence_test13 AS smallint MINVALUE -32768; +CREATE SEQUENCE sequence_test14 AS smallint MAXVALUE 32767 INCREMENT -1; +CREATE SEQUENCE sequence_testx AS text; +ERROR: sequence type must be smallint, integer, or bigint +CREATE SEQUENCE sequence_testx AS nosuchtype; +ERROR: type "nosuchtype" does not exist +LINE 1: CREATE SEQUENCE sequence_testx AS nosuchtype; + ^ +CREATE SEQUENCE sequence_testx AS smallint MAXVALUE 100000; +ERROR: MAXVALUE (100000) is out of range for sequence data type smallint +CREATE SEQUENCE sequence_testx AS smallint MINVALUE -100000; +ERROR: MINVALUE (-100000) is out of range for sequence data type smallint +ALTER SEQUENCE sequence_test5 AS smallint; -- success, max will be adjusted +ALTER SEQUENCE sequence_test8 AS smallint; -- fail, max has to be adjusted +ERROR: MAXVALUE (100000) is out of range for sequence data type smallint +ALTER SEQUENCE sequence_test8 AS smallint MAXVALUE 20000; -- ok now +ALTER SEQUENCE sequence_test9 AS smallint; -- success, min will be 
adjusted +ALTER SEQUENCE sequence_test10 AS smallint; -- fail, min has to be adjusted +ERROR: MINVALUE (-100000) is out of range for sequence data type smallint +ALTER SEQUENCE sequence_test10 AS smallint MINVALUE -20000; -- ok now +ALTER SEQUENCE sequence_test11 AS int; -- max will be adjusted +ALTER SEQUENCE sequence_test12 AS int; -- min will be adjusted +ALTER SEQUENCE sequence_test13 AS int; -- min and max will be adjusted +ALTER SEQUENCE sequence_test14 AS int; -- min and max will be adjusted +--- +--- test creation of SERIAL column +--- +CREATE TABLE serialTest1 (f1 text, f2 serial); +INSERT INTO serialTest1 VALUES ('foo'); +INSERT INTO serialTest1 VALUES ('bar'); +INSERT INTO serialTest1 VALUES ('force', 100); +INSERT INTO serialTest1 VALUES ('wrong', NULL); +ERROR: null value in column "f2" of relation "serialtest1" violates not-null constraint +DETAIL: Failing row contains (wrong, null). +SELECT * FROM serialTest1; + f1 | f2 +-------+----- + foo | 1 + bar | 2 + force | 100 +(3 rows) + +SELECT pg_get_serial_sequence('serialTest1', 'f2'); + pg_get_serial_sequence +--------------------------- + public.serialtest1_f2_seq +(1 row) + +-- test smallserial / bigserial +CREATE TABLE serialTest2 (f1 text, f2 serial, f3 smallserial, f4 serial2, + f5 bigserial, f6 serial8); +INSERT INTO serialTest2 (f1) + VALUES ('test_defaults'); +INSERT INTO serialTest2 (f1, f2, f3, f4, f5, f6) + VALUES ('test_max_vals', 2147483647, 32767, 32767, 9223372036854775807, + 9223372036854775807), + ('test_min_vals', -2147483648, -32768, -32768, -9223372036854775808, + -9223372036854775808); +-- All these INSERTs should fail: +INSERT INTO serialTest2 (f1, f3) + VALUES ('bogus', -32769); +ERROR: smallint out of range +INSERT INTO serialTest2 (f1, f4) + VALUES ('bogus', -32769); +ERROR: smallint out of range +INSERT INTO serialTest2 (f1, f3) + VALUES ('bogus', 32768); +ERROR: smallint out of range +INSERT INTO serialTest2 (f1, f4) + VALUES ('bogus', 32768); +ERROR: smallint out of range 
+INSERT INTO serialTest2 (f1, f5) + VALUES ('bogus', -9223372036854775809); +ERROR: bigint out of range +INSERT INTO serialTest2 (f1, f6) + VALUES ('bogus', -9223372036854775809); +ERROR: bigint out of range +INSERT INTO serialTest2 (f1, f5) + VALUES ('bogus', 9223372036854775808); +ERROR: bigint out of range +INSERT INTO serialTest2 (f1, f6) + VALUES ('bogus', 9223372036854775808); +ERROR: bigint out of range +SELECT * FROM serialTest2 ORDER BY f2 ASC; + f1 | f2 | f3 | f4 | f5 | f6 +---------------+-------------+--------+--------+----------------------+---------------------- + test_min_vals | -2147483648 | -32768 | -32768 | -9223372036854775808 | -9223372036854775808 + test_defaults | 1 | 1 | 1 | 1 | 1 + test_max_vals | 2147483647 | 32767 | 32767 | 9223372036854775807 | 9223372036854775807 +(3 rows) + +SELECT nextval('serialTest2_f2_seq'); + nextval +--------- + 2 +(1 row) + +SELECT nextval('serialTest2_f3_seq'); + nextval +--------- + 2 +(1 row) + +SELECT nextval('serialTest2_f4_seq'); + nextval +--------- + 2 +(1 row) + +SELECT nextval('serialTest2_f5_seq'); + nextval +--------- + 2 +(1 row) + +SELECT nextval('serialTest2_f6_seq'); + nextval +--------- + 2 +(1 row) + +-- basic sequence operations using both text and oid references +CREATE SEQUENCE sequence_test; +CREATE SEQUENCE IF NOT EXISTS sequence_test; +NOTICE: relation "sequence_test" already exists, skipping +SELECT nextval('sequence_test'::text); + nextval +--------- + 1 +(1 row) + +SELECT nextval('sequence_test'::regclass); + nextval +--------- + 2 +(1 row) + +SELECT currval('sequence_test'::text); + currval +--------- + 2 +(1 row) + +SELECT currval('sequence_test'::regclass); + currval +--------- + 2 +(1 row) + +SELECT setval('sequence_test'::text, 32); + setval +-------- + 32 +(1 row) + +SELECT nextval('sequence_test'::regclass); + nextval +--------- + 33 +(1 row) + +SELECT setval('sequence_test'::text, 99, false); + setval +-------- + 99 +(1 row) + +SELECT nextval('sequence_test'::regclass); + 
nextval +--------- + 99 +(1 row) + +SELECT setval('sequence_test'::regclass, 32); + setval +-------- + 32 +(1 row) + +SELECT nextval('sequence_test'::text); + nextval +--------- + 33 +(1 row) + +SELECT setval('sequence_test'::regclass, 99, false); + setval +-------- + 99 +(1 row) + +SELECT nextval('sequence_test'::text); + nextval +--------- + 99 +(1 row) + +DISCARD SEQUENCES; +SELECT currval('sequence_test'::regclass); +ERROR: currval of sequence "sequence_test" is not yet defined in this session +DROP SEQUENCE sequence_test; +-- renaming sequences +CREATE SEQUENCE foo_seq; +ALTER TABLE foo_seq RENAME TO foo_seq_new; +SELECT * FROM foo_seq_new; + last_value | log_cnt | is_called +------------+---------+----------- + 1 | 0 | f +(1 row) + +SELECT nextval('foo_seq_new'); + nextval +--------- + 1 +(1 row) + +SELECT nextval('foo_seq_new'); + nextval +--------- + 2 +(1 row) + +-- log_cnt can be higher if there is a checkpoint just at the right +-- time, so just test for the expected range +SELECT last_value, log_cnt IN (31, 32) AS log_cnt_ok, is_called FROM foo_seq_new; + last_value | log_cnt_ok | is_called +------------+------------+----------- + 2 | f | t +(1 row) + +DROP SEQUENCE foo_seq_new; +-- renaming serial sequences +ALTER TABLE serialtest1_f2_seq RENAME TO serialtest1_f2_foo; +INSERT INTO serialTest1 VALUES ('more'); +SELECT * FROM serialTest1; + f1 | f2 +-------+----- + foo | 1 + bar | 2 + force | 100 + more | 3 +(4 rows) + +-- +-- Check dependencies of serial and ordinary sequences +-- +CREATE TEMP SEQUENCE myseq2; +CREATE TEMP SEQUENCE myseq3; +CREATE TEMP TABLE t1 ( + f1 serial, + f2 int DEFAULT nextval('myseq2'), + f3 int DEFAULT nextval('myseq3'::text) +); +-- Both drops should fail, but with different error messages: +DROP SEQUENCE t1_f1_seq; +ERROR: cannot drop sequence t1_f1_seq because other objects depend on it +DETAIL: default value for column f1 of table t1 depends on sequence t1_f1_seq +HINT: Use DROP ... 
CASCADE to drop the dependent objects too. +DROP SEQUENCE myseq2; +ERROR: cannot drop sequence myseq2 because other objects depend on it +DETAIL: default value for column f2 of table t1 depends on sequence myseq2 +HINT: Use DROP ... CASCADE to drop the dependent objects too. +-- This however will work: +DROP SEQUENCE myseq3; +DROP TABLE t1; +-- Fails because no longer existent: +DROP SEQUENCE t1_f1_seq; +ERROR: sequence "t1_f1_seq" does not exist +-- Now OK: +DROP SEQUENCE myseq2; +-- +-- Alter sequence +-- +ALTER SEQUENCE IF EXISTS sequence_test2 RESTART WITH 24 + INCREMENT BY 4 MAXVALUE 36 MINVALUE 5 CYCLE; +NOTICE: relation "sequence_test2" does not exist, skipping +ALTER SEQUENCE serialTest1 CYCLE; -- error, not a sequence +ERROR: "serialtest1" is not a sequence +CREATE SEQUENCE sequence_test2 START WITH 32; +CREATE SEQUENCE sequence_test4 INCREMENT BY -1; +SELECT nextval('sequence_test2'); + nextval +--------- + 32 +(1 row) + +SELECT nextval('sequence_test4'); + nextval +--------- + -1 +(1 row) + +ALTER SEQUENCE sequence_test2 RESTART; +SELECT nextval('sequence_test2'); + nextval +--------- + 32 +(1 row) + +ALTER SEQUENCE sequence_test2 RESTART WITH 0; -- error +ERROR: RESTART value (0) cannot be less than MINVALUE (1) +ALTER SEQUENCE sequence_test4 RESTART WITH 40; -- error +ERROR: RESTART value (40) cannot be greater than MAXVALUE (-1) +-- test CYCLE and NO CYCLE +ALTER SEQUENCE sequence_test2 RESTART WITH 24 + INCREMENT BY 4 MAXVALUE 36 MINVALUE 5 CYCLE; +SELECT nextval('sequence_test2'); + nextval +--------- + 24 +(1 row) + +SELECT nextval('sequence_test2'); + nextval +--------- + 28 +(1 row) + +SELECT nextval('sequence_test2'); + nextval +--------- + 32 +(1 row) + +SELECT nextval('sequence_test2'); + nextval +--------- + 36 +(1 row) + +SELECT nextval('sequence_test2'); -- cycled + nextval +--------- + 5 +(1 row) + +ALTER SEQUENCE sequence_test2 RESTART WITH 24 + NO CYCLE; +SELECT nextval('sequence_test2'); + nextval +--------- + 24 +(1 row) + +SELECT 
nextval('sequence_test2'); + nextval +--------- + 28 +(1 row) + +SELECT nextval('sequence_test2'); + nextval +--------- + 32 +(1 row) + +SELECT nextval('sequence_test2'); + nextval +--------- + 36 +(1 row) + +SELECT nextval('sequence_test2'); -- error +ERROR: nextval: reached maximum value of sequence "sequence_test2" (36) +ALTER SEQUENCE sequence_test2 RESTART WITH -24 START WITH -24 + INCREMENT BY -4 MINVALUE -36 MAXVALUE -5 CYCLE; +SELECT nextval('sequence_test2'); + nextval +--------- + -24 +(1 row) + +SELECT nextval('sequence_test2'); + nextval +--------- + -28 +(1 row) + +SELECT nextval('sequence_test2'); + nextval +--------- + -32 +(1 row) + +SELECT nextval('sequence_test2'); + nextval +--------- + -36 +(1 row) + +SELECT nextval('sequence_test2'); -- cycled + nextval +--------- + -5 +(1 row) + +ALTER SEQUENCE sequence_test2 RESTART WITH -24 + NO CYCLE; +SELECT nextval('sequence_test2'); + nextval +--------- + -24 +(1 row) + +SELECT nextval('sequence_test2'); + nextval +--------- + -28 +(1 row) + +SELECT nextval('sequence_test2'); + nextval +--------- + -32 +(1 row) + +SELECT nextval('sequence_test2'); + nextval +--------- + -36 +(1 row) + +SELECT nextval('sequence_test2'); -- error +ERROR: nextval: reached minimum value of sequence "sequence_test2" (-36) +-- reset +ALTER SEQUENCE IF EXISTS sequence_test2 RESTART WITH 32 START WITH 32 + INCREMENT BY 4 MAXVALUE 36 MINVALUE 5 CYCLE; +SELECT setval('sequence_test2', -100); -- error +ERROR: setval: value -100 is out of bounds for sequence "sequence_test2" (5..36) +SELECT setval('sequence_test2', 100); -- error +ERROR: setval: value 100 is out of bounds for sequence "sequence_test2" (5..36) +SELECT setval('sequence_test2', 5); + setval +-------- + 5 +(1 row) + +CREATE SEQUENCE sequence_test3; -- not read from, to test is_called +-- Information schema +SELECT * FROM information_schema.sequences + WHERE sequence_name ~ ANY(ARRAY['sequence_test', 'serialtest']) + ORDER BY sequence_name ASC; + sequence_catalog | 
sequence_schema | sequence_name | data_type | numeric_precision | numeric_precision_radix | numeric_scale | start_value | minimum_value | maximum_value | increment | cycle_option +------------------+-----------------+--------------------+-----------+-------------------+-------------------------+---------------+-------------+----------------------+---------------------+-----------+-------------- + regression | public | sequence_test10 | smallint | 16 | 2 | 0 | 1 | -20000 | 32767 | 1 | NO + regression | public | sequence_test11 | integer | 32 | 2 | 0 | 1 | 1 | 2147483647 | 1 | NO + regression | public | sequence_test12 | integer | 32 | 2 | 0 | -1 | -2147483648 | -1 | -1 | NO + regression | public | sequence_test13 | integer | 32 | 2 | 0 | -32768 | -2147483648 | 2147483647 | 1 | NO + regression | public | sequence_test14 | integer | 32 | 2 | 0 | 32767 | -2147483648 | 2147483647 | -1 | NO + regression | public | sequence_test2 | bigint | 64 | 2 | 0 | 32 | 5 | 36 | 4 | YES + regression | public | sequence_test3 | bigint | 64 | 2 | 0 | 1 | 1 | 9223372036854775807 | 1 | NO + regression | public | sequence_test4 | bigint | 64 | 2 | 0 | -1 | -9223372036854775808 | -1 | -1 | NO + regression | public | sequence_test5 | smallint | 16 | 2 | 0 | 1 | 1 | 32767 | 1 | NO + regression | public | sequence_test6 | smallint | 16 | 2 | 0 | 1 | 1 | 32767 | 1 | NO + regression | public | sequence_test7 | bigint | 64 | 2 | 0 | 1 | 1 | 9223372036854775807 | 1 | NO + regression | public | sequence_test8 | smallint | 16 | 2 | 0 | 1 | 1 | 20000 | 1 | NO + regression | public | sequence_test9 | smallint | 16 | 2 | 0 | -1 | -32768 | -1 | -1 | NO + regression | public | serialtest1_f2_foo | integer | 32 | 2 | 0 | 1 | 1 | 2147483647 | 1 | NO + regression | public | serialtest2_f2_seq | integer | 32 | 2 | 0 | 1 | 1 | 2147483647 | 1 | NO + regression | public | serialtest2_f3_seq | smallint | 16 | 2 | 0 | 1 | 1 | 32767 | 1 | NO + regression | public | serialtest2_f4_seq | smallint | 16 | 2 | 0 | 1 | 
1 | 32767 | 1 | NO + regression | public | serialtest2_f5_seq | bigint | 64 | 2 | 0 | 1 | 1 | 9223372036854775807 | 1 | NO + regression | public | serialtest2_f6_seq | bigint | 64 | 2 | 0 | 1 | 1 | 9223372036854775807 | 1 | NO +(19 rows) + +SELECT schemaname, sequencename, start_value, min_value, max_value, increment_by, cycle, cache_size, last_value +FROM pg_sequences +WHERE sequencename ~ ANY(ARRAY['sequence_test', 'serialtest']) + ORDER BY sequencename ASC; + schemaname | sequencename | start_value | min_value | max_value | increment_by | cycle | cache_size | last_value +------------+--------------------+-------------+----------------------+---------------------+--------------+-------+------------+------------ + public | sequence_test10 | 1 | -20000 | 32767 | 1 | f | 1 | + public | sequence_test11 | 1 | 1 | 2147483647 | 1 | f | 1 | + public | sequence_test12 | -1 | -2147483648 | -1 | -1 | f | 1 | + public | sequence_test13 | -32768 | -2147483648 | 2147483647 | 1 | f | 1 | + public | sequence_test14 | 32767 | -2147483648 | 2147483647 | -1 | f | 1 | + public | sequence_test2 | 32 | 5 | 36 | 4 | t | 1 | 5 + public | sequence_test3 | 1 | 1 | 9223372036854775807 | 1 | f | 1 | + public | sequence_test4 | -1 | -9223372036854775808 | -1 | -1 | f | 1 | -1 + public | sequence_test5 | 1 | 1 | 32767 | 1 | f | 1 | + public | sequence_test6 | 1 | 1 | 32767 | 1 | f | 1 | + public | sequence_test7 | 1 | 1 | 9223372036854775807 | 1 | f | 1 | + public | sequence_test8 | 1 | 1 | 20000 | 1 | f | 1 | + public | sequence_test9 | -1 | -32768 | -1 | -1 | f | 1 | + public | serialtest1_f2_foo | 1 | 1 | 2147483647 | 1 | f | 1 | 3 + public | serialtest2_f2_seq | 1 | 1 | 2147483647 | 1 | f | 1 | 2 + public | serialtest2_f3_seq | 1 | 1 | 32767 | 1 | f | 1 | 2 + public | serialtest2_f4_seq | 1 | 1 | 32767 | 1 | f | 1 | 2 + public | serialtest2_f5_seq | 1 | 1 | 9223372036854775807 | 1 | f | 1 | 2 + public | serialtest2_f6_seq | 1 | 1 | 9223372036854775807 | 1 | f | 1 | 2 +(19 rows) + +SELECT 
* FROM pg_sequence_parameters('sequence_test4'::regclass); + start_value | minimum_value | maximum_value | increment | cycle_option | cache_size | data_type +-------------+----------------------+---------------+-----------+--------------+------------+----------- + -1 | -9223372036854775808 | -1 | -1 | f | 1 | 20 +(1 row) + +\d sequence_test4 + Sequence "public.sequence_test4" + Type | Start | Minimum | Maximum | Increment | Cycles? | Cache +--------+-------+----------------------+---------+-----------+---------+------- + bigint | -1 | -9223372036854775808 | -1 | -1 | no | 1 + +\d serialtest2_f2_seq + Sequence "public.serialtest2_f2_seq" + Type | Start | Minimum | Maximum | Increment | Cycles? | Cache +---------+-------+---------+------------+-----------+---------+------- + integer | 1 | 1 | 2147483647 | 1 | no | 1 +Owned by: public.serialtest2.f2 + +-- Test comments +COMMENT ON SEQUENCE asdf IS 'won''t work'; +ERROR: relation "asdf" does not exist +COMMENT ON SEQUENCE sequence_test2 IS 'will work'; +COMMENT ON SEQUENCE sequence_test2 IS NULL; +-- Test lastval() +CREATE SEQUENCE seq; +SELECT nextval('seq'); + nextval +--------- + 1 +(1 row) + +SELECT lastval(); + lastval +--------- + 1 +(1 row) + +SELECT setval('seq', 99); + setval +-------- + 99 +(1 row) + +SELECT lastval(); + lastval +--------- + 99 +(1 row) + +DISCARD SEQUENCES; +SELECT lastval(); +ERROR: lastval is not yet defined in this session +CREATE SEQUENCE seq2; +SELECT nextval('seq2'); + nextval +--------- + 1 +(1 row) + +SELECT lastval(); + lastval +--------- + 1 +(1 row) + +DROP SEQUENCE seq2; +-- should fail +SELECT lastval(); +ERROR: lastval is not yet defined in this session +-- Test sequences in read-only transactions +CREATE TEMPORARY SEQUENCE sequence_test_temp1; +START TRANSACTION READ ONLY; +SELECT nextval('sequence_test_temp1'); -- ok + nextval +--------- + 1 +(1 row) + +SELECT nextval('sequence_test2'); -- error +ERROR: cannot execute nextval() in a read-only transaction +ROLLBACK; +START 
TRANSACTION READ ONLY; +SELECT setval('sequence_test_temp1', 1); -- ok + setval +-------- + 1 +(1 row) + +SELECT setval('sequence_test2', 1); -- error +ERROR: cannot execute setval() in a read-only transaction +ROLLBACK; +-- privileges tests +CREATE USER regress_seq_user; +-- nextval +BEGIN; +SET LOCAL SESSION AUTHORIZATION regress_seq_user; +CREATE SEQUENCE seq3; +REVOKE ALL ON seq3 FROM regress_seq_user; +GRANT SELECT ON seq3 TO regress_seq_user; +SELECT nextval('seq3'); +ERROR: permission denied for sequence seq3 +ROLLBACK; +BEGIN; +SET LOCAL SESSION AUTHORIZATION regress_seq_user; +CREATE SEQUENCE seq3; +REVOKE ALL ON seq3 FROM regress_seq_user; +GRANT UPDATE ON seq3 TO regress_seq_user; +SELECT nextval('seq3'); + nextval +--------- + 1 +(1 row) + +ROLLBACK; +BEGIN; +SET LOCAL SESSION AUTHORIZATION regress_seq_user; +CREATE SEQUENCE seq3; +REVOKE ALL ON seq3 FROM regress_seq_user; +GRANT USAGE ON seq3 TO regress_seq_user; +SELECT nextval('seq3'); + nextval +--------- + 1 +(1 row) + +ROLLBACK; +-- currval +BEGIN; +SET LOCAL SESSION AUTHORIZATION regress_seq_user; +CREATE SEQUENCE seq3; +SELECT nextval('seq3'); + nextval +--------- + 1 +(1 row) + +REVOKE ALL ON seq3 FROM regress_seq_user; +GRANT SELECT ON seq3 TO regress_seq_user; +SELECT currval('seq3'); + currval +--------- + 1 +(1 row) + +ROLLBACK; +BEGIN; +SET LOCAL SESSION AUTHORIZATION regress_seq_user; +CREATE SEQUENCE seq3; +SELECT nextval('seq3'); + nextval +--------- + 1 +(1 row) + +REVOKE ALL ON seq3 FROM regress_seq_user; +GRANT UPDATE ON seq3 TO regress_seq_user; +SELECT currval('seq3'); +ERROR: permission denied for sequence seq3 +ROLLBACK; +BEGIN; +SET LOCAL SESSION AUTHORIZATION regress_seq_user; +CREATE SEQUENCE seq3; +SELECT nextval('seq3'); + nextval +--------- + 1 +(1 row) + +REVOKE ALL ON seq3 FROM regress_seq_user; +GRANT USAGE ON seq3 TO regress_seq_user; +SELECT currval('seq3'); + currval +--------- + 1 +(1 row) + +ROLLBACK; +-- lastval +BEGIN; +SET LOCAL SESSION AUTHORIZATION 
regress_seq_user; +CREATE SEQUENCE seq3; +SELECT nextval('seq3'); + nextval +--------- + 1 +(1 row) + +REVOKE ALL ON seq3 FROM regress_seq_user; +GRANT SELECT ON seq3 TO regress_seq_user; +SELECT lastval(); + lastval +--------- + 1 +(1 row) + +ROLLBACK; +BEGIN; +SET LOCAL SESSION AUTHORIZATION regress_seq_user; +CREATE SEQUENCE seq3; +SELECT nextval('seq3'); + nextval +--------- + 1 +(1 row) + +REVOKE ALL ON seq3 FROM regress_seq_user; +GRANT UPDATE ON seq3 TO regress_seq_user; +SELECT lastval(); +ERROR: permission denied for sequence seq3 +ROLLBACK; +BEGIN; +SET LOCAL SESSION AUTHORIZATION regress_seq_user; +CREATE SEQUENCE seq3; +SELECT nextval('seq3'); + nextval +--------- + 1 +(1 row) + +REVOKE ALL ON seq3 FROM regress_seq_user; +GRANT USAGE ON seq3 TO regress_seq_user; +SELECT lastval(); + lastval +--------- + 1 +(1 row) + +ROLLBACK; +-- setval +BEGIN; +SET LOCAL SESSION AUTHORIZATION regress_seq_user; +CREATE SEQUENCE seq3; +REVOKE ALL ON seq3 FROM regress_seq_user; +SAVEPOINT save; +SELECT setval('seq3', 5); +ERROR: permission denied for sequence seq3 +ROLLBACK TO save; +GRANT UPDATE ON seq3 TO regress_seq_user; +SELECT setval('seq3', 5); + setval +-------- + 5 +(1 row) + +SELECT nextval('seq3'); + nextval +--------- + 6 +(1 row) + +ROLLBACK; +-- ALTER SEQUENCE +BEGIN; +SET LOCAL SESSION AUTHORIZATION regress_seq_user; +ALTER SEQUENCE sequence_test2 START WITH 1; +ERROR: must be owner of sequence sequence_test2 +ROLLBACK; +-- Sequences should get wiped out as well: +DROP TABLE serialTest1, serialTest2; +-- Make sure sequences are gone: +SELECT * FROM information_schema.sequences WHERE sequence_name IN + ('sequence_test2', 'serialtest2_f2_seq', 'serialtest2_f3_seq', + 'serialtest2_f4_seq', 'serialtest2_f5_seq', 'serialtest2_f6_seq') + ORDER BY sequence_name ASC; + sequence_catalog | sequence_schema | sequence_name | data_type | numeric_precision | numeric_precision_radix | numeric_scale | start_value | minimum_value | maximum_value | increment | cycle_option 
+------------------+-----------------+----------------+-----------+-------------------+-------------------------+---------------+-------------+---------------+---------------+-----------+-------------- + regression | public | sequence_test2 | bigint | 64 | 2 | 0 | 32 | 5 | 36 | 4 | YES +(1 row) + +DROP USER regress_seq_user; +DROP SEQUENCE seq; +-- cache tests +CREATE SEQUENCE test_seq1 CACHE 10; +SELECT nextval('test_seq1'); + nextval +--------- + 1 +(1 row) + +SELECT nextval('test_seq1'); + nextval +--------- + 2 +(1 row) + +SELECT nextval('test_seq1'); + nextval +--------- + 3 +(1 row) + +DROP SEQUENCE test_seq1; diff --git a/src/test/regress/output/tablespace_1.source b/src/test/regress/output/tablespace_1.source new file mode 100644 index 00000000000..1c3b75cb6d1 --- /dev/null +++ b/src/test/regress/output/tablespace_1.source @@ -0,0 +1,941 @@ +-- create a tablespace using WITH clause +CREATE TABLESPACE regress_tblspacewith LOCATION '@testtablespace@' WITH (some_nonexistent_parameter = true); -- fail +ERROR: unrecognized parameter "some_nonexistent_parameter" +CREATE TABLESPACE regress_tblspacewith LOCATION '@testtablespace@' WITH (random_page_cost = 3.0); -- ok +-- check to see the parameter was used +SELECT spcoptions FROM pg_tablespace WHERE spcname = 'regress_tblspacewith'; + spcoptions +------------------------ + {random_page_cost=3.0} +(1 row) + +-- drop the tablespace so we can re-use the location +DROP TABLESPACE regress_tblspacewith; +-- create a tablespace we can use +CREATE TABLESPACE regress_tblspace LOCATION '@testtablespace@'; +-- try setting and resetting some properties for the new tablespace +ALTER TABLESPACE regress_tblspace SET (random_page_cost = 1.0, seq_page_cost = 1.1); +ALTER TABLESPACE regress_tblspace SET (some_nonexistent_parameter = true); -- fail +ERROR: unrecognized parameter "some_nonexistent_parameter" +ALTER TABLESPACE regress_tblspace RESET (random_page_cost = 2.0); -- fail +ERROR: RESET must not include values for parameters 
+ALTER TABLESPACE regress_tblspace RESET (random_page_cost, effective_io_concurrency); -- ok +-- REINDEX (TABLESPACE) +-- catalogs and system tablespaces +-- system catalog, fail +REINDEX (TABLESPACE regress_tblspace) TABLE pg_am; +ERROR: cannot move system relation "pg_am_name_index" +REINDEX (TABLESPACE regress_tblspace) TABLE CONCURRENTLY pg_am; +ERROR: cannot reindex system catalogs concurrently +-- shared catalog, fail +REINDEX (TABLESPACE regress_tblspace) TABLE pg_authid; +ERROR: cannot move system relation "pg_authid_rolname_index" +REINDEX (TABLESPACE regress_tblspace) TABLE CONCURRENTLY pg_authid; +ERROR: cannot reindex system catalogs concurrently +-- toast relations, fail +REINDEX (TABLESPACE regress_tblspace) INDEX pg_toast.pg_toast_1260_index; +ERROR: cannot move system relation "pg_toast_1260_index" +REINDEX (TABLESPACE regress_tblspace) INDEX CONCURRENTLY pg_toast.pg_toast_1260_index; +ERROR: cannot reindex system catalogs concurrently +REINDEX (TABLESPACE regress_tblspace) TABLE pg_toast.pg_toast_1260; +ERROR: cannot move system relation "pg_toast_1260_index" +REINDEX (TABLESPACE regress_tblspace) TABLE CONCURRENTLY pg_toast.pg_toast_1260; +ERROR: cannot reindex system catalogs concurrently +-- system catalog, fail +REINDEX (TABLESPACE pg_global) TABLE pg_authid; +ERROR: cannot move system relation "pg_authid_rolname_index" +REINDEX (TABLESPACE pg_global) TABLE CONCURRENTLY pg_authid; +ERROR: cannot reindex system catalogs concurrently +-- table with toast relation +CREATE TABLE regress_tblspace_test_tbl (num1 bigint, num2 double precision, t text); +INSERT INTO regress_tblspace_test_tbl (num1, num2, t) + SELECT round(random()*100), random(), 'text' + FROM generate_series(1, 10) s(i); +CREATE INDEX regress_tblspace_test_tbl_idx ON regress_tblspace_test_tbl (num1); +-- move to global tablespace, fail +REINDEX (TABLESPACE pg_global) INDEX regress_tblspace_test_tbl_idx; +ERROR: only shared relations can be placed in pg_global tablespace +REINDEX 
(TABLESPACE pg_global) INDEX CONCURRENTLY regress_tblspace_test_tbl_idx; +ERROR: cannot move non-shared relation to tablespace "pg_global" +-- check transactional behavior of REINDEX (TABLESPACE) +BEGIN; +REINDEX (TABLESPACE regress_tblspace) INDEX regress_tblspace_test_tbl_idx; +REINDEX (TABLESPACE regress_tblspace) TABLE regress_tblspace_test_tbl; +ROLLBACK; +-- no relation moved to the new tablespace +SELECT c.relname FROM pg_class c, pg_tablespace s + WHERE c.reltablespace = s.oid AND s.spcname = 'regress_tblspace'; + relname +--------- +(0 rows) + +-- check that all indexes are moved to a new tablespace with different +-- relfilenode. +-- Save first the existing relfilenode for the toast and main relations. +SELECT relfilenode as main_filenode FROM pg_class + WHERE relname = 'regress_tblspace_test_tbl_idx' \gset +SELECT relfilenode as toast_filenode FROM pg_class + WHERE oid = + (SELECT i.indexrelid + FROM pg_class c, + pg_index i + WHERE i.indrelid = c.reltoastrelid AND + c.relname = 'regress_tblspace_test_tbl') \gset +REINDEX (TABLESPACE regress_tblspace) TABLE regress_tblspace_test_tbl; +SELECT c.relname FROM pg_class c, pg_tablespace s + WHERE c.reltablespace = s.oid AND s.spcname = 'regress_tblspace' + ORDER BY c.relname; + relname +------------------------------- + regress_tblspace_test_tbl_idx +(1 row) + +ALTER TABLE regress_tblspace_test_tbl SET TABLESPACE regress_tblspace; +ALTER TABLE regress_tblspace_test_tbl SET TABLESPACE pg_default; +SELECT c.relname FROM pg_class c, pg_tablespace s + WHERE c.reltablespace = s.oid AND s.spcname = 'regress_tblspace' + ORDER BY c.relname; + relname +------------------------------- + regress_tblspace_test_tbl_idx +(1 row) + +-- Move back to the default tablespace. 
+ALTER INDEX regress_tblspace_test_tbl_idx SET TABLESPACE pg_default; +SELECT c.relname FROM pg_class c, pg_tablespace s + WHERE c.reltablespace = s.oid AND s.spcname = 'regress_tblspace' + ORDER BY c.relname; + relname +--------- +(0 rows) + +REINDEX (TABLESPACE regress_tblspace, CONCURRENTLY) TABLE regress_tblspace_test_tbl; +SELECT c.relname FROM pg_class c, pg_tablespace s + WHERE c.reltablespace = s.oid AND s.spcname = 'regress_tblspace' + ORDER BY c.relname; + relname +------------------------------- + regress_tblspace_test_tbl_idx +(1 row) + +SELECT relfilenode = :main_filenode AS main_same FROM pg_class + WHERE relname = 'regress_tblspace_test_tbl_idx'; + main_same +----------- + f +(1 row) + +SELECT relfilenode = :toast_filenode as toast_same FROM pg_class + WHERE oid = + (SELECT i.indexrelid + FROM pg_class c, + pg_index i + WHERE i.indrelid = c.reltoastrelid AND + c.relname = 'regress_tblspace_test_tbl'); + toast_same +------------ + f +(1 row) + +DROP TABLE regress_tblspace_test_tbl; +-- REINDEX (TABLESPACE) with partitions +-- Create a partition tree and check the set of relations reindexed +-- with their new tablespace. +CREATE TABLE tbspace_reindex_part (c1 int, c2 int) PARTITION BY RANGE (c1); +CREATE TABLE tbspace_reindex_part_0 PARTITION OF tbspace_reindex_part + FOR VALUES FROM (0) TO (10) PARTITION BY list (c2); +CREATE TABLE tbspace_reindex_part_0_1 PARTITION OF tbspace_reindex_part_0 + FOR VALUES IN (1); +CREATE TABLE tbspace_reindex_part_0_2 PARTITION OF tbspace_reindex_part_0 + FOR VALUES IN (2); +-- This partitioned table will have no partitions. 
+CREATE TABLE tbspace_reindex_part_10 PARTITION OF tbspace_reindex_part + FOR VALUES FROM (10) TO (20) PARTITION BY list (c2); +-- Create some partitioned indexes +CREATE INDEX tbspace_reindex_part_index ON ONLY tbspace_reindex_part (c1); +CREATE INDEX tbspace_reindex_part_index_0 ON ONLY tbspace_reindex_part_0 (c1); +ALTER INDEX tbspace_reindex_part_index ATTACH PARTITION tbspace_reindex_part_index_0; +-- This partitioned index will have no partitions. +CREATE INDEX tbspace_reindex_part_index_10 ON ONLY tbspace_reindex_part_10 (c1); +ALTER INDEX tbspace_reindex_part_index ATTACH PARTITION tbspace_reindex_part_index_10; +CREATE INDEX tbspace_reindex_part_index_0_1 ON ONLY tbspace_reindex_part_0_1 (c1); +ALTER INDEX tbspace_reindex_part_index_0 ATTACH PARTITION tbspace_reindex_part_index_0_1; +CREATE INDEX tbspace_reindex_part_index_0_2 ON ONLY tbspace_reindex_part_0_2 (c1); +ALTER INDEX tbspace_reindex_part_index_0 ATTACH PARTITION tbspace_reindex_part_index_0_2; +SELECT relid, parentrelid, level FROM pg_partition_tree('tbspace_reindex_part_index') + ORDER BY relid, level; + relid | parentrelid | level +--------------------------------+------------------------------+------- + tbspace_reindex_part_index | | 0 + tbspace_reindex_part_index_0 | tbspace_reindex_part_index | 1 + tbspace_reindex_part_index_10 | tbspace_reindex_part_index | 1 + tbspace_reindex_part_index_0_1 | tbspace_reindex_part_index_0 | 2 + tbspace_reindex_part_index_0_2 | tbspace_reindex_part_index_0 | 2 +(5 rows) + +-- Track the original tablespace, relfilenode and OID of each index +-- in the tree. +CREATE TEMP TABLE reindex_temp_before AS + SELECT oid, relname, relfilenode, reltablespace + FROM pg_class + WHERE relname ~ 'tbspace_reindex_part_index'; +REINDEX (TABLESPACE regress_tblspace, CONCURRENTLY) TABLE tbspace_reindex_part; +-- REINDEX CONCURRENTLY changes the OID of the old relation, hence a check +-- based on the relation name below. 
+SELECT b.relname, + CASE WHEN a.relfilenode = b.relfilenode THEN 'relfilenode is unchanged' + ELSE 'relfilenode has changed' END AS filenode, + CASE WHEN a.reltablespace = b.reltablespace THEN 'reltablespace is unchanged' + ELSE 'reltablespace has changed' END AS tbspace + FROM reindex_temp_before b JOIN pg_class a ON b.relname = a.relname + ORDER BY 1; + relname | filenode | tbspace +--------------------------------+--------------------------+---------------------------- + tbspace_reindex_part_index | relfilenode is unchanged | reltablespace is unchanged + tbspace_reindex_part_index_0 | relfilenode is unchanged | reltablespace is unchanged + tbspace_reindex_part_index_0_1 | relfilenode has changed | reltablespace has changed + tbspace_reindex_part_index_0_2 | relfilenode has changed | reltablespace has changed + tbspace_reindex_part_index_10 | relfilenode is unchanged | reltablespace is unchanged +(5 rows) + +DROP TABLE tbspace_reindex_part; +-- create a schema we can use +CREATE SCHEMA testschema; +-- try a table +CREATE TABLE testschema.foo (i int) TABLESPACE regress_tblspace; +SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c + where c.reltablespace = t.oid AND c.relname = 'foo'; + relname | spcname +---------+------------------ + foo | regress_tblspace +(1 row) + +INSERT INTO testschema.foo VALUES(1); +INSERT INTO testschema.foo VALUES(2); +-- tables from dynamic sources +CREATE TABLE testschema.asselect TABLESPACE regress_tblspace AS SELECT 1; +SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c + where c.reltablespace = t.oid AND c.relname = 'asselect'; + relname | spcname +----------+------------------ + asselect | regress_tblspace +(1 row) + +PREPARE selectsource(int) AS SELECT $1; +CREATE TABLE testschema.asexecute TABLESPACE regress_tblspace + AS EXECUTE selectsource(2); +SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c + where c.reltablespace = t.oid AND c.relname = 
'asexecute'; + relname | spcname +-----------+------------------ + asexecute | regress_tblspace +(1 row) + +-- index +CREATE INDEX foo_idx on testschema.foo(i) TABLESPACE regress_tblspace; +SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c + where c.reltablespace = t.oid AND c.relname = 'foo_idx'; + relname | spcname +---------+------------------ + foo_idx | regress_tblspace +(1 row) + +-- check \d output +\d testschema.foo + Table "testschema.foo" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + i | integer | | | +Indexes: + "foo_idx" btree (i), tablespace "regress_tblspace" +Tablespace: "regress_tblspace" + +\d testschema.foo_idx + Index "testschema.foo_idx" + Column | Type | Key? | Definition +--------+---------+------+------------ + i | integer | yes | i +btree, for table "testschema.foo" +Tablespace: "regress_tblspace" + +-- +-- partitioned table +-- +CREATE TABLE testschema.part (a int) PARTITION BY LIST (a); +SET default_tablespace TO pg_global; +CREATE TABLE testschema.part_1 PARTITION OF testschema.part FOR VALUES IN (1); +ERROR: only shared relations can be placed in pg_global tablespace +RESET default_tablespace; +CREATE TABLE testschema.part_1 PARTITION OF testschema.part FOR VALUES IN (1); +SET default_tablespace TO regress_tblspace; +CREATE TABLE testschema.part_2 PARTITION OF testschema.part FOR VALUES IN (2); +SET default_tablespace TO pg_global; +CREATE TABLE testschema.part_3 PARTITION OF testschema.part FOR VALUES IN (3); +ERROR: only shared relations can be placed in pg_global tablespace +ALTER TABLE testschema.part SET TABLESPACE regress_tblspace; +CREATE TABLE testschema.part_3 PARTITION OF testschema.part FOR VALUES IN (3); +CREATE TABLE testschema.part_4 PARTITION OF testschema.part FOR VALUES IN (4) + TABLESPACE pg_default; +CREATE TABLE testschema.part_56 PARTITION OF testschema.part FOR VALUES IN (5, 6) + PARTITION BY LIST (a); +ALTER TABLE testschema.part 
SET TABLESPACE pg_default; +CREATE TABLE testschema.part_78 PARTITION OF testschema.part FOR VALUES IN (7, 8) + PARTITION BY LIST (a); +ERROR: only shared relations can be placed in pg_global tablespace +CREATE TABLE testschema.part_910 PARTITION OF testschema.part FOR VALUES IN (9, 10) + PARTITION BY LIST (a) TABLESPACE regress_tblspace; +RESET default_tablespace; +CREATE TABLE testschema.part_78 PARTITION OF testschema.part FOR VALUES IN (7, 8) + PARTITION BY LIST (a); +SELECT relname, spcname FROM pg_catalog.pg_class c + JOIN pg_catalog.pg_namespace n ON (c.relnamespace = n.oid) + LEFT JOIN pg_catalog.pg_tablespace t ON c.reltablespace = t.oid + where c.relname LIKE 'part%' AND n.nspname = 'testschema' order by relname; + relname | spcname +----------+------------------ + part | + part_1 | + part_2 | regress_tblspace + part_3 | regress_tblspace + part_4 | + part_56 | regress_tblspace + part_78 | + part_910 | regress_tblspace +(8 rows) + +RESET default_tablespace; +DROP TABLE testschema.part; +-- partitioned index +CREATE TABLE testschema.part (a int) PARTITION BY LIST (a); +CREATE TABLE testschema.part1 PARTITION OF testschema.part FOR VALUES IN (1); +CREATE INDEX part_a_idx ON testschema.part (a) TABLESPACE regress_tblspace; +CREATE TABLE testschema.part2 PARTITION OF testschema.part FOR VALUES IN (2); +SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c + where c.reltablespace = t.oid AND c.relname LIKE 'part%_idx'; + relname | spcname +-------------+------------------ + part1_a_idx | regress_tblspace + part2_a_idx | regress_tblspace + part_a_idx | regress_tblspace +(3 rows) + +\d testschema.part + Partitioned table "testschema.part" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | +Partition key: LIST (a) +Indexes: + "part_a_idx" btree (a), tablespace "regress_tblspace" +Number of partitions: 2 (Use \d+ to list them.) 
+ +\d+ testschema.part + Partitioned table "testschema.part" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | integer | | | | plain | | +Partition key: LIST (a) +Indexes: + "part_a_idx" btree (a), tablespace "regress_tblspace" +Partitions: testschema.part1 FOR VALUES IN (1), + testschema.part2 FOR VALUES IN (2) + +\d testschema.part1 + Table "testschema.part1" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | +Partition of: testschema.part FOR VALUES IN (1) +Indexes: + "part1_a_idx" btree (a), tablespace "regress_tblspace" + +\d+ testschema.part1 + Table "testschema.part1" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | integer | | | | plain | | +Partition of: testschema.part FOR VALUES IN (1) +Partition constraint: ((a IS NOT NULL) AND (a = 1)) +Indexes: + "part1_a_idx" btree (a), tablespace "regress_tblspace" + +\d testschema.part_a_idx +Partitioned index "testschema.part_a_idx" + Column | Type | Key? | Definition +--------+---------+------+------------ + a | integer | yes | a +btree, for table "testschema.part" +Number of partitions: 2 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +\d+ testschema.part_a_idx + Partitioned index "testschema.part_a_idx" + Column | Type | Key? | Definition | Storage | Stats target +--------+---------+------+------------+---------+-------------- + a | integer | yes | a | plain | +btree, for table "testschema.part" +Partitions: testschema.part1_a_idx, + testschema.part2_a_idx +Tablespace: "regress_tblspace" + +-- partitioned rels cannot specify the default tablespace. 
These fail: +CREATE TABLE testschema.dflt (a int PRIMARY KEY) PARTITION BY LIST (a) TABLESPACE pg_default; +ERROR: cannot specify default tablespace for partitioned relations +CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX TABLESPACE pg_default) PARTITION BY LIST (a); +ERROR: cannot specify default tablespace for partitioned relations +SET default_tablespace TO 'pg_default'; +CREATE TABLE testschema.dflt (a int PRIMARY KEY) PARTITION BY LIST (a) TABLESPACE regress_tblspace; +ERROR: cannot specify default tablespace for partitioned relations +CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX TABLESPACE regress_tblspace) PARTITION BY LIST (a); +ERROR: cannot specify default tablespace for partitioned relations +-- but these work: +CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX TABLESPACE regress_tblspace) PARTITION BY LIST (a) TABLESPACE regress_tblspace; +SET default_tablespace TO ''; +CREATE TABLE testschema.dflt2 (a int PRIMARY KEY) PARTITION BY LIST (a); +DROP TABLE testschema.dflt, testschema.dflt2; +-- check that default_tablespace doesn't affect ALTER TABLE index rebuilds +CREATE TABLE testschema.test_default_tab(id bigint) TABLESPACE regress_tblspace; +INSERT INTO testschema.test_default_tab VALUES (1); +CREATE INDEX test_index1 on testschema.test_default_tab (id); +CREATE INDEX test_index2 on testschema.test_default_tab (id) TABLESPACE regress_tblspace; +ALTER TABLE testschema.test_default_tab ADD CONSTRAINT test_index3 PRIMARY KEY (id); +ALTER TABLE testschema.test_default_tab ADD CONSTRAINT test_index4 UNIQUE (id) USING INDEX TABLESPACE regress_tblspace; +\d testschema.test_index1 + Index "testschema.test_index1" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +btree, for table "testschema.test_default_tab" + +\d testschema.test_index2 + Index "testschema.test_index2" + Column | Type | Key? 
| Definition +--------+--------+------+------------ + id | bigint | yes | id +btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_index3 + Index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab" + +\d testschema.test_index4 + Index "testschema.test_index4" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +-- use a custom tablespace for default_tablespace +SET default_tablespace TO regress_tblspace; +-- tablespace should not change if no rewrite +ALTER TABLE testschema.test_default_tab ALTER id TYPE bigint; +\d testschema.test_index1 + Index "testschema.test_index1" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +btree, for table "testschema.test_default_tab" + +\d testschema.test_index2 + Index "testschema.test_index2" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_index3 + Index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab" + +\d testschema.test_index4 + Index "testschema.test_index4" + Column | Type | Key? 
| Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +SELECT * FROM testschema.test_default_tab; + id +---- + 1 +(1 row) + +-- tablespace should not change even if there is an index rewrite +ALTER TABLE testschema.test_default_tab ALTER id TYPE int; +\d testschema.test_index1 + Index "testschema.test_index1" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +btree, for table "testschema.test_default_tab" + +\d testschema.test_index2 + Index "testschema.test_index2" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_index3 + Index "testschema.test_index3" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +primary key, btree, for table "testschema.test_default_tab" + +\d testschema.test_index4 + Index "testschema.test_index4" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +unique, btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +SELECT * FROM testschema.test_default_tab; + id +---- + 1 +(1 row) + +-- now use the default tablespace for default_tablespace +SET default_tablespace TO ''; +-- tablespace should not change if no rewrite +ALTER TABLE testschema.test_default_tab ALTER id TYPE int; +\d testschema.test_index1 + Index "testschema.test_index1" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +btree, for table "testschema.test_default_tab" + +\d testschema.test_index2 + Index "testschema.test_index2" + Column | Type | Key? 
| Definition +--------+---------+------+------------ + id | integer | yes | id +btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_index3 + Index "testschema.test_index3" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +primary key, btree, for table "testschema.test_default_tab" + +\d testschema.test_index4 + Index "testschema.test_index4" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +unique, btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +-- tablespace should not change even if there is an index rewrite +ALTER TABLE testschema.test_default_tab ALTER id TYPE bigint; +\d testschema.test_index1 + Index "testschema.test_index1" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +btree, for table "testschema.test_default_tab" + +\d testschema.test_index2 + Index "testschema.test_index2" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_index3 + Index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab" + +\d testschema.test_index4 + Index "testschema.test_index4" + Column | Type | Key? 
| Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +DROP TABLE testschema.test_default_tab; +-- check that default_tablespace doesn't affect ALTER TABLE index rebuilds +-- (this time with a partitioned table) +CREATE TABLE testschema.test_default_tab_p(id bigint, val bigint) + PARTITION BY LIST (id) TABLESPACE regress_tblspace; +CREATE TABLE testschema.test_default_tab_p1 PARTITION OF testschema.test_default_tab_p + FOR VALUES IN (1); +INSERT INTO testschema.test_default_tab_p VALUES (1); +CREATE INDEX test_index1 on testschema.test_default_tab_p (val); +CREATE INDEX test_index2 on testschema.test_default_tab_p (val) TABLESPACE regress_tblspace; +ALTER TABLE testschema.test_default_tab_p ADD CONSTRAINT test_index3 PRIMARY KEY (id); +ALTER TABLE testschema.test_default_tab_p ADD CONSTRAINT test_index4 UNIQUE (id) USING INDEX TABLESPACE regress_tblspace; +\d testschema.test_index1 +Partitioned index "testschema.test_index1" + Column | Type | Key? | Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index2 +Partitioned index "testschema.test_index2" + Column | Type | Key? | Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +\d testschema.test_index3 +Partitioned index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index4 +Partitioned index "testschema.test_index4" + Column | Type | Key? 
| Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +-- use a custom tablespace for default_tablespace +SET default_tablespace TO regress_tblspace; +-- tablespace should not change if no rewrite +ALTER TABLE testschema.test_default_tab_p ALTER val TYPE bigint; +\d testschema.test_index1 +Partitioned index "testschema.test_index1" + Column | Type | Key? | Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index2 +Partitioned index "testschema.test_index2" + Column | Type | Key? | Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +\d testschema.test_index3 +Partitioned index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index4 +Partitioned index "testschema.test_index4" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +SELECT * FROM testschema.test_default_tab_p; + id | val +----+----- + 1 | +(1 row) + +-- tablespace should not change even if there is an index rewrite +ALTER TABLE testschema.test_default_tab_p ALTER val TYPE int; +\d testschema.test_index1 +Partitioned index "testschema.test_index1" + Column | Type | Key? 
| Definition +--------+---------+------+------------ + val | integer | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index2 +Partitioned index "testschema.test_index2" + Column | Type | Key? | Definition +--------+---------+------+------------ + val | integer | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +\d testschema.test_index3 +Partitioned index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index4 +Partitioned index "testschema.test_index4" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +SELECT * FROM testschema.test_default_tab_p; + id | val +----+----- + 1 | +(1 row) + +-- now use the default tablespace for default_tablespace +SET default_tablespace TO ''; +-- tablespace should not change if no rewrite +ALTER TABLE testschema.test_default_tab_p ALTER val TYPE int; +\d testschema.test_index1 +Partitioned index "testschema.test_index1" + Column | Type | Key? | Definition +--------+---------+------+------------ + val | integer | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index2 +Partitioned index "testschema.test_index2" + Column | Type | Key? | Definition +--------+---------+------+------------ + val | integer | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) 
+Tablespace: "regress_tblspace" + +\d testschema.test_index3 +Partitioned index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index4 +Partitioned index "testschema.test_index4" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +-- tablespace should not change even if there is an index rewrite +ALTER TABLE testschema.test_default_tab_p ALTER val TYPE bigint; +\d testschema.test_index1 +Partitioned index "testschema.test_index1" + Column | Type | Key? | Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index2 +Partitioned index "testschema.test_index2" + Column | Type | Key? | Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +\d testschema.test_index3 +Partitioned index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index4 +Partitioned index "testschema.test_index4" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) 
+Tablespace: "regress_tblspace" + +DROP TABLE testschema.test_default_tab_p; +-- check that default_tablespace affects index additions in ALTER TABLE +CREATE TABLE testschema.test_tab(id int) TABLESPACE regress_tblspace; +INSERT INTO testschema.test_tab VALUES (1); +SET default_tablespace TO regress_tblspace; +ALTER TABLE testschema.test_tab ADD CONSTRAINT test_tab_unique UNIQUE (id); +SET default_tablespace TO ''; +ALTER TABLE testschema.test_tab ADD CONSTRAINT test_tab_pkey PRIMARY KEY (id); +\d testschema.test_tab_unique + Index "testschema.test_tab_unique" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +unique, btree, for table "testschema.test_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_tab_pkey + Index "testschema.test_tab_pkey" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +primary key, btree, for table "testschema.test_tab" + +SELECT * FROM testschema.test_tab; + id +---- + 1 +(1 row) + +DROP TABLE testschema.test_tab; +-- check that default_tablespace is handled correctly by multi-command +-- ALTER TABLE that includes a tablespace-preserving rewrite +CREATE TABLE testschema.test_tab(a int, b int, c int); +SET default_tablespace TO regress_tblspace; +ALTER TABLE testschema.test_tab ADD CONSTRAINT test_tab_unique UNIQUE (a); +CREATE INDEX test_tab_a_idx ON testschema.test_tab (a); +SET default_tablespace TO ''; +CREATE INDEX test_tab_b_idx ON testschema.test_tab (b); +\d testschema.test_tab_unique + Index "testschema.test_tab_unique" + Column | Type | Key? | Definition +--------+---------+------+------------ + a | integer | yes | a +unique, btree, for table "testschema.test_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_tab_a_idx + Index "testschema.test_tab_a_idx" + Column | Type | Key? 
| Definition +--------+---------+------+------------ + a | integer | yes | a +btree, for table "testschema.test_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_tab_b_idx + Index "testschema.test_tab_b_idx" + Column | Type | Key? | Definition +--------+---------+------+------------ + b | integer | yes | b +btree, for table "testschema.test_tab" + +ALTER TABLE testschema.test_tab ALTER b TYPE bigint, ADD UNIQUE (c); +\d testschema.test_tab_unique + Index "testschema.test_tab_unique" + Column | Type | Key? | Definition +--------+---------+------+------------ + a | integer | yes | a +unique, btree, for table "testschema.test_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_tab_a_idx + Index "testschema.test_tab_a_idx" + Column | Type | Key? | Definition +--------+---------+------+------------ + a | integer | yes | a +btree, for table "testschema.test_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_tab_b_idx + Index "testschema.test_tab_b_idx" + Column | Type | Key? | Definition +--------+--------+------+------------ + b | bigint | yes | b +btree, for table "testschema.test_tab" + +DROP TABLE testschema.test_tab; +-- let's try moving a table from one place to another +CREATE TABLE testschema.atable AS VALUES (1), (2); +CREATE UNIQUE INDEX anindex ON testschema.atable(column1); +ALTER TABLE testschema.atable SET TABLESPACE regress_tblspace; +ALTER INDEX testschema.anindex SET TABLESPACE regress_tblspace; +ALTER INDEX testschema.part_a_idx SET TABLESPACE pg_global; +ERROR: only shared relations can be placed in pg_global tablespace +ALTER INDEX testschema.part_a_idx SET TABLESPACE pg_default; +ALTER INDEX testschema.part_a_idx SET TABLESPACE regress_tblspace; +INSERT INTO testschema.atable VALUES(3); -- ok +INSERT INTO testschema.atable VALUES(1); -- fail (checks index) +ERROR: duplicate key value violates unique constraint "anindex" +DETAIL: Key (column1)=(1) already exists. 
+SELECT COUNT(*) FROM testschema.atable; -- checks heap + count +------- + 3 +(1 row) + +-- Will fail with bad path +CREATE TABLESPACE regress_badspace LOCATION '/no/such/location'; +ERROR: directory "/no/such/location" does not exist +-- No such tablespace +CREATE TABLE bar (i int) TABLESPACE regress_nosuchspace; +ERROR: tablespace "regress_nosuchspace" does not exist +-- Fail, in use for some partitioned object +DROP TABLESPACE regress_tblspace; +ERROR: tablespace "regress_tblspace" cannot be dropped because some objects depend on it +DETAIL: tablespace for index testschema.part_a_idx +ALTER INDEX testschema.part_a_idx SET TABLESPACE pg_default; +-- Fail, not empty +DROP TABLESPACE regress_tblspace; +CREATE ROLE regress_tablespace_user1 login; +CREATE ROLE regress_tablespace_user2 login; +GRANT USAGE ON SCHEMA testschema TO regress_tablespace_user2; +ALTER TABLESPACE regress_tblspace OWNER TO regress_tablespace_user1; +ERROR: tablespace "regress_tblspace" does not exist +CREATE TABLE testschema.tablespace_acl (c int); +-- new owner lacks permission to create this index from scratch +CREATE INDEX k ON testschema.tablespace_acl (c) TABLESPACE regress_tblspace; +ERROR: tablespace "regress_tblspace" does not exist +ALTER TABLE testschema.tablespace_acl OWNER TO regress_tablespace_user2; +SET SESSION ROLE regress_tablespace_user2; +CREATE TABLE tablespace_table (i int) TABLESPACE regress_tblspace; -- fail +ERROR: tablespace "regress_tblspace" does not exist +ALTER TABLE testschema.tablespace_acl ALTER c TYPE bigint; +REINDEX (TABLESPACE regress_tblspace) TABLE tablespace_table; -- fail +ERROR: tablespace "regress_tblspace" does not exist +REINDEX (TABLESPACE regress_tblspace, CONCURRENTLY) TABLE tablespace_table; -- fail +ERROR: tablespace "regress_tblspace" does not exist +RESET ROLE; +ALTER TABLESPACE regress_tblspace RENAME TO regress_tblspace_renamed; +ERROR: tablespace "regress_tblspace" does not exist +ALTER TABLE ALL IN TABLESPACE regress_tblspace_renamed SET 
TABLESPACE pg_default; +ERROR: tablespace "regress_tblspace_renamed" does not exist +ALTER INDEX ALL IN TABLESPACE regress_tblspace_renamed SET TABLESPACE pg_default; +ERROR: tablespace "regress_tblspace_renamed" does not exist +-- Should show notice that nothing was done +ALTER TABLE ALL IN TABLESPACE regress_tblspace_renamed SET TABLESPACE pg_default; +ERROR: tablespace "regress_tblspace_renamed" does not exist +-- Should succeed +DROP TABLESPACE regress_tblspace_renamed; +ERROR: tablespace "regress_tblspace_renamed" does not exist +DROP SCHEMA testschema CASCADE; +NOTICE: drop cascades to 6 other objects +DETAIL: drop cascades to table testschema.foo +drop cascades to table testschema.asselect +drop cascades to table testschema.asexecute +drop cascades to table testschema.part +drop cascades to table testschema.atable +drop cascades to table testschema.tablespace_acl +DROP ROLE regress_tablespace_user1; +DROP ROLE regress_tablespace_user2; diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 826d5d6e083..c10c4f5e666 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -2252,6 +2252,8 @@ RelMapFile RelMapping RelOptInfo RelOptKind +RelSizeEntry +RelTag RelToCheck RelToCluster RelabelType @@ -2942,6 +2944,8 @@ WaitPMResult WalCloseMethod WalCompression WalLevel +Safekeeper +WalMessage WalRcvData WalRcvExecResult WalRcvExecStatus @@ -3053,6 +3057,17 @@ XmlTableBuilderData YYLTYPE YYSTYPE YY_BUFFER_STATE +ZenithErrorResponse +ZenithExistsRequest +ZenithExistsResponse +ZenithGetPageRequest +ZenithGetPageResponse +ZenithMessage +ZenithMessageTag +ZenithNblocksRequest +ZenithNblocksResponse +ZenithRequest +ZenithResponse ZSTD_CCtx ZSTD_DCtx ZSTD_inBuffer From 8af27f3330154a890a578b5ead8a9789d751d059 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 10 Aug 2022 18:04:24 +0300 Subject: [PATCH 02/56] fix regression tests --- src/test/regress/expected/sequence_1.out | 23 ++++++++-- 
.../tablespace_1.out} | 43 +++++++++++++++++-- 2 files changed, 59 insertions(+), 7 deletions(-) rename src/test/regress/{output/tablespace_1.source => expected/tablespace_1.out} (95%) diff --git a/src/test/regress/expected/sequence_1.out b/src/test/regress/expected/sequence_1.out index 462e3f3caa4..3c1a7a325d8 100644 --- a/src/test/regress/expected/sequence_1.out +++ b/src/test/regress/expected/sequence_1.out @@ -2,8 +2,6 @@ -- CREATE SEQUENCE -- -- various error cases -CREATE UNLOGGED SEQUENCE sequence_testx; -ERROR: unlogged sequences are not supported CREATE SEQUENCE sequence_testx INCREMENT BY 0; ERROR: INCREMENT must not be zero CREATE SEQUENCE sequence_testx INCREMENT BY -1 MINVALUE 20; @@ -21,7 +19,8 @@ CREATE SEQUENCE sequence_testx OWNED BY nobody; -- nonsense word ERROR: invalid OWNED BY option HINT: Specify OWNED BY table.column or OWNED BY NONE. CREATE SEQUENCE sequence_testx OWNED BY pg_class_oid_index.oid; -- not a table -ERROR: referenced relation "pg_class_oid_index" is not a table or foreign table +ERROR: sequence cannot be owned by relation "pg_class_oid_index" +DETAIL: This operation is not supported for indexes. CREATE SEQUENCE sequence_testx OWNED BY pg_class.relname; -- not same schema ERROR: sequence must be in same schema as table it is linked to CREATE TABLE sequence_test_table (a int); @@ -599,6 +598,24 @@ DROP SEQUENCE seq2; -- should fail SELECT lastval(); ERROR: lastval is not yet defined in this session +-- unlogged sequences +-- (more tests in src/test/recovery/) +CREATE UNLOGGED SEQUENCE sequence_test_unlogged; +ALTER SEQUENCE sequence_test_unlogged SET LOGGED; +\d sequence_test_unlogged + Sequence "public.sequence_test_unlogged" + Type | Start | Minimum | Maximum | Increment | Cycles? 
| Cache +--------+-------+---------+---------------------+-----------+---------+------- + bigint | 1 | 1 | 9223372036854775807 | 1 | no | 1 + +ALTER SEQUENCE sequence_test_unlogged SET UNLOGGED; +\d sequence_test_unlogged + Unlogged sequence "public.sequence_test_unlogged" + Type | Start | Minimum | Maximum | Increment | Cycles? | Cache +--------+-------+---------+---------------------+-----------+---------+------- + bigint | 1 | 1 | 9223372036854775807 | 1 | no | 1 + +DROP SEQUENCE sequence_test_unlogged; -- Test sequences in read-only transactions CREATE TEMPORARY SEQUENCE sequence_test_temp1; START TRANSACTION READ ONLY; diff --git a/src/test/regress/output/tablespace_1.source b/src/test/regress/expected/tablespace_1.out similarity index 95% rename from src/test/regress/output/tablespace_1.source rename to src/test/regress/expected/tablespace_1.out index 1c3b75cb6d1..f4c7e75060e 100644 --- a/src/test/regress/output/tablespace_1.source +++ b/src/test/regress/expected/tablespace_1.out @@ -1,7 +1,18 @@ +-- relative tablespace locations are not allowed +CREATE TABLESPACE regress_tblspace LOCATION 'relative'; -- fail +ERROR: tablespace location must be an absolute path +-- empty tablespace locations are not usually allowed +CREATE TABLESPACE regress_tblspace LOCATION ''; -- fail +ERROR: tablespace location must be an absolute path +-- as a special developer-only option to allow us to use tablespaces +-- with streaming replication on the same server, an empty location +-- can be allowed as a way to say that the tablespace should be created +-- as a directory in pg_tblspc, rather than being a symlink +SET allow_in_place_tablespaces = true; -- create a tablespace using WITH clause -CREATE TABLESPACE regress_tblspacewith LOCATION '@testtablespace@' WITH (some_nonexistent_parameter = true); -- fail +CREATE TABLESPACE regress_tblspacewith LOCATION '' WITH (some_nonexistent_parameter = true); -- fail ERROR: unrecognized parameter "some_nonexistent_parameter" -CREATE 
TABLESPACE regress_tblspacewith LOCATION '@testtablespace@' WITH (random_page_cost = 3.0); -- ok +CREATE TABLESPACE regress_tblspacewith LOCATION '' WITH (random_page_cost = 3.0); -- ok -- check to see the parameter was used SELECT spcoptions FROM pg_tablespace WHERE spcname = 'regress_tblspacewith'; spcoptions @@ -12,7 +23,16 @@ SELECT spcoptions FROM pg_tablespace WHERE spcname = 'regress_tblspacewith'; -- drop the tablespace so we can re-use the location DROP TABLESPACE regress_tblspacewith; -- create a tablespace we can use -CREATE TABLESPACE regress_tblspace LOCATION '@testtablespace@'; +CREATE TABLESPACE regress_tblspace LOCATION ''; +-- This returns a relative path as of an effect of allow_in_place_tablespaces, +-- masking the tablespace OID used in the path name. +SELECT regexp_replace(pg_tablespace_location(oid), '(pg_tblspc)/(\d+)', '\1/NNN') + FROM pg_tablespace WHERE spcname = 'regress_tblspace'; + regexp_replace +---------------- + pg_tblspc/NNN +(1 row) + -- try setting and resetting some properties for the new tablespace ALTER TABLESPACE regress_tblspace SET (random_page_cost = 1.0, seq_page_cost = 1.1); ALTER TABLESPACE regress_tblspace SET (some_nonexistent_parameter = true); -- fail @@ -885,6 +905,16 @@ SELECT COUNT(*) FROM testschema.atable; -- checks heap 3 (1 row) +-- let's try moving a materialized view from one place to another +CREATE MATERIALIZED VIEW testschema.amv AS SELECT * FROM testschema.atable; +ALTER MATERIALIZED VIEW testschema.amv SET TABLESPACE regress_tblspace; +REFRESH MATERIALIZED VIEW testschema.amv; +SELECT COUNT(*) FROM testschema.amv; + count +------- + 3 +(1 row) + -- Will fail with bad path CREATE TABLESPACE regress_badspace LOCATION '/no/such/location'; ERROR: directory "/no/such/location" does not exist @@ -923,19 +953,24 @@ ALTER TABLE ALL IN TABLESPACE regress_tblspace_renamed SET TABLESPACE pg_default ERROR: tablespace "regress_tblspace_renamed" does not exist ALTER INDEX ALL IN TABLESPACE regress_tblspace_renamed 
SET TABLESPACE pg_default; ERROR: tablespace "regress_tblspace_renamed" does not exist +ALTER MATERIALIZED VIEW ALL IN TABLESPACE regress_tblspace_renamed SET TABLESPACE pg_default; +ERROR: tablespace "regress_tblspace_renamed" does not exist -- Should show notice that nothing was done ALTER TABLE ALL IN TABLESPACE regress_tblspace_renamed SET TABLESPACE pg_default; ERROR: tablespace "regress_tblspace_renamed" does not exist +ALTER MATERIALIZED VIEW ALL IN TABLESPACE regress_tblspace_renamed SET TABLESPACE pg_default; +ERROR: tablespace "regress_tblspace_renamed" does not exist -- Should succeed DROP TABLESPACE regress_tblspace_renamed; ERROR: tablespace "regress_tblspace_renamed" does not exist DROP SCHEMA testschema CASCADE; -NOTICE: drop cascades to 6 other objects +NOTICE: drop cascades to 7 other objects DETAIL: drop cascades to table testschema.foo drop cascades to table testschema.asselect drop cascades to table testschema.asexecute drop cascades to table testschema.part drop cascades to table testschema.atable +drop cascades to materialized view testschema.amv drop cascades to table testschema.tablespace_acl DROP ROLE regress_tablespace_user1; DROP ROLE regress_tablespace_user2; From c146655a437cd4bc49172803ea5a9401fb5e2cb5 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Tue, 16 Aug 2022 17:34:50 +0300 Subject: [PATCH 03/56] rebase to the latest origin and resolve conflicts --- src/backend/access/transam/xlog.c | 2 +- src/backend/commands/dbcommands.c | 2 +- src/backend/storage/buffer/bufmgr.c | 14 +++++++++----- src/backend/utils/cache/relcache.c | 2 +- 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index b6a6a982087..6a5f9331b0b 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -7988,7 +7988,7 @@ xlog_redo(XLogReaderState *record) * return BLK_DONE. Accept that. 
*/ } - else if (XLogReadBufferForRedo(record, block_id, &buffer) != BLK_RESTORED) + else if (result != BLK_RESTORED) elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block"); if (buffer != InvalidBuffer) diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 93f0c739e55..71510007476 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -275,7 +275,7 @@ ScanSourceDatabasePgClass(Oid tbid, Oid dbid, char *srcpath) rnode.dbNode = dbid; rnode.relNode = relfilenode; - smgr = smgropen(rnode, InvalidBackendId); + smgr = smgropen(rnode, InvalidBackendId, RELPERSISTENCE_PERMANENT); nblocks = smgrnblocks(smgr, MAIN_FORKNUM); smgrclose(smgr); diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 3f5ba5942e2..808f3426851 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -805,7 +805,9 @@ ReadBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum, { bool hit; - SMgrRelation smgr = smgropen(rnode, InvalidBackendId, RELPERSISTENCE_PERMANENT); + SMgrRelation smgr = smgropen(rnode, InvalidBackendId, + permanent ? RELPERSISTENCE_PERMANENT : + RELPERSISTENCE_UNLOGGED); return ReadBuffer_common(smgr, permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED, forkNum, blockNum, @@ -827,7 +829,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, bool found; bool isExtend; /* - * wal_redo postgres is working in single user mode, we do not need to synchronize access to shared buffer, + * wal_redo postgres is working in single user mode, we do not need to synchronize access to shared buffer, * so let's use local buffers instead */ bool isLocalBuf = SmgrIsTemp(smgr) || am_wal_redo_postgres; @@ -3760,7 +3762,9 @@ RelationCopyStorageUsingBuffer(RelFileNode srcnode, use_wal = XLogIsNeeded() && (permanent || forkNum == INIT_FORKNUM); /* Get number of blocks in the source relation. 
*/ - nblocks = smgrnblocks(smgropen(srcnode, InvalidBackendId), + nblocks = smgrnblocks(smgropen(srcnode, InvalidBackendId, + permanent ? RELPERSISTENCE_PERMANENT + :RELPERSISTENCE_UNLOGGED), forkNum); /* Nothing to copy; just return. */ @@ -3854,9 +3858,9 @@ CreateAndCopyRelationData(RelFileNode src_rnode, RelFileNode dst_rnode, for (ForkNumber forkNum = MAIN_FORKNUM + 1; forkNum <= MAX_FORKNUM; forkNum++) { - if (smgrexists(smgropen(src_rnode, InvalidBackendId), forkNum)) + if (smgrexists(smgropen(src_rnode, InvalidBackendId, relpersistence), forkNum)) { - smgrcreate(smgropen(dst_rnode, InvalidBackendId), forkNum, false); + smgrcreate(smgropen(dst_rnode, InvalidBackendId, relpersistence), forkNum, false); /* * WAL log creation if the relation is persistent, or this is the diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index 0ce4400b321..2416116e97c 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -3796,7 +3796,7 @@ RelationSetNewRelfilenode(Relation relation, char persistence) * fails at this stage, the new cluster will need to be recreated * anyway. */ - srel = smgropen(relation->rd_node, relation->rd_backend); + srel = smgropen(relation->rd_node, relation->rd_backend, persistence); smgrdounlinkall(&srel, 1, false); smgrclose(srel); } From a41058d9fed97e3b629eb06d5ad91e4e1746113a Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 24 Aug 2022 18:55:43 +0300 Subject: [PATCH 04/56] Remove contrib neon and neon_test_utils. 
--- contrib/neon/Makefile | 26 - contrib/neon/inmem_smgr.c | 287 -- contrib/neon/libpagestore.c | 440 --- contrib/neon/neon--1.0.sql | 17 - contrib/neon/neon.c | 66 - contrib/neon/neon.control | 4 - contrib/neon/pagestore_client.h | 221 -- contrib/neon/pagestore_smgr.c | 1698 ------------ contrib/neon/relsize_cache.c | 186 -- contrib/neon_test_utils/Makefile | 25 - .../neon_test_utils/neon_test_utils--1.0.sql | 29 - .../neon_test_utils/neon_test_utils.control | 5 - contrib/neon_test_utils/neontest.c | 304 --- src/Makefile | 1 - src/backend/access/transam/xloginsert.c | 2 +- src/backend/main/main.c | 2 +- src/backend/postmaster/bgworker.c | 5 +- src/backend/postmaster/postmaster.c | 7 +- src/backend/replication/Makefile | 3 +- .../replication/libpqwalproposer/Makefile | 37 - .../libpqwalproposer/libpqwalproposer.c | 416 --- src/backend/replication/walpropcompat.c | 95 + src/backend/replication/walproposer.c | 2357 ----------------- src/backend/replication/walproposer_utils.c | 404 --- src/backend/replication/walsender.c | 286 +- src/backend/storage/buffer/bufmgr.c | 3 +- src/backend/storage/ipc/ipci.c | 4 - src/backend/tcop/postgres.c | 19 +- src/backend/tcop/zenith_wal_redo.c | 1 + src/backend/utils/misc/guc.c | 37 +- src/include/access/xlog.h | 2 - src/include/access/xlogdefs.h | 8 - src/include/replication/walproposer.h | 565 ---- src/include/replication/walpropshim.h | 19 + src/include/replication/walsender.h | 21 +- 35 files changed, 231 insertions(+), 7371 deletions(-) delete mode 100644 contrib/neon/Makefile delete mode 100644 contrib/neon/inmem_smgr.c delete mode 100644 contrib/neon/libpagestore.c delete mode 100644 contrib/neon/neon--1.0.sql delete mode 100644 contrib/neon/neon.c delete mode 100644 contrib/neon/neon.control delete mode 100644 contrib/neon/pagestore_client.h delete mode 100644 contrib/neon/pagestore_smgr.c delete mode 100644 contrib/neon/relsize_cache.c delete mode 100644 contrib/neon_test_utils/Makefile delete mode 100644 
contrib/neon_test_utils/neon_test_utils--1.0.sql delete mode 100644 contrib/neon_test_utils/neon_test_utils.control delete mode 100644 contrib/neon_test_utils/neontest.c delete mode 100644 src/backend/replication/libpqwalproposer/Makefile delete mode 100644 src/backend/replication/libpqwalproposer/libpqwalproposer.c create mode 100644 src/backend/replication/walpropcompat.c delete mode 100644 src/backend/replication/walproposer.c delete mode 100644 src/backend/replication/walproposer_utils.c delete mode 100644 src/include/replication/walproposer.h create mode 100644 src/include/replication/walpropshim.h diff --git a/contrib/neon/Makefile b/contrib/neon/Makefile deleted file mode 100644 index b6f3cf400ff..00000000000 --- a/contrib/neon/Makefile +++ /dev/null @@ -1,26 +0,0 @@ -# contrib/neon/Makefile - - -MODULE_big = neon -OBJS = \ - $(WIN32RES) \ - inmem_smgr.o libpagestore.o pagestore_smgr.o relsize_cache.o neon.o - -PG_CPPFLAGS = -I$(libpq_srcdir) -SHLIB_LINK_INTERNAL = $(libpq) - -EXTENSION = neon -DATA = neon--1.0.sql -PGFILEDESC = "neon - cloud storage for PostgreSQL" - -ifdef USE_PGXS -PG_CONFIG = pg_config -PGXS := $(shell $(PG_CONFIG) --pgxs) -include $(PGXS) -else -SHLIB_PREREQS = submake-libpq -subdir = contrib/neon -top_builddir = ../.. -include $(top_builddir)/src/Makefile.global -include $(top_srcdir)/contrib/contrib-global.mk -endif diff --git a/contrib/neon/inmem_smgr.c b/contrib/neon/inmem_smgr.c deleted file mode 100644 index ca3a254a143..00000000000 --- a/contrib/neon/inmem_smgr.c +++ /dev/null @@ -1,287 +0,0 @@ -/*------------------------------------------------------------------------- - * - * inmem_smgr.c - * - * This is an implementation of the SMGR interface, used in the WAL redo - * process (see src/backend/tcop/zenith_wal_redo.c). It has no persistent - * storage, the pages that are written out are kept in a small number of - * in-memory buffers. 
- * - * Normally, replaying a WAL record only needs to access a handful of - * buffers, which fit in the normal buffer cache, so this is just for - * "overflow" storage when the buffer cache is not large enough. - * - * - * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group - * Portions Copyright (c) 1994, Regents of the University of California - * - * IDENTIFICATION - * contrib/neon/inmem_smgr.c - * - *------------------------------------------------------------------------- - */ -#include "postgres.h" - -#include "access/xlog.h" -#include "access/xlogutils.h" -#include "pagestore_client.h" -#include "storage/block.h" -#include "storage/buf_internals.h" -#include "storage/relfilenode.h" -#include "storage/smgr.h" - -/* Size of the in-memory smgr */ -#define MAX_PAGES 64 - -/* If more than WARN_PAGES are used, print a warning in the log */ -#define WARN_PAGES 32 - -static BufferTag page_tag[MAX_PAGES]; -static char page_body[MAX_PAGES][BLCKSZ]; -static int used_pages; - -static int -locate_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno) -{ - /* We only hold a small number of pages, so linear search */ - for (int i = 0; i < used_pages; i++) - { - if (RelFileNodeEquals(reln->smgr_rnode.node, page_tag[i].rnode) - && forknum == page_tag[i].forkNum - && blkno == page_tag[i].blockNum) - { - return i; - } - } - return -1; -} - -/* - * inmem_init() -- Initialize private state - */ -void -inmem_init(void) -{ - used_pages = 0; -} - -/* - * inmem_exists() -- Does the physical file exist? - */ -bool -inmem_exists(SMgrRelation reln, ForkNumber forknum) -{ - for (int i = 0; i < used_pages; i++) - { - if (RelFileNodeEquals(reln->smgr_rnode.node, page_tag[i].rnode) - && forknum == page_tag[i].forkNum) - { - return true; - } - } - return false; -} - -/* - * inmem_create() -- Create a new relation on zenithd storage - * - * If isRedo is true, it's okay for the relation to exist already. 
- */ -void -inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo) -{ -} - -/* - * inmem_unlink() -- Unlink a relation. - */ -void -inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo) -{ -} - -/* - * inmem_extend() -- Add a block to the specified relation. - * - * The semantics are nearly the same as mdwrite(): write at the - * specified position. However, this is to be used for the case of - * extending a relation (i.e., blocknum is at or beyond the current - * EOF). Note that we assume writing a block beyond current EOF - * causes intervening file space to become filled with zeroes. - */ -void -inmem_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, - char *buffer, bool skipFsync) -{ - /* same as smgwrite() for us */ - inmem_write(reln, forknum, blkno, buffer, skipFsync); -} - -/* - * inmem_open() -- Initialize newly-opened relation. - */ -void -inmem_open(SMgrRelation reln) -{ -} - -/* - * inmem_close() -- Close the specified relation, if it isn't closed already. - */ -void -inmem_close(SMgrRelation reln, ForkNumber forknum) -{ -} - -/* - * inmem_prefetch() -- Initiate asynchronous read of the specified block of a relation - */ -bool -inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) -{ - return true; -} - -/* - * inmem_writeback() -- Tell the kernel to write pages back to storage. - */ -void -inmem_writeback(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, BlockNumber nblocks) -{ -} - -/* - * inmem_read() -- Read the specified block from a relation. - */ -void -inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, - char *buffer) -{ - int pg; - - pg = locate_page(reln, forknum, blkno); - if (pg < 0) - memset(buffer, 0, BLCKSZ); - else - memcpy(buffer, page_body[pg], BLCKSZ); -} - -/* - * inmem_write() -- Write the supplied block at the appropriate location. 
- * - * This is to be used only for updating already-existing blocks of a - * relation (ie, those before the current EOF). To extend a relation, - * use mdextend(). - */ -void -inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - char *buffer, bool skipFsync) -{ - int pg; - - pg = locate_page(reln, forknum, blocknum); - if (pg < 0) - { - /* - * We assume the buffer cache is large enough to hold all the buffers - * needed for most operations. Overflowing to this "in-mem smgr" in rare - * cases is OK. But if we find that we're using more than WARN_PAGES, - * print a warning so that we get alerted and get to investigate why - * we're accessing so many buffers. - */ - elog(used_pages >= WARN_PAGES ? WARNING : DEBUG1, - "inmem_write() called for %u/%u/%u.%u blk %u: used_pages %u", - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - forknum, - blocknum, - used_pages); - if (used_pages == MAX_PAGES) - elog(ERROR, "Inmem storage overflow"); - - pg = used_pages; - used_pages++; - INIT_BUFFERTAG(page_tag[pg], reln->smgr_rnode.node, forknum, blocknum); - } else { - elog(DEBUG1, "inmem_write() called for %u/%u/%u.%u blk %u: found at %u", - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - forknum, - blocknum, - used_pages); - } - memcpy(page_body[pg], buffer, BLCKSZ); -} - -/* - * inmem_nblocks() -- Get the number of blocks stored in a relation. - */ -BlockNumber -inmem_nblocks(SMgrRelation reln, ForkNumber forknum) -{ - /* - * It's not clear why a WAL redo function would call smgrnblocks(). - * During recovery, at least before reaching consistency, the size of a - * relation could be arbitrarily small, if it was truncated after the - * record being replayed, or arbitrarily large if it was extended - * afterwards. But one place where it's called is in - * XLogReadBufferExtended(): it extends the relation, if it's smaller than - * the requested page. 
That's a waste of time in the WAL redo - * process. Pretend that all relations are maximally sized to avoid it. - */ - return MaxBlockNumber; -} - -/* - * inmem_truncate() -- Truncate relation to specified number of blocks. - */ -void -inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) -{ -} - -/* - * inmem_immedsync() -- Immediately sync a relation to stable storage. - */ -void -inmem_immedsync(SMgrRelation reln, ForkNumber forknum) -{ -} - -static const struct f_smgr inmem_smgr = -{ - .smgr_init = inmem_init, - .smgr_shutdown = NULL, - .smgr_open = inmem_open, - .smgr_close = inmem_close, - .smgr_create = inmem_create, - .smgr_exists = inmem_exists, - .smgr_unlink = inmem_unlink, - .smgr_extend = inmem_extend, - .smgr_prefetch = inmem_prefetch, - .smgr_read = inmem_read, - .smgr_write = inmem_write, - .smgr_writeback = inmem_writeback, - .smgr_nblocks = inmem_nblocks, - .smgr_truncate = inmem_truncate, - .smgr_immedsync = inmem_immedsync, -}; - -const f_smgr * -smgr_inmem(BackendId backend, RelFileNode rnode) -{ - Assert(InRecovery); - if (backend != InvalidBackendId) - return smgr_standard(backend, rnode); - else - return &inmem_smgr; -} - -void -smgr_init_inmem() -{ - inmem_init(); -} diff --git a/contrib/neon/libpagestore.c b/contrib/neon/libpagestore.c deleted file mode 100644 index 2621421532a..00000000000 --- a/contrib/neon/libpagestore.c +++ /dev/null @@ -1,440 +0,0 @@ -/*------------------------------------------------------------------------- - * - * libpagestore.c - * Handles network communications with the remote pagestore. 
- * - * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group - * Portions Copyright (c) 1994, Regents of the University of California - * - * - * IDENTIFICATION - * contrib/neon/libpqpagestore.c - * - *------------------------------------------------------------------------- - */ -#include "postgres.h" - -#include "pagestore_client.h" -#include "fmgr.h" -#include "access/xlog.h" - -#include "libpq-fe.h" -#include "libpq/pqformat.h" -#include "libpq/libpq.h" - -#include "miscadmin.h" -#include "pgstat.h" -#include "utils/guc.h" - -#include "replication/walproposer.h" - -PG_MODULE_MAGIC; - -void _PG_init(void); - -#define PageStoreTrace DEBUG5 - -#define NEON_TAG "[NEON_SMGR] " -#define neon_log(tag, fmt, ...) ereport(tag, \ - (errmsg(NEON_TAG fmt, ## __VA_ARGS__), \ - errhidestmt(true), errhidecontext(true))) - -bool connected = false; -PGconn *pageserver_conn = NULL; - -char *page_server_connstring_raw; - -static ZenithResponse *pageserver_call(ZenithRequest *request); -page_server_api api = { - .request = pageserver_call -}; - -static void -pageserver_connect() -{ - char *query; - int ret; - - Assert(!connected); - - pageserver_conn = PQconnectdb(page_server_connstring); - - if (PQstatus(pageserver_conn) == CONNECTION_BAD) - { - char *msg = pchomp(PQerrorMessage(pageserver_conn)); - - PQfinish(pageserver_conn); - pageserver_conn = NULL; - ereport(ERROR, - (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), - errmsg(NEON_TAG "could not establish connection to pageserver"), - errdetail_internal("%s", msg))); - } - - query = psprintf("pagestream %s %s", zenith_tenant, zenith_timeline); - ret = PQsendQuery(pageserver_conn, query); - if (ret != 1) - { - PQfinish(pageserver_conn); - pageserver_conn = NULL; - neon_log(ERROR, "could not send pagestream command to pageserver"); - } - - while (PQisBusy(pageserver_conn)) - { - int wc; - - /* Sleep until there's something to do */ - wc = WaitLatchOrSocket(MyLatch, - WL_LATCH_SET | 
WL_SOCKET_READABLE | - WL_EXIT_ON_PM_DEATH, - PQsocket(pageserver_conn), - -1L, PG_WAIT_EXTENSION); - ResetLatch(MyLatch); - - CHECK_FOR_INTERRUPTS(); - - /* Data available in socket? */ - if (wc & WL_SOCKET_READABLE) - { - if (!PQconsumeInput(pageserver_conn)) - { - char *msg = pchomp(PQerrorMessage(pageserver_conn)); - - PQfinish(pageserver_conn); - pageserver_conn = NULL; - - neon_log(ERROR, "could not complete handshake with pageserver: %s", - msg); - } - } - } - - neon_log(LOG, "libpagestore: connected to '%s'", page_server_connstring_raw); - - connected = true; -} - -/* - * A wrapper around PQgetCopyData that checks for interrupts while sleeping. - */ -static int -call_PQgetCopyData(PGconn *conn, char **buffer) -{ - int ret; - -retry: - ret = PQgetCopyData(conn, buffer, 1 /* async */ ); - - if (ret == 0) - { - int wc; - - /* Sleep until there's something to do */ - wc = WaitLatchOrSocket(MyLatch, - WL_LATCH_SET | WL_SOCKET_READABLE | - WL_EXIT_ON_PM_DEATH, - PQsocket(conn), - -1L, PG_WAIT_EXTENSION); - ResetLatch(MyLatch); - - CHECK_FOR_INTERRUPTS(); - - /* Data available in socket? */ - if (wc & WL_SOCKET_READABLE) - { - if (!PQconsumeInput(conn)) - neon_log(ERROR, "could not get response from pageserver: %s", - PQerrorMessage(conn)); - } - - goto retry; - } - - return ret; -} - - -static ZenithResponse * -pageserver_call(ZenithRequest *request) -{ - StringInfoData req_buff; - StringInfoData resp_buff; - ZenithResponse *resp; - - PG_TRY(); - { - /* If the connection was lost for some reason, reconnect */ - if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD) - { - PQfinish(pageserver_conn); - pageserver_conn = NULL; - connected = false; - } - - if (!connected) - pageserver_connect(); - - req_buff = zm_pack_request(request); - - /* - * Send request. - * - * In principle, this could block if the output buffer is full, and we - * should use async mode and check for interrupts while waiting. 
In - * practice, our requests are small enough to always fit in the output - * and TCP buffer. - */ - if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0 || PQflush(pageserver_conn)) - { - neon_log(ERROR, "failed to send page request: %s", - PQerrorMessage(pageserver_conn)); - } - pfree(req_buff.data); - - if (message_level_is_interesting(PageStoreTrace)) - { - char *msg = zm_to_string((ZenithMessage *) request); - - neon_log(PageStoreTrace, "sent request: %s", msg); - pfree(msg); - } - - /* read response */ - resp_buff.len = call_PQgetCopyData(pageserver_conn, &resp_buff.data); - resp_buff.cursor = 0; - - if (resp_buff.len == -1) - neon_log(ERROR, "end of COPY"); - else if (resp_buff.len == -2) - neon_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn)); - - resp = zm_unpack_response(&resp_buff); - PQfreemem(resp_buff.data); - - if (message_level_is_interesting(PageStoreTrace)) - { - char *msg = zm_to_string((ZenithMessage *) resp); - - neon_log(PageStoreTrace, "got response: %s", msg); - pfree(msg); - } - } - PG_CATCH(); - { - /* - * If anything goes wrong while we were sending a request, it's not - * clear what state the connection is in. For example, if we sent the - * request but didn't receive a response yet, we might receive the - * response some time later after we have already sent a new unrelated - * request. Close the connection to avoid getting confused. 
- */ - if (connected) - { - neon_log(LOG, "dropping connection to page server due to error"); - PQfinish(pageserver_conn); - pageserver_conn = NULL; - connected = false; - } - PG_RE_THROW(); - } - PG_END_TRY(); - - return (ZenithResponse *) resp; -} - - -static bool -check_zenith_id(char **newval, void **extra, GucSource source) -{ - uint8 zid[16]; - - return **newval == '\0' || HexDecodeString(zid, *newval, 16); -} - -static char * -substitute_pageserver_password(const char *page_server_connstring_raw) -{ - char *host = NULL; - char *port = NULL; - char *user = NULL; - char *auth_token = NULL; - char *err = NULL; - char *page_server_connstring = NULL; - PQconninfoOption *conn_options; - PQconninfoOption *conn_option; - MemoryContext oldcontext; - - /* - * Here we substitute password in connection string with an environment - * variable. To simplify things we construct a connection string back with - * only known options. In particular: host port user and password. We do - * not currently use other options and constructing full connstring in an - * URI shape is quite messy. - */ - - if (page_server_connstring_raw == NULL || page_server_connstring_raw[0] == '\0') - return NULL; - - /* extract the auth token from the connection string */ - conn_options = PQconninfoParse(page_server_connstring_raw, &err); - if (conn_options == NULL) - { - /* The error string is malloc'd, so we must free it explicitly */ - char *errcopy = err ? pstrdup(err) : "out of memory"; - - PQfreemem(err); - ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("invalid connection string syntax: %s", errcopy))); - } - - /* - * Trying to populate pageserver connection string with auth token from - * environment. We are looking for password in with placeholder value like - * $ENV_VAR_NAME, so if password field is present and starts with $ we try - * to fetch environment variable value and fail loudly if it is not set. 
- */ - for (conn_option = conn_options; conn_option->keyword != NULL; conn_option++) - { - if (strcmp(conn_option->keyword, "host") == 0) - { - if (conn_option->val != NULL && conn_option->val[0] != '\0') - host = conn_option->val; - } - else if (strcmp(conn_option->keyword, "port") == 0) - { - if (conn_option->val != NULL && conn_option->val[0] != '\0') - port = conn_option->val; - } - else if (strcmp(conn_option->keyword, "user") == 0) - { - if (conn_option->val != NULL && conn_option->val[0] != '\0') - user = conn_option->val; - } - else if (strcmp(conn_option->keyword, "password") == 0) - { - if (conn_option->val != NULL && conn_option->val[0] != '\0') - { - /* ensure that this is a template */ - if (strncmp(conn_option->val, "$", 1) != 0) - ereport(ERROR, - (errcode(ERRCODE_CONNECTION_EXCEPTION), - errmsg("expected placeholder value in pageserver password starting from $ but found: %s", &conn_option->val[1]))); - - neon_log(LOG, "found auth token placeholder in pageserver conn string '%s'", &conn_option->val[1]); - auth_token = getenv(&conn_option->val[1]); - if (!auth_token) - { - ereport(ERROR, - (errcode(ERRCODE_CONNECTION_EXCEPTION), - errmsg("cannot get auth token, environment variable %s is not set", &conn_option->val[1]))); - } - else - { - neon_log(LOG, "using auth token from environment passed via env"); - } - } - } - } - - /* - * allocate connection string in TopMemoryContext to make sure it is not - * freed - */ - oldcontext = CurrentMemoryContext; - MemoryContextSwitchTo(TopMemoryContext); - page_server_connstring = psprintf("postgresql://%s:%s@%s:%s", user, auth_token ? 
auth_token : "", host, port); - MemoryContextSwitchTo(oldcontext); - - PQconninfoFree(conn_options); - return page_server_connstring; -} - -/* - * Module initialization function - */ -void -_PG_init(void) -{ - DefineCustomStringVariable("neon.pageserver_connstring", - "connection string to the page server", - NULL, - &page_server_connstring_raw, - "", - PGC_POSTMASTER, - 0, /* no flags required */ - NULL, NULL, NULL); - - DefineCustomStringVariable("neon.timeline_id", - "Zenith timelineid the server is running on", - NULL, - &zenith_timeline, - "", - PGC_POSTMASTER, - 0, /* no flags required */ - check_zenith_id, NULL, NULL); - - DefineCustomStringVariable("neon.tenant_id", - "Neon tenantid the server is running on", - NULL, - &zenith_tenant, - "", - PGC_POSTMASTER, - 0, /* no flags required */ - check_zenith_id, NULL, NULL); - - DefineCustomBoolVariable("neon.wal_redo", - "start in wal-redo mode", - NULL, - &wal_redo, - false, - PGC_POSTMASTER, - 0, - NULL, NULL, NULL); - - DefineCustomIntVariable("neon.max_cluster_size", - "cluster size limit", - NULL, - &max_cluster_size, - -1, -1, INT_MAX, - PGC_SIGHUP, - GUC_UNIT_MB, - NULL, NULL, NULL); - - relsize_hash_init(); - EmitWarningsOnPlaceholders("neon"); - - if (page_server != NULL) - neon_log(ERROR, "libpagestore already loaded"); - - neon_log(PageStoreTrace, "libpagestore already loaded"); - page_server = &api; - - /* substitute password in pageserver_connstring */ - page_server_connstring = substitute_pageserver_password(page_server_connstring_raw); - - /* Is there more correct way to pass CustomGUC to postgres code? 
*/ - zenith_timeline_walproposer = zenith_timeline; - zenith_tenant_walproposer = zenith_tenant; - - /* - * Walproposer instructs safekeeper which pageserver to use for - * replication - */ - zenith_pageserver_connstring_walproposer = page_server_connstring; - - if (wal_redo) - { - neon_log(PageStoreTrace, "set inmem_smgr hook"); - smgr_hook = smgr_inmem; - smgr_init_hook = smgr_init_inmem; - } - else if (page_server_connstring && page_server_connstring[0]) - { - neon_log(PageStoreTrace, "set neon_smgr hook"); - smgr_hook = smgr_zenith; - smgr_init_hook = smgr_init_zenith; - dbsize_hook = zenith_dbsize; - } -} diff --git a/contrib/neon/neon--1.0.sql b/contrib/neon/neon--1.0.sql deleted file mode 100644 index 34f1ba78d4f..00000000000 --- a/contrib/neon/neon--1.0.sql +++ /dev/null @@ -1,17 +0,0 @@ -\echo Use "CREATE EXTENSION neon" to load this file. \quit - -CREATE FUNCTION pg_cluster_size() -RETURNS bigint -AS 'MODULE_PATHNAME', 'pg_cluster_size' -LANGUAGE C STRICT -PARALLEL UNSAFE; - -CREATE FUNCTION backpressure_lsns( - OUT received_lsn pg_lsn, - OUT disk_consistent_lsn pg_lsn, - OUT remote_consistent_lsn pg_lsn -) -RETURNS record -AS 'MODULE_PATHNAME', 'backpressure_lsns' -LANGUAGE C STRICT -PARALLEL UNSAFE; diff --git a/contrib/neon/neon.c b/contrib/neon/neon.c deleted file mode 100644 index c7c176dba7a..00000000000 --- a/contrib/neon/neon.c +++ /dev/null @@ -1,66 +0,0 @@ -/*------------------------------------------------------------------------- - * - * neon.c - * Utility functions to expose neon specific information to user - * - * IDENTIFICATION - * contrib/neon/neon.c - * - *------------------------------------------------------------------------- - */ -#include "postgres.h" -#include "fmgr.h" - -#include "access/xact.h" -#include "access/xlog.h" -#include "storage/buf_internals.h" -#include "storage/bufmgr.h" -#include "catalog/pg_type.h" -#include "replication/walsender.h" -#include "replication/walproposer.h" -#include "funcapi.h" -#include 
"access/htup_details.h" -#include "utils/pg_lsn.h" - -PG_FUNCTION_INFO_V1(pg_cluster_size); -PG_FUNCTION_INFO_V1(backpressure_lsns); - -Datum -pg_cluster_size(PG_FUNCTION_ARGS) -{ - int64 size; - - size = GetZenithCurrentClusterSize(); - - if (size == 0) - PG_RETURN_NULL(); - - PG_RETURN_INT64(size); -} - - -Datum -backpressure_lsns(PG_FUNCTION_ARGS) -{ - XLogRecPtr writePtr; - XLogRecPtr flushPtr; - XLogRecPtr applyPtr; - Datum values[3]; - bool nulls[3]; - TupleDesc tupdesc; - - replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr); - - tupdesc = CreateTemplateTupleDesc(3); - TupleDescInitEntry(tupdesc, (AttrNumber) 1, "received_lsn", PG_LSNOID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 2, "disk_consistent_lsn", PG_LSNOID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 3, "remote_consistent_lsn", PG_LSNOID, -1, 0); - tupdesc = BlessTupleDesc(tupdesc); - - MemSet(nulls, 0, sizeof(nulls)); - values[0] = LSNGetDatum(writePtr); - values[1] = LSNGetDatum(flushPtr); - values[2] = LSNGetDatum(applyPtr); - - PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls))); -} diff --git a/contrib/neon/neon.control b/contrib/neon/neon.control deleted file mode 100644 index 84f79881c1e..00000000000 --- a/contrib/neon/neon.control +++ /dev/null @@ -1,4 +0,0 @@ -# neon extension -comment = 'cloud storage for PostgreSQL' -default_version = '1.0' -module_pathname = '$libdir/neon' diff --git a/contrib/neon/pagestore_client.h b/contrib/neon/pagestore_client.h deleted file mode 100644 index 93ea6771eb9..00000000000 --- a/contrib/neon/pagestore_client.h +++ /dev/null @@ -1,221 +0,0 @@ -/*------------------------------------------------------------------------- - * - * pagestore_client.h - * - * - * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group - * Portions Copyright (c) 1994, Regents of the University of California - * - * contrib/neon/pagestore_client.h - * - 
*------------------------------------------------------------------------- - */ -#ifndef pageserver_h -#define pageserver_h - -#include "postgres.h" - -#include "access/xlogdefs.h" -#include "storage/relfilenode.h" -#include "storage/block.h" -#include "storage/smgr.h" -#include "lib/stringinfo.h" -#include "libpq/pqformat.h" -#include "utils/memutils.h" - -#include "pg_config.h" - -typedef enum -{ - /* pagestore_client -> pagestore */ - T_ZenithExistsRequest = 0, - T_ZenithNblocksRequest, - T_ZenithGetPageRequest, - T_ZenithDbSizeRequest, - - /* pagestore -> pagestore_client */ - T_ZenithExistsResponse = 100, - T_ZenithNblocksResponse, - T_ZenithGetPageResponse, - T_ZenithErrorResponse, - T_ZenithDbSizeResponse, -} ZenithMessageTag; - - - -/* base struct for c-style inheritance */ -typedef struct -{ - ZenithMessageTag tag; -} ZenithMessage; - -#define messageTag(m) (((const ZenithMessage *)(m))->tag) - -/* - * supertype of all the Zenith*Request structs below - * - * If 'latest' is true, we are requesting the latest page version, and 'lsn' - * is just a hint to the server that we know there are no versions of the page - * (or relation size, for exists/nblocks requests) later than the 'lsn'. 
- */ -typedef struct -{ - ZenithMessageTag tag; - bool latest; /* if true, request latest page version */ - XLogRecPtr lsn; /* request page version @ this LSN */ -} ZenithRequest; - -typedef struct -{ - ZenithRequest req; - RelFileNode rnode; - ForkNumber forknum; -} ZenithExistsRequest; - -typedef struct -{ - ZenithRequest req; - RelFileNode rnode; - ForkNumber forknum; -} ZenithNblocksRequest; - - -typedef struct -{ - ZenithRequest req; - Oid dbNode; -} ZenithDbSizeRequest; - - -typedef struct -{ - ZenithRequest req; - RelFileNode rnode; - ForkNumber forknum; - BlockNumber blkno; -} ZenithGetPageRequest; - -/* supertype of all the Zenith*Response structs below */ -typedef struct -{ - ZenithMessageTag tag; -} ZenithResponse; - -typedef struct -{ - ZenithMessageTag tag; - bool exists; -} ZenithExistsResponse; - -typedef struct -{ - ZenithMessageTag tag; - uint32 n_blocks; -} ZenithNblocksResponse; - -typedef struct -{ - ZenithMessageTag tag; - char page[FLEXIBLE_ARRAY_MEMBER]; -} ZenithGetPageResponse; - -typedef struct -{ - ZenithMessageTag tag; - int64 db_size; -} ZenithDbSizeResponse; - -typedef struct -{ - ZenithMessageTag tag; - char message[FLEXIBLE_ARRAY_MEMBER]; /* null-terminated error message */ -} ZenithErrorResponse; - -extern StringInfoData zm_pack_request(ZenithRequest *msg); -extern ZenithResponse *zm_unpack_response(StringInfo s); -extern char *zm_to_string(ZenithMessage *msg); - -/* - * API - */ - -typedef struct -{ - ZenithResponse *(*request) (ZenithRequest *request); -} page_server_api; - -extern page_server_api *page_server; - -extern char *page_server_connstring; -extern char *zenith_timeline; -extern char *zenith_tenant; -extern bool wal_redo; -extern int32 max_cluster_size; - -extern const f_smgr *smgr_zenith(BackendId backend, RelFileNode rnode); -extern void smgr_init_zenith(void); - -extern const f_smgr *smgr_inmem(BackendId backend, RelFileNode rnode); -extern void smgr_init_inmem(void); -extern void smgr_shutdown_inmem(void); - -/* 
zenith storage manager functionality */ - -extern void zenith_init(void); -extern void zenith_open(SMgrRelation reln); -extern void zenith_close(SMgrRelation reln, ForkNumber forknum); -extern void zenith_create(SMgrRelation reln, ForkNumber forknum, bool isRedo); -extern bool zenith_exists(SMgrRelation reln, ForkNumber forknum); -extern void zenith_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo); -extern void zenith_extend(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer, bool skipFsync); -extern bool zenith_prefetch(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum); -extern void zenith_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - char *buffer); - -extern void zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, char *buffer); - -extern void zenith_write(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer, bool skipFsync); -extern void zenith_writeback(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, BlockNumber nblocks); -extern BlockNumber zenith_nblocks(SMgrRelation reln, ForkNumber forknum); -extern int64 zenith_dbsize(Oid dbNode); -extern void zenith_truncate(SMgrRelation reln, ForkNumber forknum, - BlockNumber nblocks); -extern void zenith_immedsync(SMgrRelation reln, ForkNumber forknum); - -/* zenith wal-redo storage manager functionality */ - -extern void inmem_init(void); -extern void inmem_open(SMgrRelation reln); -extern void inmem_close(SMgrRelation reln, ForkNumber forknum); -extern void inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo); -extern bool inmem_exists(SMgrRelation reln, ForkNumber forknum); -extern void inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo); -extern void inmem_extend(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer, bool skipFsync); -extern bool inmem_prefetch(SMgrRelation reln, 
ForkNumber forknum, - BlockNumber blocknum); -extern void inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - char *buffer); -extern void inmem_write(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer, bool skipFsync); -extern void inmem_writeback(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, BlockNumber nblocks); -extern BlockNumber inmem_nblocks(SMgrRelation reln, ForkNumber forknum); -extern void inmem_truncate(SMgrRelation reln, ForkNumber forknum, - BlockNumber nblocks); -extern void inmem_immedsync(SMgrRelation reln, ForkNumber forknum); - - -/* utils for zenith relsize cache */ -extern void relsize_hash_init(void); -extern bool get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber* size); -extern void set_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size); -extern void update_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size); -extern void forget_cached_relsize(RelFileNode rnode, ForkNumber forknum); - -#endif diff --git a/contrib/neon/pagestore_smgr.c b/contrib/neon/pagestore_smgr.c deleted file mode 100644 index 4f5706ed3ee..00000000000 --- a/contrib/neon/pagestore_smgr.c +++ /dev/null @@ -1,1698 +0,0 @@ -/*------------------------------------------------------------------------- - * - * pagestore_smgr.c - * - * - * - * Temporary and unlogged rels - * --------------------------- - * - * Temporary and unlogged tables are stored locally, by md.c. The functions - * here just pass the calls through to corresponding md.c functions. - * - * Index build operations that use the buffer cache are also handled locally, - * just like unlogged tables. Such operations must be marked by calling - * smgr_start_unlogged_build() and friends. 
- * - * In order to know what relations are permanent and which ones are not, we - * have added a 'smgr_relpersistence' field to SmgrRelationData, and it is set - * by smgropen() callers, when they have the relcache entry at hand. However, - * sometimes we need to open an SmgrRelation for a relation without the - * relcache. That is needed when we evict a buffer; we might not have the - * SmgrRelation for that relation open yet. To deal with that, the - * 'relpersistence' can be left to zero, meaning we don't know if it's - * permanent or not. Most operations are not allowed with relpersistence==0, - * but smgrwrite() does work, which is what we need for buffer eviction. and - * smgrunlink() so that a backend doesn't need to have the relcache entry at - * transaction commit, where relations that were dropped in the transaction - * are unlinked. - * - * If smgrwrite() is called and smgr_relpersistence == 0, we check if the - * relation file exists locally or not. If it does exist, we assume it's an - * unlogged relation and write the page there. Otherwise it must be a - * permanent relation, WAL-logged and stored on the page server, and we ignore - * the write like we do for permanent relations. 
- * - * - * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group - * Portions Copyright (c) 1994, Regents of the University of California - * - * - * IDENTIFICATION - * contrib/neon/pagestore_smgr.c - * - *------------------------------------------------------------------------- - */ -#include "postgres.h" - -#include "access/xact.h" -#include "access/xlog.h" -#include "access/xlogutils.h" -#include "access/xloginsert.h" -#include "access/xlogrecovery.h" -#include "access/xlog_internal.h" -#include "catalog/pg_class.h" -#include "pagestore_client.h" -#include "pagestore_client.h" -#include "storage/smgr.h" -#include "access/xlogdefs.h" -#include "postmaster/interrupt.h" -#include "replication/walsender.h" -#include "storage/bufmgr.h" -#include "storage/md.h" -#include "fmgr.h" -#include "miscadmin.h" -#include "pgstat.h" -#include "catalog/pg_tablespace_d.h" -#include "postmaster/autovacuum.h" - -/* - * If DEBUG_COMPARE_LOCAL is defined, we pass through all the SMGR API - * calls to md.c, and *also* do the calls to the Page Server. On every - * read, compare the versions we read from local disk and Page Server, - * and Assert that they are identical. 
- */ -/* #define DEBUG_COMPARE_LOCAL */ - -#ifdef DEBUG_COMPARE_LOCAL -#include "access/nbtree.h" -#include "storage/bufpage.h" -#include "access/xlog_internal.h" - -static char *hexdump_page(char *page); -#endif - -#define IS_LOCAL_REL(reln) (reln->smgr_rnode.node.dbNode != 0 && reln->smgr_rnode.node.relNode > FirstNormalObjectId) - -const int SmgrTrace = DEBUG5; - -page_server_api *page_server; - -/* GUCs */ -char *page_server_connstring; // with substituted password -char *zenith_timeline; -char *zenith_tenant; -bool wal_redo = false; -int32 max_cluster_size; - -/* unlogged relation build states */ -typedef enum -{ - UNLOGGED_BUILD_NOT_IN_PROGRESS = 0, - UNLOGGED_BUILD_PHASE_1, - UNLOGGED_BUILD_PHASE_2, - UNLOGGED_BUILD_NOT_PERMANENT -} UnloggedBuildPhase; - -static SMgrRelation unlogged_build_rel = NULL; -static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; - -StringInfoData -zm_pack_request(ZenithRequest *msg) -{ - StringInfoData s; - - initStringInfo(&s); - pq_sendbyte(&s, msg->tag); - - switch (messageTag(msg)) - { - /* pagestore_client -> pagestore */ - case T_ZenithExistsRequest: - { - ZenithExistsRequest *msg_req = (ZenithExistsRequest *) msg; - - pq_sendbyte(&s, msg_req->req.latest); - pq_sendint64(&s, msg_req->req.lsn); - pq_sendint32(&s, msg_req->rnode.spcNode); - pq_sendint32(&s, msg_req->rnode.dbNode); - pq_sendint32(&s, msg_req->rnode.relNode); - pq_sendbyte(&s, msg_req->forknum); - - break; - } - case T_ZenithNblocksRequest: - { - ZenithNblocksRequest *msg_req = (ZenithNblocksRequest *) msg; - - pq_sendbyte(&s, msg_req->req.latest); - pq_sendint64(&s, msg_req->req.lsn); - pq_sendint32(&s, msg_req->rnode.spcNode); - pq_sendint32(&s, msg_req->rnode.dbNode); - pq_sendint32(&s, msg_req->rnode.relNode); - pq_sendbyte(&s, msg_req->forknum); - - break; - } - case T_ZenithDbSizeRequest: - { - ZenithDbSizeRequest *msg_req = (ZenithDbSizeRequest *) msg; - - pq_sendbyte(&s, msg_req->req.latest); - pq_sendint64(&s, 
msg_req->req.lsn); - pq_sendint32(&s, msg_req->dbNode); - - break; - } - case T_ZenithGetPageRequest: - { - ZenithGetPageRequest *msg_req = (ZenithGetPageRequest *) msg; - - pq_sendbyte(&s, msg_req->req.latest); - pq_sendint64(&s, msg_req->req.lsn); - pq_sendint32(&s, msg_req->rnode.spcNode); - pq_sendint32(&s, msg_req->rnode.dbNode); - pq_sendint32(&s, msg_req->rnode.relNode); - pq_sendbyte(&s, msg_req->forknum); - pq_sendint32(&s, msg_req->blkno); - - break; - } - - /* pagestore -> pagestore_client. We never need to create these. */ - case T_ZenithExistsResponse: - case T_ZenithNblocksResponse: - case T_ZenithGetPageResponse: - case T_ZenithErrorResponse: - case T_ZenithDbSizeResponse: - default: - elog(ERROR, "unexpected zenith message tag 0x%02x", msg->tag); - break; - } - return s; -} - -ZenithResponse * -zm_unpack_response(StringInfo s) -{ - ZenithMessageTag tag = pq_getmsgbyte(s); - ZenithResponse *resp = NULL; - - switch (tag) - { - /* pagestore -> pagestore_client */ - case T_ZenithExistsResponse: - { - ZenithExistsResponse *msg_resp = palloc0(sizeof(ZenithExistsResponse)); - - msg_resp->tag = tag; - msg_resp->exists = pq_getmsgbyte(s); - pq_getmsgend(s); - - resp = (ZenithResponse *) msg_resp; - break; - } - - case T_ZenithNblocksResponse: - { - ZenithNblocksResponse *msg_resp = palloc0(sizeof(ZenithNblocksResponse)); - - msg_resp->tag = tag; - msg_resp->n_blocks = pq_getmsgint(s, 4); - pq_getmsgend(s); - - resp = (ZenithResponse *) msg_resp; - break; - } - - case T_ZenithGetPageResponse: - { - ZenithGetPageResponse *msg_resp = palloc0(offsetof(ZenithGetPageResponse, page) + BLCKSZ); - - msg_resp->tag = tag; - /* XXX: should be varlena */ - memcpy(msg_resp->page, pq_getmsgbytes(s, BLCKSZ), BLCKSZ); - pq_getmsgend(s); - - resp = (ZenithResponse *) msg_resp; - break; - } - - case T_ZenithDbSizeResponse: - { - ZenithDbSizeResponse *msg_resp = palloc0(sizeof(ZenithDbSizeResponse)); - - msg_resp->tag = tag; - msg_resp->db_size = pq_getmsgint64(s); - 
pq_getmsgend(s); - - resp = (ZenithResponse *) msg_resp; - break; - } - - case T_ZenithErrorResponse: - { - ZenithErrorResponse *msg_resp; - size_t msglen; - const char *msgtext; - - msgtext = pq_getmsgrawstring(s); - msglen = strlen(msgtext); - - msg_resp = palloc0(sizeof(ZenithErrorResponse) + msglen + 1); - msg_resp->tag = tag; - memcpy(msg_resp->message, msgtext, msglen + 1); - pq_getmsgend(s); - - resp = (ZenithResponse *) msg_resp; - break; - } - - /* - * pagestore_client -> pagestore - * - * We create these ourselves, and don't need to decode them. - */ - case T_ZenithExistsRequest: - case T_ZenithNblocksRequest: - case T_ZenithGetPageRequest: - case T_ZenithDbSizeRequest: - default: - elog(ERROR, "unexpected zenith message tag 0x%02x", tag); - break; - } - - return resp; -} - -/* dump to json for debugging / error reporting purposes */ -char * -zm_to_string(ZenithMessage *msg) -{ - StringInfoData s; - - initStringInfo(&s); - - switch (messageTag(msg)) - { - /* pagestore_client -> pagestore */ - case T_ZenithExistsRequest: - { - ZenithExistsRequest *msg_req = (ZenithExistsRequest *) msg; - - appendStringInfoString(&s, "{\"type\": \"ZenithExistsRequest\""); - appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", - msg_req->rnode.spcNode, - msg_req->rnode.dbNode, - msg_req->rnode.relNode); - appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); - appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); - appendStringInfoChar(&s, '}'); - break; - } - - case T_ZenithNblocksRequest: - { - ZenithNblocksRequest *msg_req = (ZenithNblocksRequest *) msg; - - appendStringInfoString(&s, "{\"type\": \"ZenithNblocksRequest\""); - appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", - msg_req->rnode.spcNode, - msg_req->rnode.dbNode, - msg_req->rnode.relNode); - appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); - appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", 
LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); - appendStringInfoChar(&s, '}'); - break; - } - - case T_ZenithGetPageRequest: - { - ZenithGetPageRequest *msg_req = (ZenithGetPageRequest *) msg; - - appendStringInfoString(&s, "{\"type\": \"ZenithGetPageRequest\""); - appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", - msg_req->rnode.spcNode, - msg_req->rnode.dbNode, - msg_req->rnode.relNode); - appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); - appendStringInfo(&s, ", \"blkno\": %u", msg_req->blkno); - appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); - appendStringInfoChar(&s, '}'); - break; - } - case T_ZenithDbSizeRequest: - { - ZenithDbSizeRequest *msg_req = (ZenithDbSizeRequest *) msg; - - appendStringInfoString(&s, "{\"type\": \"ZenithDbSizeRequest\""); - appendStringInfo(&s, ", \"dbnode\": \"%u\"", msg_req->dbNode); - appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); - appendStringInfoChar(&s, '}'); - break; - } - - - /* pagestore -> pagestore_client */ - case T_ZenithExistsResponse: - { - ZenithExistsResponse *msg_resp = (ZenithExistsResponse *) msg; - - appendStringInfoString(&s, "{\"type\": \"ZenithExistsResponse\""); - appendStringInfo(&s, ", \"exists\": %d}", - msg_resp->exists - ); - appendStringInfoChar(&s, '}'); - - break; - } - case T_ZenithNblocksResponse: - { - ZenithNblocksResponse *msg_resp = (ZenithNblocksResponse *) msg; - - appendStringInfoString(&s, "{\"type\": \"ZenithNblocksResponse\""); - appendStringInfo(&s, ", \"n_blocks\": %u}", - msg_resp->n_blocks - ); - appendStringInfoChar(&s, '}'); - - break; - } - case T_ZenithGetPageResponse: - { -#if 0 - ZenithGetPageResponse *msg_resp = (ZenithGetPageResponse *) msg; -#endif - - appendStringInfoString(&s, "{\"type\": 
\"ZenithGetPageResponse\""); - appendStringInfo(&s, ", \"page\": \"XXX\"}"); - appendStringInfoChar(&s, '}'); - break; - } - case T_ZenithErrorResponse: - { - ZenithErrorResponse *msg_resp = (ZenithErrorResponse *) msg; - - /* FIXME: escape double-quotes in the message */ - appendStringInfoString(&s, "{\"type\": \"ZenithErrorResponse\""); - appendStringInfo(&s, ", \"message\": \"%s\"}", msg_resp->message); - appendStringInfoChar(&s, '}'); - break; - } - case T_ZenithDbSizeResponse: - { - ZenithDbSizeResponse *msg_resp = (ZenithDbSizeResponse *) msg; - - appendStringInfoString(&s, "{\"type\": \"ZenithDbSizeResponse\""); - appendStringInfo(&s, ", \"db_size\": %ld}", - msg_resp->db_size - ); - appendStringInfoChar(&s, '}'); - - break; - } - - default: - appendStringInfo(&s, "{\"type\": \"unknown 0x%02x\"", msg->tag); - } - return s.data; -} - -/* - * Wrapper around log_newpage() that makes a temporary copy of the block and - * WAL-logs that. This makes it safe to use while holding only a shared lock - * on the page, see XLogSaveBufferForHint. We don't use XLogSaveBufferForHint - * directly because it skips the logging if the LSN is new enough. - */ -static XLogRecPtr -log_newpage_copy(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno, - Page page, bool page_std) -{ - PGAlignedBlock copied_buffer; - - memcpy(copied_buffer.data, page, BLCKSZ); - return log_newpage(rnode, forkNum, blkno, copied_buffer.data, page_std); -} - -/* - * Is 'buffer' identical to a freshly initialized empty heap page? - */ -static bool -PageIsEmptyHeapPage(char *buffer) -{ - PGAlignedBlock empty_page; - - PageInit((Page) empty_page.data, BLCKSZ, 0); - - return memcmp(buffer, empty_page.data, BLCKSZ) == 0; -} - -static void -zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer) -{ - XLogRecPtr lsn = PageGetLSN(buffer); - - if (ShutdownRequestPending) - return; - - /* - * Whenever a VM or FSM page is evicted, WAL-log it. 
FSM and (some) VM - * changes are not WAL-logged when the changes are made, so this is our - * last chance to log them, otherwise they're lost. That's OK for - * correctness, the non-logged updates are not critical. But we want to - * have a reasonably up-to-date VM and FSM in the page server. - */ - if (forknum == FSM_FORKNUM && !RecoveryInProgress()) - { - /* FSM is never WAL-logged and we don't care. */ - XLogRecPtr recptr; - - recptr = log_newpage_copy(&reln->smgr_rnode.node, forknum, blocknum, buffer, false); - XLogFlush(recptr); - lsn = recptr; - ereport(SmgrTrace, - (errmsg("FSM page %u of relation %u/%u/%u.%u was force logged. Evicted at lsn=%X/%X", - blocknum, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - forknum, LSN_FORMAT_ARGS(lsn)))); - } - else if (forknum == VISIBILITYMAP_FORKNUM && !RecoveryInProgress()) - { - /* - * Always WAL-log vm. We should never miss clearing visibility map - * bits. - * - * TODO Is it too bad for performance? Hopefully we do not evict - * actively used vm too often. - */ - XLogRecPtr recptr; - - recptr = log_newpage_copy(&reln->smgr_rnode.node, forknum, blocknum, buffer, false); - XLogFlush(recptr); - lsn = recptr; - - ereport(SmgrTrace, - (errmsg("Visibilitymap page %u of relation %u/%u/%u.%u was force logged at lsn=%X/%X", - blocknum, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - forknum, LSN_FORMAT_ARGS(lsn)))); - } - else if (lsn == InvalidXLogRecPtr) - { - /* - * When PostgreSQL extends a relation, it calls smgrextend() with an all-zeros pages, - * and we can just ignore that in Zenith. We do need to remember the new size, - * though, so that smgrnblocks() returns the right answer after the rel has - * been extended. We rely on the relsize cache for that. - * - * A completely empty heap page doesn't need to be WAL-logged, either. The - * heapam can leave such a page behind, if e.g. 
an insert errors out after - * initializing the page, but before it has inserted the tuple and WAL-logged - * the change. When we read the page from the page server, it will come back - * as all-zeros. That's OK, the heapam will initialize an all-zeros page on - * first use. - * - * In other scenarios, evicting a dirty page with no LSN is a bad sign: it implies - * that the page was not WAL-logged, and its contents will be lost when it's - * evicted. - */ - if (PageIsNew(buffer)) - { - ereport(SmgrTrace, - (errmsg("Page %u of relation %u/%u/%u.%u is all-zeros", - blocknum, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - forknum))); - } - else if (PageIsEmptyHeapPage(buffer)) - { - ereport(SmgrTrace, - (errmsg("Page %u of relation %u/%u/%u.%u is an empty heap page with no LSN", - blocknum, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - forknum))); - } - else - { - ereport(PANIC, - (errmsg("Page %u of relation %u/%u/%u.%u is evicted with zero LSN", - blocknum, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - forknum))); - } - } - else - { - ereport(SmgrTrace, - (errmsg("Page %u of relation %u/%u/%u.%u is already wal logged at lsn=%X/%X", - blocknum, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - forknum, LSN_FORMAT_ARGS(lsn)))); - } - - /* - * Remember the LSN on this page. When we read the page again, we must - * read the same or newer version of it. - */ - SetLastWrittenPageLSN(lsn); -} - - -/* - * zenith_init() -- Initialize private state - */ -void -zenith_init(void) -{ - /* noop */ -#ifdef DEBUG_COMPARE_LOCAL - mdinit(); -#endif -} - -/* - * GetXLogInsertRecPtr uses XLogBytePosToRecPtr to convert logical insert (reserved) position - * to physical position in WAL. 
It always adds SizeOfXLogShortPHD: - * seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD; - * so even if there are no records on the page, offset will be SizeOfXLogShortPHD. - * It may cause problems with XLogFlush. So return pointer backward to the origin of the page. - */ -static XLogRecPtr -zm_adjust_lsn(XLogRecPtr lsn) -{ - /* - * If lsn points to the beging of first record on page or segment, then - * "return" it back to the page origin - */ - if ((lsn & (XLOG_BLCKSZ - 1)) == SizeOfXLogShortPHD) - { - lsn -= SizeOfXLogShortPHD; - } - else if ((lsn & (wal_segment_size - 1)) == SizeOfXLogLongPHD) - { - lsn -= SizeOfXLogLongPHD; - } - return lsn; -} - -/* - * Return LSN for requesting pages and number of blocks from page server - */ -static XLogRecPtr -zenith_get_request_lsn(bool *latest) -{ - XLogRecPtr lsn; - - if (RecoveryInProgress()) - { - *latest = false; - lsn = GetXLogReplayRecPtr(NULL); - elog(DEBUG1, "zenith_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ", - (uint32) ((lsn) >> 32), (uint32) (lsn)); - } - else if (am_walsender) - { - *latest = true; - lsn = InvalidXLogRecPtr; - elog(DEBUG1, "am walsender zenith_get_request_lsn lsn 0 "); - } - else - { - XLogRecPtr flushlsn; - - /* - * Use the latest LSN that was evicted from the buffer cache. Any - * pages modified by later WAL records must still in the buffer cache, - * so our request cannot concern those. - */ - *latest = true; - lsn = GetLastWrittenPageLSN(); - Assert(lsn != InvalidXLogRecPtr); - elog(DEBUG1, "zenith_get_request_lsn GetLastWrittenPageLSN lsn %X/%X ", - (uint32) ((lsn) >> 32), (uint32) (lsn)); - - lsn = zm_adjust_lsn(lsn); - - /* - * Is it possible that the last-written LSN is ahead of last flush - * LSN? Generally not, we shouldn't evict a page from the buffer cache - * before all its modifications have been safely flushed. That's the - * "WAL before data" rule. 
However, such case does exist at index building, - * _bt_blwritepage logs the full page without flushing WAL before - * smgrextend (files are fsynced before build ends). - */ - flushlsn = GetFlushRecPtr(NULL); - if (lsn > flushlsn) - { - elog(DEBUG5, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X", - (uint32) (lsn >> 32), (uint32) lsn, - (uint32) (flushlsn >> 32), (uint32) flushlsn); - XLogFlush(lsn); - } - } - - return lsn; -} - - -/* - * zenith_exists() -- Does the physical file exist? - */ -bool -zenith_exists(SMgrRelation reln, ForkNumber forkNum) -{ - bool exists; - ZenithResponse *resp; - BlockNumber n_blocks; - bool latest; - XLogRecPtr request_lsn; - - switch (reln->smgr_relpersistence) - { - case 0: - /* - * We don't know if it's an unlogged rel stored locally, or permanent - * rel stored in the page server. First check if it exists locally. - * If it does, great. Otherwise check if it exists in the page server. - */ - if (mdexists(reln, forkNum)) - return true; - break; - - case RELPERSISTENCE_PERMANENT: - break; - - case RELPERSISTENCE_TEMP: - case RELPERSISTENCE_UNLOGGED: - return mdexists(reln, forkNum); - - default: - elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); - } - - if (get_cached_relsize(reln->smgr_rnode.node, forkNum, &n_blocks)) - { - return true; - } - - /* - * \d+ on a view calls smgrexists with 0/0/0 relfilenode. The page server - * will error out if you check that, because the whole dbdir for tablespace - * 0, db 0 doesn't exists. We possibly should change the page server to - * accept that and return 'false', to be consistent with mdexists(). But - * we probably also should fix pg_table_size() to not call smgrexists() - * with bogus relfilenode. - * - * For now, handle that special case here. 
- */ - if (reln->smgr_rnode.node.spcNode == 0 && - reln->smgr_rnode.node.dbNode == 0 && - reln->smgr_rnode.node.relNode == 0) - { - return false; - } - - request_lsn = zenith_get_request_lsn(&latest); - { - ZenithExistsRequest request = { - .req.tag = T_ZenithExistsRequest, - .req.latest = latest, - .req.lsn = request_lsn, - .rnode = reln->smgr_rnode.node, - .forknum = forkNum - }; - - resp = page_server->request((ZenithRequest *) &request); - } - - switch (resp->tag) - { - case T_ZenithExistsResponse: - exists = ((ZenithExistsResponse *) resp)->exists; - break; - - case T_ZenithErrorResponse: - ereport(ERROR, - (errcode(ERRCODE_IO_ERROR), - errmsg("could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X", - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - forkNum, - (uint32) (request_lsn >> 32), (uint32) request_lsn), - errdetail("page server returned error: %s", - ((ZenithErrorResponse *) resp)->message))); - break; - - default: - elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); - } - pfree(resp); - return exists; -} - -/* - * zenith_create() -- Create a new relation on zenithd storage - * - * If isRedo is true, it's okay for the relation to exist already. - */ -void -zenith_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) -{ - switch (reln->smgr_relpersistence) - { - case 0: - elog(ERROR, "cannot call smgrcreate() on rel with unknown persistence"); - - case RELPERSISTENCE_PERMANENT: - break; - - case RELPERSISTENCE_TEMP: - case RELPERSISTENCE_UNLOGGED: - mdcreate(reln, forkNum, isRedo); - return; - - default: - elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); - } - - elog(SmgrTrace, "Create relation %u/%u/%u.%u", - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - forkNum); - - /* - * Newly created relation is empty, remember that in the relsize cache. 
- * - * FIXME: This is currently not just an optimization, but required for - * correctness. Postgres can call smgrnblocks() on the newly-created - * relation. Currently, we don't call SetLastWrittenPageLSN() when a new - * relation created, so if we didn't remember the size in the relsize - * cache, we might call smgrnblocks() on the newly-created relation before - * the creation WAL record hass been received by the page server. - */ - set_cached_relsize(reln->smgr_rnode.node, forkNum, 0); - -#ifdef DEBUG_COMPARE_LOCAL - if (IS_LOCAL_REL(reln)) - mdcreate(reln, forkNum, isRedo); -#endif -} - -/* - * zenith_unlink() -- Unlink a relation. - * - * Note that we're passed a RelFileNodeBackend --- by the time this is called, - * there won't be an SMgrRelation hashtable entry anymore. - * - * forkNum can be a fork number to delete a specific fork, or InvalidForkNumber - * to delete all forks. - * - * - * If isRedo is true, it's unsurprising for the relation to be already gone. - * Also, we should remove the file immediately instead of queuing a request - * for later, since during redo there's no possibility of creating a - * conflicting relation. - * - * Note: any failure should be reported as WARNING not ERROR, because - * we are usually not in a transaction anymore when this is called. - */ -void -zenith_unlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) -{ - /* - * Might or might not exist locally, depending on whether it's - * an unlogged or permanent relation (or if DEBUG_COMPARE_LOCAL is - * set). Try to unlink, it won't do any harm if the file doesn't - * exist. - */ - mdunlink(rnode, forkNum, isRedo); - if (!RelFileNodeBackendIsTemp(rnode)) { - forget_cached_relsize(rnode.node, forkNum); - } -} - -/* - * zenith_extend() -- Add a block to the specified relation. - * - * The semantics are nearly the same as mdwrite(): write at the - * specified position. 
However, this is to be used for the case of - * extending a relation (i.e., blocknum is at or beyond the current - * EOF). Note that we assume writing a block beyond current EOF - * causes intervening file space to become filled with zeroes. - */ -void -zenith_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, - char *buffer, bool skipFsync) -{ - XLogRecPtr lsn; - - switch (reln->smgr_relpersistence) - { - case 0: - elog(ERROR, "cannot call smgrextend() on rel with unknown persistence"); - - case RELPERSISTENCE_PERMANENT: - break; - - case RELPERSISTENCE_TEMP: - case RELPERSISTENCE_UNLOGGED: - mdextend(reln, forkNum, blkno, buffer, skipFsync); - return; - - default: - elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); - } - - /* - * Check that the cluster size limit has not been exceeded. - * - * Temporary and unlogged relations are not included in the cluster size measured - * by the page server, so ignore those. Autovacuum processes are also exempt. - */ - if (max_cluster_size > 0 && - reln->smgr_relpersistence == RELPERSISTENCE_PERMANENT && - !IsAutoVacuumWorkerProcess()) - { - uint64 current_size = GetZenithCurrentClusterSize(); - - if (current_size >= ((uint64) max_cluster_size) * 1024 * 1024) - ereport(ERROR, - (errcode(ERRCODE_DISK_FULL), - errmsg("could not extend file because cluster size limit (%d MB) has been exceeded", - max_cluster_size), - errhint("This limit is defined by neon.max_cluster_size GUC"))); - } - - zenith_wallog_page(reln, forkNum, blkno, buffer); - set_cached_relsize(reln->smgr_rnode.node, forkNum, blkno + 1); - - lsn = PageGetLSN(buffer); - elog(SmgrTrace, "smgrextend called for %u/%u/%u.%u blk %u, page LSN: %X/%08X", - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - forkNum, blkno, - (uint32) (lsn >> 32), (uint32) lsn); - -#ifdef DEBUG_COMPARE_LOCAL - if (IS_LOCAL_REL(reln)) - mdextend(reln, forkNum, blkno, buffer, skipFsync); -#endif -} - -/* - * 
zenith_open() -- Initialize newly-opened relation. - */ -void -zenith_open(SMgrRelation reln) -{ - /* - * We don't have anything special to do here. Call mdopen() to let md.c - * initialize itself. That's only needed for temporary or unlogged - * relations, but it's dirt cheap so do it always to make sure the md - * fields are initialized, for debugging purposes if nothing else. - */ - mdopen(reln); - - /* no work */ - elog(SmgrTrace, "[ZENITH_SMGR] open noop"); -} - -/* - * zenith_close() -- Close the specified relation, if it isn't closed already. - */ -void -zenith_close(SMgrRelation reln, ForkNumber forknum) -{ - /* - * Let md.c close it, if it had it open. Doesn't hurt to do this - * even for permanent relations that have no local storage. - */ - mdclose(reln, forknum); -} - -/* - * zenith_prefetch() -- Initiate asynchronous read of the specified block of a relation - */ -bool -zenith_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) -{ - switch (reln->smgr_relpersistence) - { - case 0: - /* probably shouldn't happen, but ignore it */ - break; - - case RELPERSISTENCE_PERMANENT: - break; - - case RELPERSISTENCE_TEMP: - case RELPERSISTENCE_UNLOGGED: - return mdprefetch(reln, forknum, blocknum); - - default: - elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); - } - - /* not implemented */ - elog(SmgrTrace, "[ZENITH_SMGR] prefetch noop"); - return true; -} - -/* - * zenith_writeback() -- Tell the kernel to write pages back to storage. - * - * This accepts a range of blocks because flushing several pages at once is - * considerably more efficient than doing so individually. 
- */ -void -zenith_writeback(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, BlockNumber nblocks) -{ - switch (reln->smgr_relpersistence) - { - case 0: - /* mdwriteback() does nothing if the file doesn't exist */ - mdwriteback(reln, forknum, blocknum, nblocks); - break; - - case RELPERSISTENCE_PERMANENT: - break; - - case RELPERSISTENCE_TEMP: - case RELPERSISTENCE_UNLOGGED: - mdwriteback(reln, forknum, blocknum, nblocks); - return; - - default: - elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); - } - - /* not implemented */ - elog(SmgrTrace, "[ZENITH_SMGR] writeback noop"); - -#ifdef DEBUG_COMPARE_LOCAL - if (IS_LOCAL_REL(reln)) - mdwriteback(reln, forknum, blocknum, nblocks); -#endif -} - -/* - * While function is defined in the zenith extension it's used within neon_test_utils directly. - * To avoid breaking tests in the runtime please keep function signature in sync. - */ -void zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, char *buffer) -{ - ZenithResponse *resp; - - { - ZenithGetPageRequest request = { - .req.tag = T_ZenithGetPageRequest, - .req.latest = request_latest, - .req.lsn = request_lsn, - .rnode = rnode, - .forknum = forkNum, - .blkno = blkno - }; - - resp = page_server->request((ZenithRequest *) &request); - } - - switch (resp->tag) - { - case T_ZenithGetPageResponse: - memcpy(buffer, ((ZenithGetPageResponse *) resp)->page, BLCKSZ); - break; - - case T_ZenithErrorResponse: - ereport(ERROR, - (errcode(ERRCODE_IO_ERROR), - errmsg("could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X", - blkno, - rnode.spcNode, - rnode.dbNode, - rnode.relNode, - forkNum, - (uint32) (request_lsn >> 32), (uint32) request_lsn), - errdetail("page server returned error: %s", - ((ZenithErrorResponse *) resp)->message))); - break; - - default: - elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); - } - - 
pfree(resp); -} - -/* - * zenith_read() -- Read the specified block from a relation. - */ -void -zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, - char *buffer) -{ - bool latest; - XLogRecPtr request_lsn; - - switch (reln->smgr_relpersistence) - { - case 0: - elog(ERROR, "cannot call smgrread() on rel with unknown persistence"); - - case RELPERSISTENCE_PERMANENT: - break; - - case RELPERSISTENCE_TEMP: - case RELPERSISTENCE_UNLOGGED: - mdread(reln, forkNum, blkno, buffer); - return; - - default: - elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); - } - - request_lsn = zenith_get_request_lsn(&latest); - zenith_read_at_lsn(reln->smgr_rnode.node, forkNum, blkno, request_lsn, latest, buffer); - -#ifdef DEBUG_COMPARE_LOCAL - if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) - { - char pageserver_masked[BLCKSZ]; - char mdbuf[BLCKSZ]; - char mdbuf_masked[BLCKSZ]; - - mdread(reln, forkNum, blkno, mdbuf); - - memcpy(pageserver_masked, buffer, BLCKSZ); - memcpy(mdbuf_masked, mdbuf, BLCKSZ); - - if (PageIsNew(mdbuf)) - { - if (!PageIsNew(pageserver_masked)) - { - elog(PANIC, "page is new in MD but not in Page Server at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", - blkno, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - forkNum, - (uint32) (request_lsn >> 32), (uint32) request_lsn, - hexdump_page(buffer)); - } - } - else if (PageIsNew(buffer)) - { - elog(PANIC, "page is new in Page Server but not in MD at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", - blkno, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - forkNum, - (uint32) (request_lsn >> 32), (uint32) request_lsn, - hexdump_page(mdbuf)); - } - else if (PageGetSpecialSize(mdbuf) == 0) - { - /* assume heap */ - RmgrTable[RM_HEAP_ID].rm_mask(mdbuf_masked, blkno); - RmgrTable[RM_HEAP_ID].rm_mask(pageserver_masked, blkno); - - if (memcmp(mdbuf_masked, 
pageserver_masked, BLCKSZ) != 0) - { - elog(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", - blkno, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - forkNum, - (uint32) (request_lsn >> 32), (uint32) request_lsn, - hexdump_page(mdbuf_masked), - hexdump_page(pageserver_masked)); - } - } - else if (PageGetSpecialSize(mdbuf) == MAXALIGN(sizeof(BTPageOpaqueData))) - { - if (((BTPageOpaqueData *) PageGetSpecialPointer(mdbuf))->btpo_cycleid < MAX_BT_CYCLE_ID) - { - /* assume btree */ - RmgrTable[RM_BTREE_ID].rm_mask(mdbuf_masked, blkno); - RmgrTable[RM_BTREE_ID].rm_mask(pageserver_masked, blkno); - - if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0) - { - elog(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", - blkno, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - forkNum, - (uint32) (request_lsn >> 32), (uint32) request_lsn, - hexdump_page(mdbuf_masked), - hexdump_page(pageserver_masked)); - } - } - } - } -#endif -} - -#ifdef DEBUG_COMPARE_LOCAL -static char * -hexdump_page(char *page) -{ - StringInfoData result; - - initStringInfo(&result); - - for (int i = 0; i < BLCKSZ; i++) - { - if (i % 8 == 0) - appendStringInfo(&result, " "); - if (i % 40 == 0) - appendStringInfo(&result, "\n"); - appendStringInfo(&result, "%02x", (unsigned char) (page[i])); - } - - return result.data; -} -#endif - -/* - * zenith_write() -- Write the supplied block at the appropriate location. - * - * This is to be used only for updating already-existing blocks of a - * relation (ie, those before the current EOF). To extend a relation, - * use mdextend(). 
- */ -void -zenith_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - char *buffer, bool skipFsync) -{ - XLogRecPtr lsn; - - switch (reln->smgr_relpersistence) - { - case 0: - /* This is a bit tricky. Check if the relation exists locally */ - if (mdexists(reln, forknum)) - { - /* It exists locally. Guess it's unlogged then. */ - mdwrite(reln, forknum, blocknum, buffer, skipFsync); - - /* - * We could set relpersistence now that we have determined - * that it's local. But we don't dare to do it, because that - * would immediately allow reads as well, which shouldn't - * happen. We could cache it with a different 'relpersistence' - * value, but this isn't performance critical. - */ - return; - } - break; - - case RELPERSISTENCE_PERMANENT: - break; - - case RELPERSISTENCE_TEMP: - case RELPERSISTENCE_UNLOGGED: - mdwrite(reln, forknum, blocknum, buffer, skipFsync); - return; - - default: - elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); - } - - zenith_wallog_page(reln, forknum, blocknum, buffer); - - lsn = PageGetLSN(buffer); - elog(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X", - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - forknum, blocknum, - (uint32) (lsn >> 32), (uint32) lsn); - -#ifdef DEBUG_COMPARE_LOCAL - if (IS_LOCAL_REL(reln)) - mdwrite(reln, forknum, blocknum, buffer, skipFsync); -#endif -} - -/* - * zenith_nblocks() -- Get the number of blocks stored in a relation. 
- */ -BlockNumber -zenith_nblocks(SMgrRelation reln, ForkNumber forknum) -{ - ZenithResponse *resp; - BlockNumber n_blocks; - bool latest; - XLogRecPtr request_lsn; - - switch (reln->smgr_relpersistence) - { - case 0: - elog(ERROR, "cannot call smgrnblocks() on rel with unknown persistence"); - break; - - case RELPERSISTENCE_PERMANENT: - break; - - case RELPERSISTENCE_TEMP: - case RELPERSISTENCE_UNLOGGED: - return mdnblocks(reln, forknum); - - default: - elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); - } - - if (get_cached_relsize(reln->smgr_rnode.node, forknum, &n_blocks)) - { - elog(SmgrTrace, "cached nblocks for %u/%u/%u.%u: %u blocks", - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - forknum, n_blocks); - return n_blocks; - } - - request_lsn = zenith_get_request_lsn(&latest); - { - ZenithNblocksRequest request = { - .req.tag = T_ZenithNblocksRequest, - .req.latest = latest, - .req.lsn = request_lsn, - .rnode = reln->smgr_rnode.node, - .forknum = forknum, - }; - - resp = page_server->request((ZenithRequest *) &request); - } - - switch (resp->tag) - { - case T_ZenithNblocksResponse: - n_blocks = ((ZenithNblocksResponse *) resp)->n_blocks; - break; - - case T_ZenithErrorResponse: - ereport(ERROR, - (errcode(ERRCODE_IO_ERROR), - errmsg("could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X", - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - forknum, - (uint32) (request_lsn >> 32), (uint32) request_lsn), - errdetail("page server returned error: %s", - ((ZenithErrorResponse *) resp)->message))); - break; - - default: - elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); - } - update_cached_relsize(reln->smgr_rnode.node, forknum, n_blocks); - - elog(SmgrTrace, "zenith_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks", - reln->smgr_rnode.node.spcNode, - 
reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - forknum, - (uint32) (request_lsn >> 32), (uint32) request_lsn, - n_blocks); - - pfree(resp); - return n_blocks; -} - -/* - * zenith_db_size() -- Get the size of the database in bytes. - */ -int64 -zenith_dbsize(Oid dbNode) -{ - ZenithResponse *resp; - int64 db_size; - XLogRecPtr request_lsn; - bool latest; - - request_lsn = zenith_get_request_lsn(&latest); - { - ZenithDbSizeRequest request = { - .req.tag = T_ZenithDbSizeRequest, - .req.latest = latest, - .req.lsn = request_lsn, - .dbNode = dbNode, - }; - - resp = page_server->request((ZenithRequest *) &request); - } - - switch (resp->tag) - { - case T_ZenithDbSizeResponse: - db_size = ((ZenithDbSizeResponse *) resp)->db_size; - break; - - case T_ZenithErrorResponse: - ereport(ERROR, - (errcode(ERRCODE_IO_ERROR), - errmsg("could not read db size of db %u from page server at lsn %X/%08X", - dbNode, - (uint32) (request_lsn >> 32), (uint32) request_lsn), - errdetail("page server returned error: %s", - ((ZenithErrorResponse *) resp)->message))); - break; - - default: - elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); - } - - elog(SmgrTrace, "zenith_dbsize: db %u (request LSN %X/%08X): %ld bytes", - dbNode, - (uint32) (request_lsn >> 32), (uint32) request_lsn, - db_size); - - pfree(resp); - return db_size; -} - -/* - * zenith_truncate() -- Truncate relation to specified number of blocks. 
- */ -void -zenith_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) -{ - XLogRecPtr lsn; - - switch (reln->smgr_relpersistence) - { - case 0: - elog(ERROR, "cannot call smgrtruncate() on rel with unknown persistence"); - break; - - case RELPERSISTENCE_PERMANENT: - break; - - case RELPERSISTENCE_TEMP: - case RELPERSISTENCE_UNLOGGED: - mdtruncate(reln, forknum, nblocks); - return; - - default: - elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); - } - - set_cached_relsize(reln->smgr_rnode.node, forknum, nblocks); - - /* - * Truncating a relation drops all its buffers from the buffer cache - * without calling smgrwrite() on them. But we must account for that in - * our tracking of last-written-LSN all the same: any future smgrnblocks() - * request must return the new size after the truncation. We don't know - * what the LSN of the truncation record was, so be conservative and use - * the most recently inserted WAL record's LSN. - */ - lsn = GetXLogInsertRecPtr(); - - lsn = zm_adjust_lsn(lsn); - - /* - * Flush it, too. We don't actually care about it here, but let's uphold - * the invariant that last-written LSN <= flush LSN. - */ - XLogFlush(lsn); - - SetLastWrittenPageLSN(lsn); - -#ifdef DEBUG_COMPARE_LOCAL - if (IS_LOCAL_REL(reln)) - mdtruncate(reln, forknum, nblocks); -#endif -} - -/* - * zenith_immedsync() -- Immediately sync a relation to stable storage. - * - * Note that only writes already issued are synced; this routine knows - * nothing of dirty buffers that may exist inside the buffer manager. We - * sync active and inactive segments; smgrDoPendingSyncs() relies on this. - * Consider a relation skipping WAL. Suppose a checkpoint syncs blocks of - * some segment, then mdtruncate() renders that segment inactive. If we - * crash before the next checkpoint syncs the newly-inactive segment, that - * segment may survive recovery, reintroducing unwanted data into the table. 
- */ -void -zenith_immedsync(SMgrRelation reln, ForkNumber forknum) -{ - switch (reln->smgr_relpersistence) - { - case 0: - elog(ERROR, "cannot call smgrimmedsync() on rel with unknown persistence"); - break; - - case RELPERSISTENCE_PERMANENT: - break; - - case RELPERSISTENCE_TEMP: - case RELPERSISTENCE_UNLOGGED: - mdimmedsync(reln, forknum); - return; - - default: - elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); - } - - elog(SmgrTrace, "[ZENITH_SMGR] immedsync noop"); - -#ifdef DEBUG_COMPARE_LOCAL - if (IS_LOCAL_REL(reln)) - mdimmedsync(reln, forknum); -#endif -} - -/* - * zenith_start_unlogged_build() -- Starting build operation on a rel. - * - * Some indexes are built in two phases, by first populating the table with - * regular inserts, using the shared buffer cache but skipping WAL-logging, - * and WAL-logging the whole relation after it's done. Zenith relies on the - * WAL to reconstruct pages, so we cannot use the page server in the - * first phase when the changes are not logged. - */ -static void -zenith_start_unlogged_build(SMgrRelation reln) -{ - /* - * Currently, there can be only one unlogged relation build operation in - * progress at a time. That's enough for the current usage. 
- */ - if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS) - elog(ERROR, "unlogged relation build is already in progress"); - Assert(unlogged_build_rel == NULL); - - ereport(SmgrTrace, - (errmsg("starting unlogged build of relation %u/%u/%u", - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode))); - - switch (reln->smgr_relpersistence) - { - case 0: - elog(ERROR, "cannot call smgr_start_unlogged_build() on rel with unknown persistence"); - break; - - case RELPERSISTENCE_PERMANENT: - break; - - case RELPERSISTENCE_TEMP: - case RELPERSISTENCE_UNLOGGED: - unlogged_build_rel = reln; - unlogged_build_phase = UNLOGGED_BUILD_NOT_PERMANENT; - return; - - default: - elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); - } - - if (smgrnblocks(reln, MAIN_FORKNUM) != 0) - elog(ERROR, "cannot perform unlogged index build, index is not empty "); - - unlogged_build_rel = reln; - unlogged_build_phase = UNLOGGED_BUILD_PHASE_1; - - /* Make the relation look like it's unlogged */ - reln->smgr_relpersistence = RELPERSISTENCE_UNLOGGED; - - /* - * FIXME: should we pass isRedo true to create the tablespace dir if it - * doesn't exist? Is it needed? - */ - mdcreate(reln, MAIN_FORKNUM, false); -} - -/* - * zenith_finish_unlogged_build_phase_1() - * - * Call this after you have finished populating a relation in unlogged mode, - * before you start WAL-logging it. 
- */ -static void -zenith_finish_unlogged_build_phase_1(SMgrRelation reln) -{ - Assert(unlogged_build_rel == reln); - - ereport(SmgrTrace, - (errmsg("finishing phase 1 of unlogged build of relation %u/%u/%u", - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode))); - - if (unlogged_build_phase == UNLOGGED_BUILD_NOT_PERMANENT) - return; - - Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_1); - Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED); - - unlogged_build_phase = UNLOGGED_BUILD_PHASE_2; -} - -/* - * zenith_end_unlogged_build() -- Finish an unlogged rel build. - * - * Call this after you have finished WAL-logging an relation that was - * first populated without WAL-logging. - * - * This removes the local copy of the rel, since it's now been fully - * WAL-logged and is present in the page server. - */ -static void -zenith_end_unlogged_build(SMgrRelation reln) -{ - Assert(unlogged_build_rel == reln); - - ereport(SmgrTrace, - (errmsg("ending unlogged build of relation %u/%u/%u", - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode))); - - if (unlogged_build_phase != UNLOGGED_BUILD_NOT_PERMANENT) - { - RelFileNodeBackend rnode; - - Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_2); - Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED); - - /* Make the relation look permanent again */ - reln->smgr_relpersistence = RELPERSISTENCE_PERMANENT; - - /* Remove local copy */ - rnode = reln->smgr_rnode; - for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++) - { - elog(SmgrTrace, "forgetting cached relsize for %u/%u/%u.%u", - rnode.node.spcNode, - rnode.node.dbNode, - rnode.node.relNode, - forknum); - - forget_cached_relsize(rnode.node, forknum); - mdclose(reln, forknum); - /* use isRedo == true, so that we drop it immediately */ - mdunlink(rnode, forknum, true); - } - } - - unlogged_build_rel = NULL; - unlogged_build_phase = 
UNLOGGED_BUILD_NOT_IN_PROGRESS; -} - -static void -AtEOXact_zenith(XactEvent event, void *arg) -{ - switch (event) - { - case XACT_EVENT_ABORT: - case XACT_EVENT_PARALLEL_ABORT: - - /* - * Forget about any build we might have had in progress. The local - * file will be unlinked by smgrDoPendingDeletes() - */ - unlogged_build_rel = NULL; - unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; - break; - - case XACT_EVENT_COMMIT: - case XACT_EVENT_PARALLEL_COMMIT: - case XACT_EVENT_PREPARE: - case XACT_EVENT_PRE_COMMIT: - case XACT_EVENT_PARALLEL_PRE_COMMIT: - case XACT_EVENT_PRE_PREPARE: - if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS) - { - unlogged_build_rel = NULL; - unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - (errmsg("unlogged index build was not properly finished")))); - } - break; - } -} - -static const struct f_smgr zenith_smgr = -{ - .smgr_init = zenith_init, - .smgr_shutdown = NULL, - .smgr_open = zenith_open, - .smgr_close = zenith_close, - .smgr_create = zenith_create, - .smgr_exists = zenith_exists, - .smgr_unlink = zenith_unlink, - .smgr_extend = zenith_extend, - .smgr_prefetch = zenith_prefetch, - .smgr_read = zenith_read, - .smgr_write = zenith_write, - .smgr_writeback = zenith_writeback, - .smgr_nblocks = zenith_nblocks, - .smgr_truncate = zenith_truncate, - .smgr_immedsync = zenith_immedsync, - - .smgr_start_unlogged_build = zenith_start_unlogged_build, - .smgr_finish_unlogged_build_phase_1 = zenith_finish_unlogged_build_phase_1, - .smgr_end_unlogged_build = zenith_end_unlogged_build, -}; - - -const f_smgr * -smgr_zenith(BackendId backend, RelFileNode rnode) -{ - - /* Don't use page server for temp relations */ - if (backend != InvalidBackendId) - return smgr_standard(backend, rnode); - else - return &zenith_smgr; -} - -void -smgr_init_zenith(void) -{ - RegisterXactCallback(AtEOXact_zenith, NULL); - - smgr_init_standard(); - zenith_init(); -} diff --git 
a/contrib/neon/relsize_cache.c b/contrib/neon/relsize_cache.c deleted file mode 100644 index 0ff221be22c..00000000000 --- a/contrib/neon/relsize_cache.c +++ /dev/null @@ -1,186 +0,0 @@ -/*------------------------------------------------------------------------- - * - * relsize_cache.c - * Relation size cache for better zentih performance. - * - * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group - * Portions Copyright (c) 1994, Regents of the University of California - * - * - * IDENTIFICATION - * contrib/neon/relsize_cache.c - * - *------------------------------------------------------------------------- - */ -#include "postgres.h" - -#include "pagestore_client.h" -#include "storage/relfilenode.h" -#include "storage/smgr.h" -#include "storage/lwlock.h" -#include "storage/ipc.h" -#include "storage/shmem.h" -#include "catalog/pg_tablespace_d.h" -#include "utils/dynahash.h" -#include "utils/guc.h" -#include "miscadmin.h" - - -typedef struct -{ - RelFileNode rnode; - ForkNumber forknum; -} RelTag; - -typedef struct -{ - RelTag tag; - BlockNumber size; -} RelSizeEntry; - -static HTAB *relsize_hash; -static LWLockId relsize_lock; -static int relsize_hash_size; -static shmem_startup_hook_type prev_shmem_startup_hook = NULL; -static shmem_request_hook_type prev_shmem_request_hook = NULL; - -/* - * Size of a cache entry is 20 bytes. So this default will take about 1.2 MB, - * which seems reasonable. 
- */ -#define DEFAULT_RELSIZE_HASH_SIZE (64 * 1024) - -static void -zenith_smgr_shmem_startup(void) -{ - static HASHCTL info; - - if (prev_shmem_startup_hook) - prev_shmem_startup_hook(); - - LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); - relsize_lock = (LWLockId) GetNamedLWLockTranche("neon_relsize"); - info.keysize = sizeof(RelTag); - info.entrysize = sizeof(RelSizeEntry); - relsize_hash = ShmemInitHash("neon_relsize", - relsize_hash_size, relsize_hash_size, - &info, - HASH_ELEM | HASH_BLOBS); - LWLockRelease(AddinShmemInitLock); -} - -bool -get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber *size) -{ - bool found = false; - - if (relsize_hash_size > 0) - { - RelTag tag; - RelSizeEntry *entry; - - tag.rnode = rnode; - tag.forknum = forknum; - LWLockAcquire(relsize_lock, LW_SHARED); - entry = hash_search(relsize_hash, &tag, HASH_FIND, NULL); - if (entry != NULL) - { - *size = entry->size; - found = true; - } - LWLockRelease(relsize_lock); - } - return found; -} - -void -set_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size) -{ - if (relsize_hash_size > 0) - { - RelTag tag; - RelSizeEntry *entry; - - tag.rnode = rnode; - tag.forknum = forknum; - LWLockAcquire(relsize_lock, LW_EXCLUSIVE); - entry = hash_search(relsize_hash, &tag, HASH_ENTER, NULL); - entry->size = size; - LWLockRelease(relsize_lock); - } -} - -void -update_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size) -{ - if (relsize_hash_size > 0) - { - RelTag tag; - RelSizeEntry *entry; - bool found; - - tag.rnode = rnode; - tag.forknum = forknum; - LWLockAcquire(relsize_lock, LW_EXCLUSIVE); - entry = hash_search(relsize_hash, &tag, HASH_ENTER, &found); - if (!found || entry->size < size) - entry->size = size; - LWLockRelease(relsize_lock); - } -} - -void -forget_cached_relsize(RelFileNode rnode, ForkNumber forknum) -{ - if (relsize_hash_size > 0) - { - RelTag tag; - - tag.rnode = rnode; - tag.forknum = forknum; - LWLockAcquire(relsize_lock, 
LW_EXCLUSIVE); - hash_search(relsize_hash, &tag, HASH_REMOVE, NULL); - LWLockRelease(relsize_lock); - } -} - -static void relsize_shmem_request(void); - -void -relsize_hash_init(void) -{ - DefineCustomIntVariable("neon.relsize_hash_size", - "Sets the maximum number of cached relation sizes for neon", - NULL, - &relsize_hash_size, - DEFAULT_RELSIZE_HASH_SIZE, - 0, - INT_MAX, - PGC_POSTMASTER, - 0, - NULL, NULL, NULL); - - if (relsize_hash_size > 0) - { - shmem_request_hook = relsize_shmem_request; - - prev_shmem_startup_hook = shmem_startup_hook; - shmem_startup_hook = zenith_smgr_shmem_startup; - } -} - - - -/* - * shmem_request hook: request additional shared resources. We'll allocate or - * attach to the shared resources in pgss_shmem_startup(). - */ -static void -relsize_shmem_request(void) -{ - if (prev_shmem_request_hook) - prev_shmem_request_hook(); - - RequestAddinShmemSpace(hash_estimate_size(relsize_hash_size, sizeof(RelSizeEntry))); - RequestNamedLWLockTranche("neon_relsize", 1); -} diff --git a/contrib/neon_test_utils/Makefile b/contrib/neon_test_utils/Makefile deleted file mode 100644 index bd618e6d96e..00000000000 --- a/contrib/neon_test_utils/Makefile +++ /dev/null @@ -1,25 +0,0 @@ -# contrib/neon_test_utils/Makefile - - -MODULE_big = neon_test_utils -OBJS = \ - $(WIN32RES) \ - neontest.o - -EXTENSION = neon_test_utils -DATA = neon_test_utils--1.0.sql -PGFILEDESC = "neon_test_utils - helpers for neon testing and debugging" - -EXTRA_INSTALL=contrib/neon - -ifdef USE_PGXS -PG_CONFIG = pg_config -PGXS := $(shell $(PG_CONFIG) --pgxs) -include $(PGXS) -else -PG_CPPFLAGS = -I$(top_srcdir)/contrib -subdir = contrib/neon_test_utils -top_builddir = ../.. 
-include $(top_builddir)/src/Makefile.global -include $(top_srcdir)/contrib/contrib-global.mk -endif diff --git a/contrib/neon_test_utils/neon_test_utils--1.0.sql b/contrib/neon_test_utils/neon_test_utils--1.0.sql deleted file mode 100644 index 402981a9a66..00000000000 --- a/contrib/neon_test_utils/neon_test_utils--1.0.sql +++ /dev/null @@ -1,29 +0,0 @@ --- complain if script is sourced in psql, rather than via CREATE EXTENSION -\echo Use "CREATE EXTENSION neon_test_utils" to load this file. \quit - -CREATE FUNCTION test_consume_xids(nxids int) -RETURNS VOID -AS 'MODULE_PATHNAME', 'test_consume_xids' -LANGUAGE C STRICT -PARALLEL UNSAFE; - -CREATE FUNCTION clear_buffer_cache() -RETURNS VOID -AS 'MODULE_PATHNAME', 'clear_buffer_cache' -LANGUAGE C STRICT -PARALLEL UNSAFE; - -CREATE FUNCTION get_raw_page_at_lsn(relname text, forkname text, blocknum int8, lsn pg_lsn) -RETURNS bytea -AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn' -LANGUAGE C PARALLEL UNSAFE; - -CREATE FUNCTION get_raw_page_at_lsn(tbspc oid, db oid, relfilenode oid, forknum int8, blocknum int8, lsn pg_lsn) -RETURNS bytea -AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn_ex' -LANGUAGE C PARALLEL UNSAFE; - -CREATE FUNCTION neon_xlogflush(lsn pg_lsn) -RETURNS VOID -AS 'MODULE_PATHNAME', 'neon_xlogflush' -LANGUAGE C PARALLEL UNSAFE; diff --git a/contrib/neon_test_utils/neon_test_utils.control b/contrib/neon_test_utils/neon_test_utils.control deleted file mode 100644 index 94e67205039..00000000000 --- a/contrib/neon_test_utils/neon_test_utils.control +++ /dev/null @@ -1,5 +0,0 @@ -# neon_test_utils extension -comment = 'helpers for neon testing and debugging' -default_version = '1.0' -module_pathname = '$libdir/neon_test_utils' -relocatable = true diff --git a/contrib/neon_test_utils/neontest.c b/contrib/neon_test_utils/neontest.c deleted file mode 100644 index a3e730efe27..00000000000 --- a/contrib/neon_test_utils/neontest.c +++ /dev/null @@ -1,304 +0,0 @@ 
-/*------------------------------------------------------------------------- - * - * neontest.c - * Helpers for neon testing and debugging - * - * IDENTIFICATION - * contrib/neon_test_utils/neontest.c - * - *------------------------------------------------------------------------- - */ -#include "postgres.h" - -#include "access/relation.h" -#include "access/xact.h" -#include "access/xlog.h" -#include "catalog/namespace.h" -#include "fmgr.h" -#include "funcapi.h" -#include "miscadmin.h" -#include "storage/buf_internals.h" -#include "storage/bufmgr.h" -#include "utils/builtins.h" -#include "utils/pg_lsn.h" -#include "utils/rel.h" -#include "utils/varlena.h" -#include "neon/pagestore_client.h" - -PG_MODULE_MAGIC; - -extern void _PG_init(void); - -PG_FUNCTION_INFO_V1(test_consume_xids); -PG_FUNCTION_INFO_V1(clear_buffer_cache); -PG_FUNCTION_INFO_V1(get_raw_page_at_lsn); -PG_FUNCTION_INFO_V1(get_raw_page_at_lsn_ex); -PG_FUNCTION_INFO_V1(neon_xlogflush); - -/* - * Linkage to functions in zenith module. - * The signature here would need to be updated whenever function parameters change in pagestore_smgr.c - */ -typedef void (*zenith_read_at_lsn_type)(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, char *buffer); - -static zenith_read_at_lsn_type zenith_read_at_lsn_ptr; - -/* - * Module initialize function: fetch function pointers for cross-module calls. - */ -void -_PG_init(void) -{ - /* Asserts verify that typedefs above match original declarations */ - AssertVariableIsOfType(&zenith_read_at_lsn, zenith_read_at_lsn_type); - zenith_read_at_lsn_ptr = (zenith_read_at_lsn_type) - load_external_function("$libdir/neon", "zenith_read_at_lsn", - true, NULL); -} - -#define zenith_read_at_lsn zenith_read_at_lsn_ptr - -/* - * test_consume_xids(int4), for rapidly consuming XIDs, to test wraparound. 
- */ -Datum -test_consume_xids(PG_FUNCTION_ARGS) -{ - int32 nxids = PG_GETARG_INT32(0); - TransactionId topxid; - FullTransactionId fullxid; - TransactionId xid; - TransactionId targetxid; - - /* make sure we have a top-XID first */ - topxid = GetTopTransactionId(); - - xid = ReadNextTransactionId(); - - targetxid = xid + nxids; - while (targetxid < FirstNormalTransactionId) - targetxid++; - - while (TransactionIdPrecedes(xid, targetxid)) - { - fullxid = GetNewTransactionId(true); - xid = XidFromFullTransactionId(fullxid); - elog(DEBUG1, "topxid: %u xid: %u", topxid, xid); - } - - PG_RETURN_VOID(); -} - -/* - * Flush the buffer cache, evicting all pages that are not currently pinned. - */ -Datum -clear_buffer_cache(PG_FUNCTION_ARGS) -{ - bool save_zenith_test_evict; - - /* - * Temporarily set the zenith_test_evict GUC, so that when we pin and - * unpin a buffer, the buffer is evicted. We use that hack to evict all - * buffers, as there is no explicit "evict this buffer" function in the - * buffer manager. - */ - save_zenith_test_evict = zenith_test_evict; - zenith_test_evict = true; - PG_TRY(); - { - /* Scan through all the buffers */ - for (int i = 0; i < NBuffers; i++) - { - BufferDesc *bufHdr; - uint32 buf_state; - Buffer bufferid; - bool isvalid; - RelFileNode rnode; - ForkNumber forknum; - BlockNumber blocknum; - - /* Peek into the buffer header to see what page it holds. */ - bufHdr = GetBufferDescriptor(i); - buf_state = LockBufHdr(bufHdr); - - if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID)) - isvalid = true; - else - isvalid = false; - bufferid = BufferDescriptorGetBuffer(bufHdr); - rnode = bufHdr->tag.rnode; - forknum = bufHdr->tag.forkNum; - blocknum = bufHdr->tag.blockNum; - - UnlockBufHdr(bufHdr, buf_state); - - /* - * Pin the buffer, and release it again. Because we have - * zenith_test_evict==true, this will evict the page from - * the buffer cache if no one else is holding a pin on it. 
- */ - if (isvalid) - { - if (ReadRecentBuffer(rnode, forknum, blocknum, bufferid)) - ReleaseBuffer(bufferid); - } - } - } - PG_FINALLY(); - { - /* restore the GUC */ - zenith_test_evict = save_zenith_test_evict; - } - PG_END_TRY(); - - PG_RETURN_VOID(); -} - - -/* - * Reads the page from page server without buffer cache - * usage mimics get_raw_page() in pageinspect, but offers reading versions at specific LSN - * NULL read lsn will result in reading the latest version. - * - * Note: reading latest version will result in waiting for latest changes to reach the page server, - * if this is undesirable, use pageinspect' get_raw_page that uses buffered access to the latest page - */ -Datum -get_raw_page_at_lsn(PG_FUNCTION_ARGS) -{ - bytea *raw_page; - ForkNumber forknum; - RangeVar *relrv; - Relation rel; - char *raw_page_data; - text *relname; - text *forkname; - uint32 blkno; - - bool request_latest = PG_ARGISNULL(3); - uint64 read_lsn = request_latest ? GetXLogInsertRecPtr() : PG_GETARG_INT64(3); - - if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2)) - PG_RETURN_NULL(); - - relname = PG_GETARG_TEXT_PP(0); - forkname = PG_GETARG_TEXT_PP(1); - blkno = PG_GETARG_UINT32(2); - - if (!superuser()) - ereport(ERROR, - (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), - errmsg("must be superuser to use raw page functions"))); - - relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname)); - rel = relation_openrv(relrv, AccessShareLock); - - /* Check that this relation has storage */ - if (rel->rd_rel->relkind == RELKIND_VIEW) - ereport(ERROR, - (errcode(ERRCODE_WRONG_OBJECT_TYPE), - errmsg("cannot get raw page from view \"%s\"", - RelationGetRelationName(rel)))); - if (rel->rd_rel->relkind == RELKIND_COMPOSITE_TYPE) - ereport(ERROR, - (errcode(ERRCODE_WRONG_OBJECT_TYPE), - errmsg("cannot get raw page from composite type \"%s\"", - RelationGetRelationName(rel)))); - if (rel->rd_rel->relkind == RELKIND_FOREIGN_TABLE) - ereport(ERROR, - 
(errcode(ERRCODE_WRONG_OBJECT_TYPE), - errmsg("cannot get raw page from foreign table \"%s\"", - RelationGetRelationName(rel)))); - if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) - ereport(ERROR, - (errcode(ERRCODE_WRONG_OBJECT_TYPE), - errmsg("cannot get raw page from partitioned table \"%s\"", - RelationGetRelationName(rel)))); - if (rel->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) - ereport(ERROR, - (errcode(ERRCODE_WRONG_OBJECT_TYPE), - errmsg("cannot get raw page from partitioned index \"%s\"", - RelationGetRelationName(rel)))); - - /* - * Reject attempts to read non-local temporary relations; we would be - * likely to get wrong data since we have no visibility into the owning - * session's local buffers. - */ - if (RELATION_IS_OTHER_TEMP(rel)) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot access temporary tables of other sessions"))); - - - forknum = forkname_to_number(text_to_cstring(forkname)); - - /* Initialize buffer to copy to */ - raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ); - SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); - raw_page_data = VARDATA(raw_page); - - zenith_read_at_lsn(rel->rd_node, forknum, blkno, read_lsn, request_latest, raw_page_data); - - relation_close(rel, AccessShareLock); - - PG_RETURN_BYTEA_P(raw_page); -} - -/* - * Another option to read a relation page from page server without cache - * this version doesn't validate input and allows reading blocks of dropped relations - * - * Note: reading latest version will result in waiting for latest changes to reach the page server, - * if this is undesirable, use pageinspect' get_raw_page that uses buffered access to the latest page - */ -Datum -get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS) -{ - char *raw_page_data; - - if (!superuser()) - ereport(ERROR, - (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), - errmsg("must be superuser to use raw page functions"))); - - if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2) || - PG_ARGISNULL(3) || PG_ARGISNULL(4)) 
- PG_RETURN_NULL(); - - { - RelFileNode rnode = { - .spcNode = PG_GETARG_OID(0), - .dbNode = PG_GETARG_OID(1), - .relNode = PG_GETARG_OID(2) - }; - - ForkNumber forknum = PG_GETARG_UINT32(3); - - uint32 blkno = PG_GETARG_UINT32(4); - bool request_latest = PG_ARGISNULL(5); - uint64 read_lsn = request_latest ? GetXLogInsertRecPtr() : PG_GETARG_INT64(5); - - - /* Initialize buffer to copy to */ - bytea *raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ); - SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); - raw_page_data = VARDATA(raw_page); - - zenith_read_at_lsn(rnode, forknum, blkno, read_lsn, request_latest, raw_page_data); - PG_RETURN_BYTEA_P(raw_page); - } -} - -/* - * Directly calls XLogFlush(lsn) to flush WAL buffers. - */ -Datum -neon_xlogflush(PG_FUNCTION_ARGS) -{ - XLogRecPtr lsn = PG_GETARG_LSN(0); - XLogFlush(lsn); - PG_RETURN_VOID(); -} diff --git a/src/Makefile b/src/Makefile index 2f32e3d5137..79e274a4769 100644 --- a/src/Makefile +++ b/src/Makefile @@ -22,7 +22,6 @@ SUBDIRS = \ include \ interfaces \ backend/replication/libpqwalreceiver \ - backend/replication/libpqwalproposer \ backend/replication/pgoutput \ fe_utils \ bin \ diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index fd36e0c5941..185c8f59b55 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -480,7 +480,7 @@ XLogInsert(RmgrId rmid, uint8 info) return EndPos; } - if (backpressure_lag() > 0) + if (delay_backend_us != NULL && delay_backend_us() > 0) { InterruptPending = true; } diff --git a/src/backend/main/main.c b/src/backend/main/main.c index a94f7614c00..37fa0548cff 100644 --- a/src/backend/main/main.c +++ b/src/backend/main/main.c @@ -39,7 +39,7 @@ #include "common/username.h" #include "port/atomics.h" #include "postmaster/postmaster.h" -#include "replication/walproposer.h" +#include "replication/walpropshim.h" #include "storage/spin.h" #include "tcop/tcopprot.h" #include "utils/help_config.h" diff 
--git a/src/backend/postmaster/bgworker.c b/src/backend/postmaster/bgworker.c index 6a644a27534..6afcc10e4fc 100644 --- a/src/backend/postmaster/bgworker.c +++ b/src/backend/postmaster/bgworker.c @@ -22,7 +22,7 @@ #include "postmaster/postmaster.h" #include "replication/logicallauncher.h" #include "replication/logicalworker.h" -#include "replication/walproposer.h" +#include "replication/walpropshim.h" #include "storage/dsm.h" #include "storage/ipc.h" #include "storage/latch.h" @@ -129,9 +129,6 @@ static const struct }, { "ApplyWorkerMain", ApplyWorkerMain - }, - { - "WalProposerMain", WalProposerMain } }; diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index b00ee2184a5..5933bc7ff49 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -119,7 +119,7 @@ #include "postmaster/syslogger.h" #include "replication/logicallauncher.h" #include "replication/walsender.h" -#include "replication/walproposer.h" +#include "replication/walpropshim.h" #include "storage/fd.h" #include "storage/ipc.h" #include "storage/pg_shmem.h" @@ -1021,11 +1021,6 @@ PostmasterMain(int argc, char *argv[]) */ ApplyLauncherRegister(); - /* - * Start WAL proposer bgworker is wal acceptors list is not empty - */ - WalProposerRegister(); - /* * process any libraries that should be preloaded at postmaster start */ diff --git a/src/backend/replication/Makefile b/src/backend/replication/Makefile index d35cb2114ca..938b1858127 100644 --- a/src/backend/replication/Makefile +++ b/src/backend/replication/Makefile @@ -23,8 +23,7 @@ OBJS = \ walreceiver.o \ walreceiverfuncs.o \ walsender.o \ - walproposer.o \ - walproposer_utils.o + walpropcompat.o SUBDIRS = logical diff --git a/src/backend/replication/libpqwalproposer/Makefile b/src/backend/replication/libpqwalproposer/Makefile deleted file mode 100644 index c570160536f..00000000000 --- a/src/backend/replication/libpqwalproposer/Makefile +++ /dev/null @@ -1,37 +0,0 @@ 
-#------------------------------------------------------------------------- -# -# Makefile-- -# Makefile for src/backend/replication/libpqwalproposer -# -# IDENTIFICATION -# src/backend/replication/libpqwalproposer/Makefile -# -#------------------------------------------------------------------------- - -subdir = src/backend/replication/libpqwalproposer -top_builddir = ../../../.. -include $(top_builddir)/src/Makefile.global - -override CPPFLAGS := -I$(srcdir) -I$(libpq_srcdir) $(CPPFLAGS) - -OBJS = \ - $(WIN32RES) \ - libpqwalproposer.o -SHLIB_LINK_INTERNAL = $(libpq) -SHLIB_LINK = $(filter -lintl, $(LIBS)) -SHLIB_PREREQS = submake-libpq -PGFILEDESC = "libpqwalproposer - libpq interface for WAL proposer" -NAME = libpqwalproposer - -all: all-shared-lib - -include $(top_srcdir)/src/Makefile.shlib - -install: all installdirs install-lib - -installdirs: installdirs-lib - -uninstall: uninstall-lib - -clean distclean maintainer-clean: clean-lib - rm -f $(OBJS) diff --git a/src/backend/replication/libpqwalproposer/libpqwalproposer.c b/src/backend/replication/libpqwalproposer/libpqwalproposer.c deleted file mode 100644 index a12a2ee04bc..00000000000 --- a/src/backend/replication/libpqwalproposer/libpqwalproposer.c +++ /dev/null @@ -1,416 +0,0 @@ -#include "postgres.h" - -#include "replication/walproposer.h" -#include "libpq-fe.h" - -/* Required for anything that's dynamically loaded */ -PG_MODULE_MAGIC; -void _PG_init(void); - -/* Header in walproposer.h -- Wrapper struct to abstract away the libpq connection */ -struct WalProposerConn -{ - PGconn* pg_conn; - bool is_nonblocking; /* whether the connection is non-blocking */ - char *recvbuf; /* last received data from libpqprop_async_read */ -}; - -/* Prototypes for exported functions */ -static char* libpqprop_error_message(WalProposerConn* conn); -static WalProposerConnStatusType libpqprop_status(WalProposerConn* conn); -static WalProposerConn* libpqprop_connect_start(char* conninfo); -static 
WalProposerConnectPollStatusType libpqprop_connect_poll(WalProposerConn* conn); -static bool libpqprop_send_query(WalProposerConn* conn, char* query); -static WalProposerExecStatusType libpqprop_get_query_result(WalProposerConn* conn); -static pgsocket libpqprop_socket(WalProposerConn* conn); -static int libpqprop_flush(WalProposerConn* conn); -static void libpqprop_finish(WalProposerConn* conn); -static PGAsyncReadResult libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount); -static PGAsyncWriteResult libpqprop_async_write(WalProposerConn* conn, void const* buf, size_t size); -static bool libpqprop_blocking_write(WalProposerConn* conn, void const* buf, size_t size); - -static WalProposerFunctionsType PQWalProposerFunctions = { - libpqprop_error_message, - libpqprop_status, - libpqprop_connect_start, - libpqprop_connect_poll, - libpqprop_send_query, - libpqprop_get_query_result, - libpqprop_socket, - libpqprop_flush, - libpqprop_finish, - libpqprop_async_read, - libpqprop_async_write, - libpqprop_blocking_write, -}; - -/* Module initialization */ -void -_PG_init(void) -{ - if (WalProposerFunctions != NULL) - elog(ERROR, "libpqwalproposer already loaded"); - WalProposerFunctions = &PQWalProposerFunctions; -} - -/* Helper function */ -static bool -ensure_nonblocking_status(WalProposerConn* conn, bool is_nonblocking) -{ - /* If we're already correctly blocking or nonblocking, all good */ - if (is_nonblocking == conn->is_nonblocking) - return true; - - /* Otherwise, set it appropriately */ - if (PQsetnonblocking(conn->pg_conn, is_nonblocking) == -1) - return false; - - conn->is_nonblocking = is_nonblocking; - return true; -} - -/* Exported function definitions */ -static char* -libpqprop_error_message(WalProposerConn* conn) -{ - return PQerrorMessage(conn->pg_conn); -} - -static WalProposerConnStatusType -libpqprop_status(WalProposerConn* conn) -{ - switch (PQstatus(conn->pg_conn)) - { - case CONNECTION_OK: - return WP_CONNECTION_OK; - case 
CONNECTION_BAD: - return WP_CONNECTION_BAD; - default: - return WP_CONNECTION_IN_PROGRESS; - } -} - -static WalProposerConn* -libpqprop_connect_start(char* conninfo) -{ - WalProposerConn* conn; - PGconn* pg_conn; - - pg_conn = PQconnectStart(conninfo); - /* - * Allocation of a PQconn can fail, and will return NULL. We want to fully replicate the - * behavior of PQconnectStart here. - */ - if (!pg_conn) - return NULL; - - /* - * And in theory this allocation can fail as well, but it's incredibly unlikely if we just - * successfully allocated a PGconn. - * - * palloc will exit on failure though, so there's not much we could do if it *did* fail. - */ - conn = palloc(sizeof(WalProposerConn)); - conn->pg_conn = pg_conn; - conn->is_nonblocking = false; /* connections always start in blocking mode */ - conn->recvbuf = NULL; - return conn; -} - -static WalProposerConnectPollStatusType -libpqprop_connect_poll(WalProposerConn* conn) -{ - WalProposerConnectPollStatusType return_val; - - switch (PQconnectPoll(conn->pg_conn)) - { - case PGRES_POLLING_FAILED: - return_val = WP_CONN_POLLING_FAILED; - break; - case PGRES_POLLING_READING: - return_val = WP_CONN_POLLING_READING; - break; - case PGRES_POLLING_WRITING: - return_val = WP_CONN_POLLING_WRITING; - break; - case PGRES_POLLING_OK: - return_val = WP_CONN_POLLING_OK; - break; - - /* There's a comment at its source about this constant being unused. We'll expect it's never - * returned. 
*/ - case PGRES_POLLING_ACTIVE: - elog(FATAL, "Unexpected PGRES_POLLING_ACTIVE returned from PQconnectPoll"); - /* This return is never actually reached, but it's here to make the compiler happy */ - return WP_CONN_POLLING_FAILED; - - default: - Assert(false); - return_val = WP_CONN_POLLING_FAILED; /* keep the compiler quiet */ - } - - return return_val; -} - -static bool -libpqprop_send_query(WalProposerConn* conn, char* query) -{ - /* We need to be in blocking mode for sending the query to run without - * requiring a call to PQflush */ - if (!ensure_nonblocking_status(conn, false)) - return false; - - /* PQsendQuery returns 1 on success, 0 on failure */ - if (!PQsendQuery(conn->pg_conn, query)) - return false; - - return true; -} - -static WalProposerExecStatusType -libpqprop_get_query_result(WalProposerConn* conn) -{ - PGresult* result; - WalProposerExecStatusType return_val; - - /* Marker variable if we need to log an unexpected success result */ - char* unexpected_success = NULL; - - /* Consume any input that we might be missing */ - if (!PQconsumeInput(conn->pg_conn)) - return WP_EXEC_FAILED; - - if (PQisBusy(conn->pg_conn)) - return WP_EXEC_NEEDS_INPUT; - - - result = PQgetResult(conn->pg_conn); - /* PQgetResult returns NULL only if getting the result was successful & there's no more of the - * result to get. 
*/ - if (!result) - { - elog(WARNING, "[libpqwalproposer] Unexpected successful end of command results"); - return WP_EXEC_UNEXPECTED_SUCCESS; - } - - /* Helper macro to reduce boilerplate */ - #define UNEXPECTED_SUCCESS(msg) \ - return_val = WP_EXEC_UNEXPECTED_SUCCESS; \ - unexpected_success = msg; \ - break; - - - switch (PQresultStatus(result)) - { - /* "true" success case */ - case PGRES_COPY_BOTH: - return_val = WP_EXEC_SUCCESS_COPYBOTH; - break; - - /* Unexpected success case */ - case PGRES_EMPTY_QUERY: - UNEXPECTED_SUCCESS("empty query return"); - case PGRES_COMMAND_OK: - UNEXPECTED_SUCCESS("data-less command end"); - case PGRES_TUPLES_OK: - UNEXPECTED_SUCCESS("tuples return"); - case PGRES_COPY_OUT: - UNEXPECTED_SUCCESS("'Copy Out' response"); - case PGRES_COPY_IN: - UNEXPECTED_SUCCESS("'Copy In' response"); - case PGRES_SINGLE_TUPLE: - UNEXPECTED_SUCCESS("single tuple return"); - case PGRES_PIPELINE_SYNC: - UNEXPECTED_SUCCESS("pipeline sync point"); - - /* Failure cases */ - case PGRES_BAD_RESPONSE: - case PGRES_NONFATAL_ERROR: - case PGRES_FATAL_ERROR: - case PGRES_PIPELINE_ABORTED: - return_val = WP_EXEC_FAILED; - break; - - default: - Assert(false); - return_val = WP_EXEC_FAILED; /* keep the compiler quiet */ - } - - if (unexpected_success) - elog(WARNING, "[libpqwalproposer] Unexpected successful %s", unexpected_success); - - return return_val; -} - -static pgsocket -libpqprop_socket(WalProposerConn* conn) -{ - return PQsocket(conn->pg_conn); -} - -static int -libpqprop_flush(WalProposerConn* conn) -{ - return (PQflush(conn->pg_conn)); -} - -static void -libpqprop_finish(WalProposerConn* conn) -{ - if (conn->recvbuf != NULL) - PQfreemem(conn->recvbuf); - PQfinish(conn->pg_conn); - pfree(conn); -} - -/* - * Receive a message from the safekeeper. - * - * On success, the data is placed in *buf. It is valid until the next call - * to this function. 
- */ -static PGAsyncReadResult -libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount) -{ - int result; - - if (conn->recvbuf != NULL) - { - PQfreemem(conn->recvbuf); - conn->recvbuf = NULL; - } - - /* Call PQconsumeInput so that we have the data we need */ - if (!PQconsumeInput(conn->pg_conn)) - { - *amount = 0; - *buf = NULL; - return PG_ASYNC_READ_FAIL; - } - - /* The docs for PQgetCopyData list the return values as: - * 0 if the copy is still in progress, but no "complete row" is - * available - * -1 if the copy is done - * -2 if an error occured - * (> 0) if it was successful; that value is the amount transferred. - * - * The protocol we use between walproposer and safekeeper means that we - * *usually* wouldn't expect to see that the copy is done, but this can - * sometimes be triggered by the server returning an ErrorResponse (which - * also happens to have the effect that the copy is done). - */ - switch (result = PQgetCopyData(conn->pg_conn, &conn->recvbuf, true)) - { - case 0: - *amount = 0; - *buf = NULL; - return PG_ASYNC_READ_TRY_AGAIN; - case -1: - { - /* - * If we get -1, it's probably because of a server error; the - * safekeeper won't normally send a CopyDone message. 
- * - * We can check PQgetResult to make sure that the server failed; - * it'll always result in PGRES_FATAL_ERROR - */ - ExecStatusType status = PQresultStatus(PQgetResult(conn->pg_conn)); - - if (status != PGRES_FATAL_ERROR) - elog(FATAL, "unexpected result status %d after failed PQgetCopyData", status); - - /* If there was actually an error, it'll be properly reported by - * calls to PQerrorMessage -- we don't have to do anything else */ - *amount = 0; - *buf = NULL; - return PG_ASYNC_READ_FAIL; - } - case -2: - *amount = 0; - *buf = NULL; - return PG_ASYNC_READ_FAIL; - default: - /* Positive values indicate the size of the returned result */ - *amount = result; - *buf = conn->recvbuf; - return PG_ASYNC_READ_SUCCESS; - } -} - -static PGAsyncWriteResult -libpqprop_async_write(WalProposerConn* conn, void const* buf, size_t size) -{ - int result; - - /* If we aren't in non-blocking mode, switch to it. */ - if (!ensure_nonblocking_status(conn, true)) - return PG_ASYNC_WRITE_FAIL; - - /* The docs for PQputcopyData list the return values as: - * 1 if the data was queued, - * 0 if it was not queued because of full buffers, or - * -1 if an error occured - */ - result = PQputCopyData(conn->pg_conn, buf, size); - - /* We won't get a result of zero because walproposer always empties the - * connection's buffers before sending more */ - Assert(result != 0); - - switch (result) - { - case 1: - /* good -- continue */ - break; - case -1: - return PG_ASYNC_WRITE_FAIL; - default: - elog(FATAL, "invalid return %d from PQputCopyData", result); - } - - /* After queueing the data, we still need to flush to get it to send. - * This might take multiple tries, but we don't want to wait around - * until it's done. 
- * - * PQflush has the following returns (directly quoting the docs): - * 0 if sucessful, - * 1 if it was unable to send all the data in the send queue yet - * -1 if it failed for some reason - */ - switch (result = PQflush(conn->pg_conn)) { - case 0: - return PG_ASYNC_WRITE_SUCCESS; - case 1: - return PG_ASYNC_WRITE_TRY_FLUSH; - case -1: - return PG_ASYNC_WRITE_FAIL; - default: - elog(FATAL, "invalid return %d from PQflush", result); - } -} - -static bool -libpqprop_blocking_write(WalProposerConn* conn, void const* buf, size_t size) -{ - int result; - - /* If we are in non-blocking mode, switch out of it. */ - if (!ensure_nonblocking_status(conn, false)) - return false; - - /* Ths function is very similar to libpqprop_async_write. For more - * information, refer to the comments there */ - if ((result = PQputCopyData(conn->pg_conn, buf, size)) == -1) - return false; - - Assert(result == 1); - - /* Because the connection is non-blocking, flushing returns 0 or -1 */ - - if ((result = PQflush(conn->pg_conn)) == -1) - return false; - - Assert(result == 0); - return true; -} diff --git a/src/backend/replication/walpropcompat.c b/src/backend/replication/walpropcompat.c new file mode 100644 index 00000000000..49711c45b4c --- /dev/null +++ b/src/backend/replication/walpropcompat.c @@ -0,0 +1,95 @@ +#include "postgres.h" + +#include +#include +#include + +#include "access/xlog.h" +#include "access/xlog_internal.h" +#include "access/xlogdefs.h" +#include "miscadmin.h" +#include "postmaster/bgworker.h" +#include "postmaster/postmaster.h" +#include "storage/fd.h" +#include "utils/guc.h" +#include "replication/walpropshim.h" + +bool syncSafekeepers = false; +void (*WalProposerInit) (XLogRecPtr flushRecPtr, uint64 systemId) = NULL; +void (*WalProposerStart) (void) = NULL; + +/* + * Entry point for `postgres --sync-safekeepers`. 
+ */ +void +WalProposerSync(int argc, char *argv[]) +{ + struct stat stat_buf; + + syncSafekeepers = true; + + InitStandaloneProcess(argv[0]); + + SetProcessingMode(InitProcessing); + + /* + * Set default values for command-line options. + */ + InitializeGUCOptions(); + + /* Acquire configuration parameters */ + if (!SelectConfigFiles(NULL, progname)) + exit(1); + + /* + * Imitate we are early in bootstrap loading shared_preload_libraries; + * zenith extension sets PGC_POSTMASTER gucs requiring this. + */ + process_shared_preload_libraries_in_progress = true; + + /* + * Initialize postmaster_alive_fds as WaitEventSet checks them. + * + * Copied from InitPostmasterDeathWatchHandle() + */ + if (pipe(postmaster_alive_fds) < 0) + ereport(FATAL, + (errcode_for_file_access(), + errmsg_internal("could not create pipe to monitor postmaster death: %m"))); + if (fcntl(postmaster_alive_fds[POSTMASTER_FD_WATCH], F_SETFL, O_NONBLOCK) == -1) + ereport(FATAL, + (errcode_for_socket_access(), + errmsg_internal("could not set postmaster death monitoring pipe to nonblocking mode: %m"))); + + ChangeToDataDir(); + + /* Create pg_wal directory, if it doesn't exist */ + if (stat(XLOGDIR, &stat_buf) != 0) + { + ereport(LOG, (errmsg("creating missing WAL directory \"%s\"", XLOGDIR))); + if (MakePGDirectory(XLOGDIR) < 0) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create directory \"%s\": %m", + XLOGDIR))); + exit(1); + } + } + + load_file("neon", false); + + if (NULL == WalProposerInit) + elog(ERROR, "Neon failed to register WalProposerInit"); + + if (NULL == WalProposerStart) + elog(ERROR, "Neon failed to register WalProposerStart"); + + WalProposerInit(0, 0); + + process_shared_preload_libraries_in_progress = false; + + BackgroundWorkerUnblockSignals(); + + WalProposerStart(); +} diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c deleted file mode 100644 index a72f63509bf..00000000000 --- 
a/src/backend/replication/walproposer.c +++ /dev/null @@ -1,2357 +0,0 @@ -/*------------------------------------------------------------------------- - * - * walproposer.c - * - * Proposer/leader part of the total order broadcast protocol between postgres - * and WAL safekeepers. - * - * We have two ways of launching WalProposer: - * - * 1. As a background worker which will run physical WalSender with - * am_wal_proposer flag set to true. WalSender in turn would handle WAL - * reading part and call WalProposer when ready to scatter WAL. - * - * 2. As a standalone utility by running `postgres --sync-safekeepers`. That - * is needed to create LSN from which it is safe to start postgres. More - * specifically it addresses following problems: - * - * a) Chicken-or-the-egg problem: compute postgres needs data directory - * with non-rel files that are downloaded from pageserver by calling - * basebackup@LSN. This LSN is not arbitrary, it must include all - * previously committed transactions and defined through consensus - * voting, which happens... in walproposer, a part of compute node. - * - * b) Just warranting such LSN is not enough, we must also actually commit - * it and make sure there is a safekeeper who knows this LSN is - * committed so WAL before it can be streamed to pageserver -- otherwise - * basebackup will hang waiting for WAL. Advancing commit_lsn without - * playing consensus game is impossible, so speculative 'let's just poll - * safekeepers, learn start LSN of future epoch and run basebackup' - * won't work. 
- * - *------------------------------------------------------------------------- - */ -#include "postgres.h" - -#include -#include -#include -#include "access/xlogdefs.h" -#include "access/xlogutils.h" -#include "access/xlogrecovery.h" -#include "replication/walproposer.h" -#include "storage/latch.h" -#include "storage/fd.h" -#include "miscadmin.h" -#include "pgstat.h" -#include "access/xlog.h" -#include "libpq/pqformat.h" -#include "replication/slot.h" -#include "replication/walreceiver.h" -#include "postmaster/bgworker.h" -#include "postmaster/interrupt.h" -#include "postmaster/postmaster.h" -#include "storage/pmsignal.h" -#include "storage/proc.h" -#include "tcop/tcopprot.h" -#include "utils/builtins.h" -#include "utils/memutils.h" -#include "utils/timestamp.h" - - -char *wal_acceptors_list; -int wal_acceptor_reconnect_timeout; -int wal_acceptor_connect_timeout; -bool am_wal_proposer; - -char *zenith_timeline_walproposer = NULL; -char *zenith_tenant_walproposer = NULL; -char *zenith_pageserver_connstring_walproposer = NULL; - -/* Declared in walproposer.h, defined here, initialized in libpqwalproposer.c */ -WalProposerFunctionsType *WalProposerFunctions = NULL; - -#define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot" - -static int n_safekeepers = 0; -static int quorum = 0; -static Safekeeper safekeeper[MAX_SAFEKEEPERS]; -static XLogRecPtr availableLsn; /* WAL has been generated up to this point */ -static XLogRecPtr lastSentCommitLsn; /* last commitLsn broadcast to safekeepers */ -static ProposerGreeting greetRequest; -static VoteRequest voteRequest; /* Vote request for safekeeper */ -static WaitEventSet *waitEvents; -static AppendResponse quorumFeedback; -/* - * Minimal LSN which may be needed for recovery of some safekeeper, - * record-aligned (first record which might not yet received by someone). - */ -static XLogRecPtr truncateLsn; -/* - * Term of the proposer. 
We want our term to be highest and unique, - * so we collect terms from safekeepers quorum, choose max and +1. - * After that our term is fixed and must not change. If we observe - * that some safekeeper has higher term, it means that we have another - * running compute, so we must stop immediately. - */ -static term_t propTerm; -static TermHistory propTermHistory; /* term history of the proposer */ -static XLogRecPtr propEpochStartLsn; /* epoch start lsn of the proposer */ -static term_t donorEpoch; /* Most advanced acceptor epoch */ -static int donor; /* Most advanced acceptor */ -static XLogRecPtr timelineStartLsn; /* timeline globally starts at this LSN */ -static int n_votes = 0; -static int n_connected = 0; -static TimestampTz last_reconnect_attempt; - -/* Set to true only in standalone run of `postgres --sync-safekeepers` (see comment on top) */ -static bool syncSafekeepers; - -static WalproposerShmemState *walprop_shared; - -/* Prototypes for private functions */ -static void WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId, TimeLineID tli); -static void WalProposerStart(void); -static void WalProposerLoop(void); -static void InitEventSet(void); -static void UpdateEventSet(Safekeeper *sk, uint32 events); -static void HackyRemoveWalProposerEvent(Safekeeper *to_remove); -static void ShutdownConnection(Safekeeper *sk); -static void ResetConnection(Safekeeper *sk); -static long TimeToReconnect(TimestampTz now); -static void ReconnectSafekeepers(void); -static void AdvancePollState(Safekeeper *sk, uint32 events); -static void HandleConnectionEvent(Safekeeper *sk); -static void SendStartWALPush(Safekeeper *sk); -static void RecvStartWALPushResult(Safekeeper *sk); -static void SendProposerGreeting(Safekeeper *sk); -static void RecvAcceptorGreeting(Safekeeper *sk); -static void SendVoteRequest(Safekeeper *sk); -static void RecvVoteResponse(Safekeeper *sk); -static void HandleElectedProposer(void); -static term_t GetHighestTerm(TermHistory *th); -static 
term_t GetEpoch(Safekeeper *sk); -static void DetermineEpochStartLsn(void); -static bool WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos); -static void SendProposerElected(Safekeeper *sk); -static void WalProposerStartStreaming(XLogRecPtr startpos); -static void StartStreaming(Safekeeper *sk); -static void SendMessageToNode(Safekeeper *sk); -static void BroadcastAppendRequest(void); -static void HandleActiveState(Safekeeper *sk, uint32 events); -static bool SendAppendRequests(Safekeeper *sk); -static bool RecvAppendResponses(Safekeeper *sk); -static void CombineHotStanbyFeedbacks(HotStandbyFeedback * hs); -static XLogRecPtr CalculateMinFlushLsn(void); -static XLogRecPtr GetAcknowledgedByQuorumWALPosition(void); -static void HandleSafekeeperResponse(void); -static bool AsyncRead(Safekeeper *sk, char **buf, int *buf_size); -static bool AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg); -static bool BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState success_state); -static bool AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_state); -static bool AsyncFlush(Safekeeper *sk); - - -/* - * WAL proposer bgworker entry point. - */ -void -WalProposerMain(Datum main_arg) -{ - TimeLineID tli; - - /* Establish signal handlers. 
*/ - pqsignal(SIGUSR1, procsignal_sigusr1_handler); - pqsignal(SIGHUP, SignalHandlerForConfigReload); - pqsignal(SIGTERM, die); - - BackgroundWorkerUnblockSignals(); - - GetXLogReplayRecPtr(&tli); - - WalProposerInit(GetFlushRecPtr(NULL), GetSystemIdentifier(), tli); - - last_reconnect_attempt = GetCurrentTimestamp(); - - application_name = (char *) "walproposer"; /* for - * synchronous_standby_names */ - am_wal_proposer = true; - am_walsender = true; - InitWalSender(); - InitProcessPhase2(); - - /* Create replication slot for WAL proposer if not exists */ - if (SearchNamedReplicationSlot(WAL_PROPOSER_SLOT_NAME, false) == NULL) - { - ReplicationSlotCreate(WAL_PROPOSER_SLOT_NAME, false, RS_PERSISTENT, false); - ReplicationSlotReserveWal(); - /* Write this slot to disk */ - ReplicationSlotMarkDirty(); - ReplicationSlotSave(); - ReplicationSlotRelease(); - } - - WalProposerStart(); -} - -/* - * Entry point for `postgres --sync-safekeepers`. - */ -void -WalProposerSync(int argc, char *argv[]) -{ - struct stat stat_buf; - // FIXME Write a comment, why this hardcoded value is safe - TimeLineID tli = 1; - - syncSafekeepers = true; - - InitStandaloneProcess(argv[0]); - - SetProcessingMode(InitProcessing); - - /* - * Set default values for command-line options. - */ - InitializeGUCOptions(); - - /* Acquire configuration parameters */ - if (!SelectConfigFiles(NULL, progname)) - exit(1); - - /* - * Imitate we are early in bootstrap loading shared_preload_libraries; - * zenith extension sets PGC_POSTMASTER gucs requiring this. - */ - process_shared_preload_libraries_in_progress = true; - - /* - * Initialize postmaster_alive_fds as WaitEventSet checks them. 
- * - * Copied from InitPostmasterDeathWatchHandle() - */ - if (pipe(postmaster_alive_fds) < 0) - ereport(FATAL, - (errcode_for_file_access(), - errmsg_internal("could not create pipe to monitor postmaster death: %m"))); - if (fcntl(postmaster_alive_fds[POSTMASTER_FD_WATCH], F_SETFL, O_NONBLOCK) == -1) - ereport(FATAL, - (errcode_for_socket_access(), - errmsg_internal("could not set postmaster death monitoring pipe to nonblocking mode: %m"))); - - ChangeToDataDir(); - - /* Create pg_wal directory, if it doesn't exist */ - if (stat(XLOGDIR, &stat_buf) != 0) - { - ereport(LOG, (errmsg("creating missing WAL directory \"%s\"", XLOGDIR))); - if (MakePGDirectory(XLOGDIR) < 0) - { - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not create directory \"%s\": %m", - XLOGDIR))); - exit(1); - } - } - - WalProposerInit(0, 0, tli); - - process_shared_preload_libraries_in_progress = false; - - BackgroundWorkerUnblockSignals(); - - WalProposerStart(); -} - -/* - * Create new AppendRequest message and start sending it. This function is - * called from walsender every time the new WAL is available. - */ -void -WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos) -{ - Assert(startpos == availableLsn && endpos >= availableLsn); - availableLsn = endpos; - BroadcastAppendRequest(); -} - -/* - * Advance the WAL proposer state machine, waiting each time for events to occur. - * Will exit only when latch is set, i.e. new WAL should be pushed from walsender - * to walproposer. - */ -void -WalProposerPoll(void) -{ - while (true) - { - Safekeeper *sk; - int rc; - WaitEvent event; - TimestampTz now = GetCurrentTimestamp(); - - rc = WaitEventSetWait(waitEvents, TimeToReconnect(now), - &event, 1, WAIT_EVENT_WAL_SENDER_MAIN); - sk = (Safekeeper *) event.user_data; - - /* - * If the event contains something that one of our safekeeper states - * was waiting for, we'll advance its state. 
- */ - if (rc != 0 && (event.events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))) - AdvancePollState(sk, event.events); - - /* - * If the timeout expired, attempt to reconnect to any safekeepers that - * we dropped - */ - ReconnectSafekeepers(); - - /* - * If wait is terminated by latch set (walsenders' latch is set on - * each wal flush), then exit loop. (no need for pm death check due to - * WL_EXIT_ON_PM_DEATH) - */ - if (rc != 0 && (event.events & WL_LATCH_SET)) - { - ResetLatch(MyLatch); - break; - } - if (rc == 0) /* timeout expired: poll state */ - { - TimestampTz now; - - /* - * If no WAL was generated during timeout (and we have already - * collected the quorum), then send pool message - */ - if (availableLsn != InvalidXLogRecPtr) - { - BroadcastAppendRequest(); - } - - /* - * Abandon connection attempts which take too long. - */ - now = GetCurrentTimestamp(); - for (int i = 0; i < n_safekeepers; i++) - { - Safekeeper *sk = &safekeeper[i]; - - if ((sk->state == SS_CONNECTING_WRITE || - sk->state == SS_CONNECTING_READ) && - TimestampDifferenceExceeds(sk->startedConnAt, now, - wal_acceptor_connect_timeout)) - { - elog(WARNING, "failed to connect to node '%s:%s': exceeded connection timeout %dms", - sk->host, sk->port, wal_acceptor_connect_timeout); - ShutdownConnection(sk); - } - } - } - } -} - -/* - * Register a background worker proposing WAL to wal acceptors. 
- */ -void -WalProposerRegister(void) -{ - BackgroundWorker bgw; - - if (*wal_acceptors_list == '\0') - return; - - memset(&bgw, 0, sizeof(bgw)); - bgw.bgw_flags = BGWORKER_SHMEM_ACCESS; - bgw.bgw_start_time = BgWorkerStart_RecoveryFinished; - snprintf(bgw.bgw_library_name, BGW_MAXLEN, "postgres"); - snprintf(bgw.bgw_function_name, BGW_MAXLEN, "WalProposerMain"); - snprintf(bgw.bgw_name, BGW_MAXLEN, "WAL proposer"); - snprintf(bgw.bgw_type, BGW_MAXLEN, "WAL proposer"); - bgw.bgw_restart_time = 5; - bgw.bgw_notify_pid = 0; - bgw.bgw_main_arg = (Datum) 0; - - RegisterBackgroundWorker(&bgw); -} - -static void -WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId, TimeLineID tli) -{ - char *host; - char *sep; - char *port; - - /* Load the libpq-specific functions */ - load_file("libpqwalproposer", false); - if (WalProposerFunctions == NULL) - elog(ERROR, "libpqwalproposer didn't initialize correctly"); - - load_file("libpqwalreceiver", false); - if (WalReceiverFunctions == NULL) - elog(ERROR, "libpqwalreceiver didn't initialize correctly"); - load_file("neon", false); - - for (host = wal_acceptors_list; host != NULL && *host != '\0'; host = sep) - { - port = strchr(host, ':'); - if (port == NULL) - { - elog(FATAL, "port is not specified"); - } - *port++ = '\0'; - sep = strchr(port, ','); - if (sep != NULL) - *sep++ = '\0'; - if (n_safekeepers + 1 >= MAX_SAFEKEEPERS) - { - elog(FATAL, "Too many safekeepers"); - } - safekeeper[n_safekeepers].host = host; - safekeeper[n_safekeepers].port = port; - safekeeper[n_safekeepers].state = SS_OFFLINE; - safekeeper[n_safekeepers].conn = NULL; - - /* - * Set conninfo to empty. 
We'll fill it out once later, in - * `ResetConnection` as needed - */ - safekeeper[n_safekeepers].conninfo[0] = '\0'; - initStringInfo(&safekeeper[n_safekeepers].outbuf); - safekeeper[n_safekeepers].xlogreader = XLogReaderAllocate(wal_segment_size, NULL, XL_ROUTINE(.segment_open = wal_segment_open, .segment_close = wal_segment_close), NULL); - if (safekeeper[n_safekeepers].xlogreader == NULL) - elog(FATAL, "Failed to allocate xlog reader"); - safekeeper[n_safekeepers].flushWrite = false; - safekeeper[n_safekeepers].startStreamingAt = InvalidXLogRecPtr; - safekeeper[n_safekeepers].streamingAt = InvalidXLogRecPtr; - n_safekeepers += 1; - } - if (n_safekeepers < 1) - { - elog(FATAL, "Safekeepers addresses are not specified"); - } - quorum = n_safekeepers / 2 + 1; - - /* Fill the greeting package */ - greetRequest.tag = 'g'; - greetRequest.protocolVersion = SK_PROTOCOL_VERSION; - greetRequest.pgVersion = PG_VERSION_NUM; - pg_strong_random(&greetRequest.proposerId, sizeof(greetRequest.proposerId)); - greetRequest.systemId = systemId; - if (!zenith_timeline_walproposer) - elog(FATAL, "neon.timeline_id is not provided"); - if (*zenith_timeline_walproposer != '\0' && - !HexDecodeString(greetRequest.ztimelineid, zenith_timeline_walproposer, 16)) - elog(FATAL, "Could not parse neon.timeline_id, %s", zenith_timeline_walproposer); - if (!zenith_tenant_walproposer) - elog(FATAL, "neon.tenant_id is not provided"); - if (*zenith_tenant_walproposer != '\0' && - !HexDecodeString(greetRequest.ztenantid, zenith_tenant_walproposer, 16)) - elog(FATAL, "Could not parse neon.tenant_id, %s", zenith_tenant_walproposer); - - greetRequest.timeline = tli; - greetRequest.walSegSize = wal_segment_size; - - InitEventSet(); -} - -static void -WalProposerStart(void) -{ - - /* Initiate connections to all safekeeper nodes */ - for (int i = 0; i < n_safekeepers; i++) - { - ResetConnection(&safekeeper[i]); - } - - WalProposerLoop(); -} - -static void -WalProposerLoop(void) -{ - while (true) - 
WalProposerPoll(); -} - -/* Initializes the internal event set, provided that it is currently null */ -static void -InitEventSet(void) -{ - if (waitEvents) - elog(FATAL, "double-initialization of event set"); - - waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + n_safekeepers); - AddWaitEventToSet(waitEvents, WL_LATCH_SET, PGINVALID_SOCKET, - MyLatch, NULL); - AddWaitEventToSet(waitEvents, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, - NULL, NULL); -} - -/* - * Updates the events we're already waiting on for the safekeeper, setting it to - * the provided `events` - * - * This function is called any time the safekeeper's state switches to one where - * it has to wait to continue. This includes the full body of AdvancePollState - * and calls to IO helper functions. - */ -static void -UpdateEventSet(Safekeeper *sk, uint32 events) -{ - /* eventPos = -1 when we don't have an event */ - Assert(sk->eventPos != -1); - - ModifyWaitEvent(waitEvents, sk->eventPos, events, NULL); -} - -/* Hack: provides a way to remove the event corresponding to an individual walproposer from the set. - * - * Note: Internally, this completely reconstructs the event set. It should be avoided if possible. - */ -static void -HackyRemoveWalProposerEvent(Safekeeper *to_remove) -{ - /* Remove the existing event set */ - if (waitEvents) - { - FreeWaitEventSet(waitEvents); - waitEvents = NULL; - } - /* Re-initialize it without adding any safekeeper events */ - InitEventSet(); - - /* - * loop through the existing safekeepers. If they aren't the one we're - * removing, and if they have a socket we can use, re-add the applicable - * events. - */ - for (int i = 0; i < n_safekeepers; i++) - { - uint32 desired_events = WL_NO_EVENTS; - Safekeeper *sk = &safekeeper[i]; - - sk->eventPos = -1; - - if (sk == to_remove) - continue; - - /* If this safekeeper isn't offline, add an event for it! 
*/ - if (sk->conn != NULL) - { - desired_events = SafekeeperStateDesiredEvents(sk->state); - sk->eventPos = AddWaitEventToSet(waitEvents, desired_events, walprop_socket(sk->conn), NULL, sk); - } - } -} - -/* Shuts down and cleans up the connection for a safekeeper. Sets its state to SS_OFFLINE */ -static void -ShutdownConnection(Safekeeper *sk) -{ - if (sk->conn) - walprop_finish(sk->conn); - sk->conn = NULL; - sk->state = SS_OFFLINE; - sk->flushWrite = false; - sk->streamingAt = InvalidXLogRecPtr; - - if (sk->voteResponse.termHistory.entries) - pfree(sk->voteResponse.termHistory.entries); - sk->voteResponse.termHistory.entries = NULL; - - HackyRemoveWalProposerEvent(sk); -} - -/* - * This function is called to establish new connection or to reestablish - * connection in case of connection failure. - * - * On success, sets the state to SS_CONNECTING_WRITE. - */ -static void -ResetConnection(Safekeeper *sk) -{ - pgsocket sock; /* socket of the new connection */ - - if (sk->state != SS_OFFLINE) - { - ShutdownConnection(sk); - } - - /* - * Try to establish new connection - * - * If the connection information hasn't been filled out, we need to do - * that here. 
- */ - if (sk->conninfo[0] == '\0') - { - int written = 0; - written = snprintf((char *) &sk->conninfo, MAXCONNINFO, - "host=%s port=%s dbname=replication options='-c ztimelineid=%s ztenantid=%s'", - sk->host, sk->port, zenith_timeline_walproposer, zenith_tenant_walproposer); - // currently connection string is not that long, but once we pass something like jwt we might overflow the buffer, - // so it is better to be defensive and check that everything aligns well - if (written > MAXCONNINFO || written < 0) - elog(FATAL, "could not create connection string for safekeeper %s:%s", sk->host, sk->port); - } - - sk->conn = walprop_connect_start((char *) &sk->conninfo); - - /* - * "If the result is null, then libpq has been unable to allocate a new - * PGconn structure" - */ - if (!sk->conn) - elog(FATAL, "failed to allocate new PGconn object"); - - /* - * PQconnectStart won't actually start connecting until we run - * PQconnectPoll. Before we do that though, we need to check that it - * didn't immediately fail. - */ - if (walprop_status(sk->conn) == WP_CONNECTION_BAD) - { - /*--- - * According to libpq docs: - * "If the result is CONNECTION_BAD, the connection attempt has already failed, - * typically because of invalid connection parameters." - * We should report this failure. - * - * https://www.postgresql.org/docs/devel/libpq-connect.html#LIBPQ-PQCONNECTSTARTPARAMS - */ - elog(WARNING, "Immediate failure to connect with node:\n\t%s\n\terror: %s", - sk->conninfo, walprop_error_message(sk->conn)); - - /* - * Even though the connection failed, we still need to clean up the - * object - */ - walprop_finish(sk->conn); - sk->conn = NULL; - return; - } - - /* - * The documentation for PQconnectStart states that we should call - * PQconnectPoll in a loop until it returns PGRES_POLLING_OK or - * PGRES_POLLING_FAILED. The other two possible returns indicate whether - * we should wait for reading or writing on the socket. 
For the first - * iteration of the loop, we're expected to wait until the socket becomes - * writable. - * - * The wording of the documentation is a little ambiguous; thankfully - * there's an example in the postgres source itself showing this behavior. - * (see libpqrcv_connect, defined in - * src/backend/replication/libpqwalreceiver/libpqwalreceiver.c) - */ - elog(LOG, "connecting with node %s:%s", sk->host, sk->port); - - sk->state = SS_CONNECTING_WRITE; - sk->startedConnAt = GetCurrentTimestamp(); - - sock = walprop_socket(sk->conn); - sk->eventPos = AddWaitEventToSet(waitEvents, WL_SOCKET_WRITEABLE, sock, NULL, sk); - return; -} - -/* - * How much milliseconds left till we should attempt reconnection to - * safekeepers? Returns 0 if it is already high time, -1 if we never reconnect - * (do we actually need this?). - */ -static long -TimeToReconnect(TimestampTz now) -{ - TimestampTz passed; - TimestampTz till_reconnect; - - if (wal_acceptor_reconnect_timeout <= 0) - return -1; - - passed = now - last_reconnect_attempt; - till_reconnect = wal_acceptor_reconnect_timeout * 1000 - passed; - if (till_reconnect <= 0) - return 0; - return (long) (till_reconnect / 1000); -} - -/* If the timeout has expired, attempt to reconnect to all offline safekeepers */ -static void -ReconnectSafekeepers(void) -{ - TimestampTz now = GetCurrentTimestamp(); - - if (TimeToReconnect(now) == 0) - { - last_reconnect_attempt = now; - for (int i = 0; i < n_safekeepers; i++) - { - if (safekeeper[i].state == SS_OFFLINE) - ResetConnection(&safekeeper[i]); - } - } -} - -/* - * Performs the logic for advancing the state machine of the specified safekeeper, - * given that a certain set of events has occured. - */ -static void -AdvancePollState(Safekeeper *sk, uint32 events) -{ - /* - * Sanity check. We assume further down that the operations don't - * block because the socket is ready. 
- */ - AssertEventsOkForState(events, sk); - - /* Execute the code corresponding to the current state */ - switch (sk->state) - { - /* - * safekeepers are only taken out of SS_OFFLINE by calls to - * ResetConnection - */ - case SS_OFFLINE: - elog(FATAL, "Unexpected safekeeper %s:%s state advancement: is offline", - sk->host, sk->port); - break; /* actually unreachable, but prevents - * -Wimplicit-fallthrough */ - - /* - * Both connecting states run the same logic. The only - * difference is the events they're expecting - */ - case SS_CONNECTING_READ: - case SS_CONNECTING_WRITE: - HandleConnectionEvent(sk); - break; - - /* - * Waiting for a successful CopyBoth response. - */ - case SS_WAIT_EXEC_RESULT: - RecvStartWALPushResult(sk); - break; - - /* - * Finish handshake comms: receive information about the safekeeper. - */ - case SS_HANDSHAKE_RECV: - RecvAcceptorGreeting(sk); - break; - - /* - * Voting is an idle state - we don't expect any events to trigger. - * Refer to the execution of SS_HANDSHAKE_RECV to see how nodes are - * transferred from SS_VOTING to sending actual vote requests. - */ - case SS_VOTING: - elog(WARNING, "EOF from node %s:%s in %s state", sk->host, - sk->port, FormatSafekeeperState(sk->state)); - ResetConnection(sk); - return; - - /* Read the safekeeper response for our candidate */ - case SS_WAIT_VERDICT: - RecvVoteResponse(sk); - break; - - /* Flush proposer announcement message */ - case SS_SEND_ELECTED_FLUSH: - - /* - * AsyncFlush ensures we only move on to SS_ACTIVE once the flush - * completes. If we still have more to do, we'll wait until the next - * poll comes along. - */ - if (!AsyncFlush(sk)) - return; - - /* flush is done, event set and state will be updated later */ - StartStreaming(sk); - break; - - /* - * Idle state for waiting votes from quorum. 
- */ - case SS_IDLE: - elog(WARNING, "EOF from node %s:%s in %s state", sk->host, - sk->port, FormatSafekeeperState(sk->state)); - ResetConnection(sk); - return; - - /* - * Active state is used for streaming WAL and receiving feedback. - */ - case SS_ACTIVE: - HandleActiveState(sk, events); - break; - } -} - -static void -HandleConnectionEvent(Safekeeper *sk) -{ - WalProposerConnectPollStatusType result = walprop_connect_poll(sk->conn); - - /* The new set of events we'll wait on, after updating */ - uint32 new_events = WL_NO_EVENTS; - - switch (result) - { - case WP_CONN_POLLING_OK: - elog(LOG, "connected with node %s:%s", sk->host, - sk->port); - - /* - * We have to pick some event to update event set. - * We'll eventually need the socket to be readable, - * so we go with that. - */ - new_events = WL_SOCKET_READABLE; - break; - - /* - * If we need to poll to finish connecting, - * continue doing that - */ - case WP_CONN_POLLING_READING: - sk->state = SS_CONNECTING_READ; - new_events = WL_SOCKET_READABLE; - break; - case WP_CONN_POLLING_WRITING: - sk->state = SS_CONNECTING_WRITE; - new_events = WL_SOCKET_WRITEABLE; - break; - - case WP_CONN_POLLING_FAILED: - elog(WARNING, "failed to connect to node '%s:%s': %s", - sk->host, sk->port, walprop_error_message(sk->conn)); - - /* - * If connecting failed, we don't want to restart - * the connection because that might run us into a - * loop. Instead, shut it down -- it'll naturally - * restart at a slower interval on calls to - * ReconnectSafekeepers. - */ - ShutdownConnection(sk); - return; - } - - /* - * Because PQconnectPoll can change the socket, we have to - * un-register the old event and re-register an event on - * the new socket. 
- */ - HackyRemoveWalProposerEvent(sk); - sk->eventPos = AddWaitEventToSet(waitEvents, new_events, walprop_socket(sk->conn), NULL, sk); - - /* If we successfully connected, send START_WAL_PUSH query */ - if (result == WP_CONN_POLLING_OK) - SendStartWALPush(sk); -} - -/* - * Send "START_WAL_PUSH" message as an empty query to the safekeeper. Performs - * a blocking send, then immediately moves to SS_WAIT_EXEC_RESULT. If something - * goes wrong, change state to SS_OFFLINE and shutdown the connection. - */ -static void -SendStartWALPush(Safekeeper *sk) -{ - char *query = NULL; - if (zenith_pageserver_connstring_walproposer != NULL) { - query = psprintf("START_WAL_PUSH %s", zenith_pageserver_connstring_walproposer); - } else { - query = psprintf("START_WAL_PUSH"); - } - if (!walprop_send_query(sk->conn, query)) - { - pfree(query); - elog(WARNING, "Failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s", - sk->host, sk->port, walprop_error_message(sk->conn)); - ShutdownConnection(sk); - return; - } - pfree(query); - sk->state = SS_WAIT_EXEC_RESULT; - UpdateEventSet(sk, WL_SOCKET_READABLE); -} - -static void -RecvStartWALPushResult(Safekeeper *sk) -{ - switch (walprop_get_query_result(sk->conn)) - { - /* - * Successful result, move on to starting the - * handshake - */ - case WP_EXEC_SUCCESS_COPYBOTH: - - SendProposerGreeting(sk); - break; - - /* - * Needs repeated calls to finish. 
Wait until the - * socket is readable - */ - case WP_EXEC_NEEDS_INPUT: - - /* - * SS_WAIT_EXEC_RESULT is always reached through an - * event, so we don't need to update the event set - */ - break; - - case WP_EXEC_FAILED: - elog(WARNING, "Failed to send query to safekeeper %s:%s: %s", - sk->host, sk->port, walprop_error_message(sk->conn)); - ShutdownConnection(sk); - return; - - /* - * Unexpected result -- funamdentally an error, but we - * want to produce a custom message, rather than a - * generic "something went wrong" - */ - case WP_EXEC_UNEXPECTED_SUCCESS: - elog(WARNING, "Received bad response from safekeeper %s:%s query execution", - sk->host, sk->port); - ShutdownConnection(sk); - return; - } -} - -/* - * Start handshake: first of all send information about the - * safekeeper. After sending, we wait on SS_HANDSHAKE_RECV for - * a response to finish the handshake. - */ -static void -SendProposerGreeting(Safekeeper *sk) -{ - /* - * On failure, logging & resetting the connection is handled. - * We just need to handle the control flow. - */ - BlockingWrite(sk, &greetRequest, sizeof(greetRequest), SS_HANDSHAKE_RECV); -} - -static void -RecvAcceptorGreeting(Safekeeper *sk) -{ - /* - * If our reading doesn't immediately succeed, any necessary - * error handling or state setting is taken care of. We can - * leave any other work until later. - */ - sk->greetResponse.apm.tag = 'g'; - if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->greetResponse)) - return; - - /* Protocol is all good, move to voting. */ - sk->state = SS_VOTING; - - ++n_connected; - if (n_connected <= quorum) - { - /* We're still collecting terms from the majority. */ - propTerm = Max(sk->greetResponse.term, propTerm); - - /* Quorum is acquried, prepare the vote request. 
*/ - if (n_connected == quorum) - { - propTerm++; - elog(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, quorum, propTerm); - - voteRequest = (VoteRequest) - { - .tag = 'v', - .term = propTerm - }; - memcpy(voteRequest.proposerId.data, greetRequest.proposerId.data, UUID_LEN); - } - } - else if (sk->greetResponse.term > propTerm) - { - /* Another compute with higher term is running. */ - elog(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", - sk->host, sk->port, - sk->greetResponse.term, propTerm); - } - - /* - * Check if we have quorum. If there aren't enough safekeepers, - * wait and do nothing. We'll eventually get a task when the - * election starts. - * - * If we do have quorum, we can start an election. - */ - if (n_connected < quorum) - { - /* - * SS_VOTING is an idle state; read-ready indicates the - * connection closed. - */ - UpdateEventSet(sk, WL_SOCKET_READABLE); - } - else - { - /* - * Now send voting request to the cohort and wait - * responses - */ - for (int j = 0; j < n_safekeepers; j++) - { - /* - * Remember: SS_VOTING indicates that the safekeeper is - * participating in voting, but hasn't sent anything - * yet. 
- */ - if (safekeeper[j].state == SS_VOTING) - SendVoteRequest(&safekeeper[j]); - } - } -} - -static void -SendVoteRequest(Safekeeper *sk) -{ - /* We have quorum for voting, send our vote request */ - elog(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, sk->host, sk->port, voteRequest.term); - /* On failure, logging & resetting is handled */ - if (!BlockingWrite(sk, &voteRequest, sizeof(voteRequest), SS_WAIT_VERDICT)) - return; - - /* If successful, wait for read-ready with SS_WAIT_VERDICT */ -} - -static void -RecvVoteResponse(Safekeeper *sk) -{ - sk->voteResponse.apm.tag = 'v'; - if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->voteResponse)) - return; - - elog(LOG, - "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X", - sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory), - LSN_FORMAT_ARGS(sk->voteResponse.flushLsn), - LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn), - LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn)); - - /* - * In case of acceptor rejecting our vote, bail out, but only - * if either it already lives in strictly higher term - * (concurrent compute spotted) or we are not elected yet and - * thus need the vote. - */ - if ((!sk->voteResponse.voteGiven) && - (sk->voteResponse.term > propTerm || n_votes < quorum)) - { - elog(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", - sk->host, sk->port, - sk->voteResponse.term, propTerm); - } - Assert(sk->voteResponse.term == propTerm); - - /* Handshake completed, do we have quorum? 
*/ - n_votes++; - if (n_votes < quorum) - { - sk->state = SS_IDLE; /* can't do much yet, no quorum */ - } - else if (n_votes > quorum) - { - /* recovery already performed, just start streaming */ - SendProposerElected(sk); - } - else - { - sk->state = SS_IDLE; - UpdateEventSet(sk, WL_SOCKET_READABLE); /* Idle states wait for - * read-ready */ - - HandleElectedProposer(); - } -} - -/* - * Called once a majority of acceptors have voted for us and current proposer - * has been elected. - * - * Sends ProposerElected message to all acceptors in SS_IDLE state and starts - * replication from walsender. - */ -static void -HandleElectedProposer(void) -{ - DetermineEpochStartLsn(); - - /* - * Check if not all safekeepers are up-to-date, we need to - * download WAL needed to synchronize them - */ - if (truncateLsn < propEpochStartLsn) - { - elog(LOG, - "start recovery because truncateLsn=%X/%X is not " - "equal to epochStartLsn=%X/%X", - LSN_FORMAT_ARGS(truncateLsn), - LSN_FORMAT_ARGS(propEpochStartLsn)); - /* Perform recovery */ - if (!WalProposerRecovery(donor, greetRequest.timeline, truncateLsn, propEpochStartLsn)) - elog(FATAL, "Failed to recover state"); - } - else if (syncSafekeepers) - { - /* Sync is not needed: just exit */ - fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(propEpochStartLsn)); - exit(0); - } - - for (int i = 0; i < n_safekeepers; i++) - { - if (safekeeper[i].state == SS_IDLE) - SendProposerElected(&safekeeper[i]); - } - - /* - * The proposer has been elected, and there will be no quorum waiting - * after this point. There will be no safekeeper with state SS_IDLE - * also, because that state is used only for quorum waiting. - */ - - if (syncSafekeepers) - { - /* - * Send empty message to enforce receiving feedback - * even from nodes who are fully recovered; this is - * required to learn they switched epoch which finishes - * sync-safeekepers who doesn't generate any real new - * records. Will go away once we switch to async acks. 
- */ - BroadcastAppendRequest(); - - /* keep polling until all safekeepers are synced */ - return; - } - - WalProposerStartStreaming(propEpochStartLsn); - /* Should not return here */ -} - -/* latest term in TermHistory, or 0 is there is no entries */ -static term_t -GetHighestTerm(TermHistory *th) -{ - return th->n_entries > 0 ? th->entries[th->n_entries - 1].term : 0; -} - -/* safekeeper's epoch is the term of the highest entry in the log */ -static term_t -GetEpoch(Safekeeper *sk) -{ - return GetHighestTerm(&sk->voteResponse.termHistory); -} - -/* If LSN points to the page header, skip it */ -static XLogRecPtr -SkipXLogPageHeader(XLogRecPtr lsn) -{ - if (XLogSegmentOffset(lsn, wal_segment_size) == 0) - { - lsn += SizeOfXLogLongPHD; - } - else if (lsn % XLOG_BLCKSZ == 0) - { - lsn += SizeOfXLogShortPHD; - } - return lsn; -} - -/* - * Called after majority of acceptors gave votes, it calculates the most - * advanced safekeeper (who will be the donor) and epochStartLsn -- LSN since - * which we'll write WAL in our term. - * - * Sets truncateLsn along the way (though it is not of much use at this point -- - * only for skipping recovery). 
- */ -static void -DetermineEpochStartLsn(void) -{ - TermHistory *dth; - - propEpochStartLsn = InvalidXLogRecPtr; - donorEpoch = 0; - truncateLsn = InvalidXLogRecPtr; - timelineStartLsn = InvalidXLogRecPtr; - - for (int i = 0; i < n_safekeepers; i++) - { - if (safekeeper[i].state == SS_IDLE) - { - if (GetEpoch(&safekeeper[i]) > donorEpoch || - (GetEpoch(&safekeeper[i]) == donorEpoch && - safekeeper[i].voteResponse.flushLsn > propEpochStartLsn)) - { - donorEpoch = GetEpoch(&safekeeper[i]); - propEpochStartLsn = safekeeper[i].voteResponse.flushLsn; - donor = i; - } - truncateLsn = Max(safekeeper[i].voteResponse.truncateLsn, truncateLsn); - - if (safekeeper[i].voteResponse.timelineStartLsn != InvalidXLogRecPtr) - { - /* timelineStartLsn should be the same everywhere or unknown */ - if (timelineStartLsn != InvalidXLogRecPtr && - timelineStartLsn != safekeeper[i].voteResponse.timelineStartLsn) - { - elog(WARNING, - "inconsistent timelineStartLsn: current %X/%X, received %X/%X", - LSN_FORMAT_ARGS(timelineStartLsn), - LSN_FORMAT_ARGS(safekeeper[i].voteResponse.timelineStartLsn)); - } - timelineStartLsn = safekeeper[i].voteResponse.timelineStartLsn; - } - } - } - - /* - * If propEpochStartLsn is 0 everywhere, we are bootstrapping -- nothing was - * committed yet. Start streaming then from the basebackup LSN. - */ - if (propEpochStartLsn == InvalidXLogRecPtr && !syncSafekeepers) - { - propEpochStartLsn = truncateLsn = GetRedoStartLsn(); - if (timelineStartLsn == InvalidXLogRecPtr) - { - timelineStartLsn = GetRedoStartLsn(); - } - elog(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(propEpochStartLsn)); - } - - /* - * If propEpochStartLsn is not 0, at least one msg with WAL was sent to - * some connected safekeeper; it must have carried truncateLsn pointing to - * the first record. 
- */ - Assert((truncateLsn != InvalidXLogRecPtr) || - (syncSafekeepers && truncateLsn == propEpochStartLsn)); - - /* - * We will be generating WAL since propEpochStartLsn, so we should set - * availableLsn to mark this LSN as the latest available position. - */ - availableLsn = propEpochStartLsn; - - /* - * Proposer's term history is the donor's + its own entry. - */ - dth = &safekeeper[donor].voteResponse.termHistory; - propTermHistory.n_entries = dth->n_entries + 1; - propTermHistory.entries = palloc(sizeof(TermSwitchEntry) * propTermHistory.n_entries); - memcpy(propTermHistory.entries, dth->entries, sizeof(TermSwitchEntry) * dth->n_entries); - propTermHistory.entries[propTermHistory.n_entries - 1].term = propTerm; - propTermHistory.entries[propTermHistory.n_entries - 1].lsn = propEpochStartLsn; - - elog(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X", - quorum, - propTerm, - LSN_FORMAT_ARGS(propEpochStartLsn), - safekeeper[donor].host, safekeeper[donor].port, - LSN_FORMAT_ARGS(truncateLsn) - ); - - /* - * Ensure the basebackup we are running (at RedoStartLsn) matches LSN since - * which we are going to write according to the consensus. If not, we must - * bail out, as clog and other non rel data is inconsistent. - */ - if (!syncSafekeepers) - { - /* - * Basebackup LSN always points to the beginning of the record (not the - * page), as StartupXLOG most probably wants it this way. Safekeepers - * don't skip header as they need continious stream of data, so - * correct LSN for comparison. - */ - if (SkipXLogPageHeader(propEpochStartLsn) != GetRedoStartLsn()) - { - /* - * However, allow to proceed if previously elected leader was me; plain - * restart of walproposer not intervened by concurrent compute (who could - * generate WAL) is ok. 
- */ - if (!((dth->n_entries >= 1) && (dth->entries[dth->n_entries - 1].term == - walprop_shared->mineLastElectedTerm))) - { - elog(PANIC, - "collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X", - LSN_FORMAT_ARGS(propEpochStartLsn), - LSN_FORMAT_ARGS(GetRedoStartLsn())); - } - } - walprop_shared->mineLastElectedTerm = propTerm; - } -} - -/* - * Receive WAL from most advanced safekeeper - */ -static bool -WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos) -{ - char conninfo[MAXCONNINFO]; - char *err; - WalReceiverConn *wrconn; - WalRcvStreamOptions options; - - sprintf(conninfo, "host=%s port=%s dbname=replication options='-c ztimelineid=%s ztenantid=%s'", - safekeeper[donor].host, safekeeper[donor].port, zenith_timeline_walproposer, zenith_tenant_walproposer); - wrconn = walrcv_connect(conninfo, false, "wal_proposer_recovery", &err); - if (!wrconn) - { - ereport(WARNING, - (errmsg("could not connect to WAL acceptor %s:%s: %s", - safekeeper[donor].host, safekeeper[donor].port, - err))); - return false; - } - elog(LOG, - "start recovery from %s:%s starting from %X/%08X till %X/%08X timeline " - "%d", - safekeeper[donor].host, safekeeper[donor].port, (uint32) (startpos >> 32), - (uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline); - - options.logical = false; - options.startpoint = startpos; - options.slotname = NULL; - options.proto.physical.startpointTLI = timeline; - - if (walrcv_startstreaming(wrconn, &options)) - { - XLogRecPtr rec_start_lsn; - XLogRecPtr rec_end_lsn = 0; - int len; - char *buf; - pgsocket wait_fd = PGINVALID_SOCKET; - - while ((len = walrcv_receive(wrconn, &buf, &wait_fd)) >= 0) - { - if (len == 0) - { - (void) WaitLatchOrSocket( - MyLatch, WL_EXIT_ON_PM_DEATH | WL_SOCKET_READABLE, wait_fd, - -1, WAIT_EVENT_WAL_RECEIVER_MAIN); - } - else - { - Assert(buf[0] == 'w' || buf[0] == 'k'); - if (buf[0] == 'k') - continue; /* keepalive */ - memcpy(&rec_start_lsn, 
&buf[XLOG_HDR_START_POS], - sizeof rec_start_lsn); - rec_start_lsn = pg_ntoh64(rec_start_lsn); - rec_end_lsn = rec_start_lsn + len - XLOG_HDR_SIZE; - - /* write WAL to disk */ - XLogWalPropWrite(&buf[XLOG_HDR_SIZE], len - XLOG_HDR_SIZE, rec_start_lsn); - - ereport(DEBUG1, - (errmsg("Recover message %X/%X length %d", - LSN_FORMAT_ARGS(rec_start_lsn), len))); - if (rec_end_lsn >= endpos) - break; - } - } - ereport(LOG, - (errmsg("end of replication stream at %X/%X: %m", - LSN_FORMAT_ARGS(rec_end_lsn)))); - walrcv_disconnect(wrconn); - - /* failed to receive all WAL till endpos */ - if (rec_end_lsn < endpos) - return false; - } - else - { - ereport(LOG, - (errmsg("primary server contains no more WAL on requested timeline %u LSN %X/%08X", - timeline, (uint32) (startpos >> 32), (uint32) startpos))); - return false; - } - - return true; -} - -/* - * Determine for sk the starting streaming point and send it message - * 1) Announcing we are elected proposer (which immediately advances epoch if - * safekeeper is synced, being important for sync-safekeepers) - * 2) Communicating starting streaming point -- safekeeper must truncate its WAL - * beyond it -- and history of term switching. - * - * Sets sk->startStreamingAt. - */ -static void -SendProposerElected(Safekeeper *sk) -{ - ProposerElected msg; - TermHistory *th; - term_t lastCommonTerm; - int i; - - /* - * Determine start LSN by comparing safekeeper's log term switch history and - * proposer's, searching for the divergence point. - * - * Note: there is a vanishingly small chance of no common point even if - * there is some WAL on safekeeper, if immediately after bootstrap compute - * wrote some WAL on single sk and died; we stream since the beginning then. - */ - th = &sk->voteResponse.termHistory; - /* - * If any WAL is present on the sk, it must be authorized by some term. - * OTOH, without any WAL there are no term swiches in the log. 
- */ - Assert((th->n_entries == 0) == - (sk->voteResponse.flushLsn == InvalidXLogRecPtr)); - /* We must start somewhere. */ - Assert(propTermHistory.n_entries >= 1); - - for (i = 0; i < Min(propTermHistory.n_entries, th->n_entries); i++) - { - if (propTermHistory.entries[i].term != th->entries[i].term) - break; - /* term must begin everywhere at the same point */ - Assert(propTermHistory.entries[i].lsn == th->entries[i].lsn); - } - i--; /* step back to the last common term */ - if (i < 0) - { - /* safekeeper is empty or no common point, start from the beginning */ - sk->startStreamingAt = propTermHistory.entries[0].lsn; - - if (sk->startStreamingAt < truncateLsn) - { - /* - * There's a gap between the WAL starting point and a truncateLsn, - * which can't appear in a normal working cluster. That gap means - * that all safekeepers reported that they have persisted WAL up - * to the truncateLsn before, but now current safekeeper tells - * otherwise. - * - * Also we have a special condition here, which is empty safekeeper - * with no history. In combination with a gap, that can happen when - * we introduce a new safekeeper to the cluster. This is a rare case, - * which is triggered manually for now, and should be treated with - * care. - */ - - /* - * truncateLsn will not change without ack from current safekeeper, - * and it's aligned to the WAL record, so we can safely start - * streaming from this point. - */ - sk->startStreamingAt = truncateLsn; - - elog(WARNING, "empty safekeeper joined cluster as %s:%s, historyStart=%X/%X, sk->startStreamingAt=%X/%X", - sk->host, sk->port, LSN_FORMAT_ARGS(propTermHistory.entries[0].lsn), - LSN_FORMAT_ARGS(sk->startStreamingAt)); - } - } - else - { - /* - * End of (common) term is the start of the next except it is the last - * one; there it is flush_lsn in case of safekeeper or, in case of - * proposer, LSN it is currently writing, but then we just pick - * safekeeper pos as it obviously can't be higher. 
- */ - if (propTermHistory.entries[i].term == propTerm) - { - sk->startStreamingAt = sk->voteResponse.flushLsn; - } - else - { - XLogRecPtr propEndLsn = propTermHistory.entries[i + 1].lsn; - XLogRecPtr skEndLsn = (i + 1 < th->n_entries ? th->entries[i + 1].lsn : - sk->voteResponse.flushLsn); - sk->startStreamingAt = Min(propEndLsn, skEndLsn); - } - } - - Assert(sk->startStreamingAt >= truncateLsn && sk->startStreamingAt <= availableLsn); - - msg.tag = 'e'; - msg.term = propTerm; - msg.startStreamingAt = sk->startStreamingAt; - msg.termHistory = &propTermHistory; - msg.timelineStartLsn = timelineStartLsn; - - lastCommonTerm = i >= 0 ? propTermHistory.entries[i].term : 0; - elog(LOG, - "sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X", - sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn)); - - resetStringInfo(&sk->outbuf); - pq_sendint64_le(&sk->outbuf, msg.tag); - pq_sendint64_le(&sk->outbuf, msg.term); - pq_sendint64_le(&sk->outbuf, msg.startStreamingAt); - pq_sendint32_le(&sk->outbuf, msg.termHistory->n_entries); - for (int i = 0; i < msg.termHistory->n_entries; i++) - { - pq_sendint64_le(&sk->outbuf, msg.termHistory->entries[i].term); - pq_sendint64_le(&sk->outbuf, msg.termHistory->entries[i].lsn); - } - pq_sendint64_le(&sk->outbuf, msg.timelineStartLsn); - - if (!AsyncWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_SEND_ELECTED_FLUSH)) - return; - - StartStreaming(sk); -} - -/* - * Start walsender streaming replication - */ -static void -WalProposerStartStreaming(XLogRecPtr startpos) -{ - StartReplicationCmd cmd; - - elog(LOG, "WAL proposer starts streaming at %X/%X", - LSN_FORMAT_ARGS(startpos)); - cmd.slotname = WAL_PROPOSER_SLOT_NAME; - cmd.timeline = greetRequest.timeline; - cmd.startpoint = 
startpos; - StartReplication(&cmd); -} - -/* - * Start streaming to safekeeper sk, always updates state to SS_ACTIVE and sets - * correct event set. - */ -static void -StartStreaming(Safekeeper *sk) -{ - /* - * This is the only entrypoint to state SS_ACTIVE. It's executed - * exactly once for a connection. - */ - sk->state = SS_ACTIVE; - sk->streamingAt = sk->startStreamingAt; - - /* event set will be updated inside SendMessageToNode */ - SendMessageToNode(sk); -} - -/* - * Try to send message to the particular node. Always updates event set. Will - * send at least one message, if socket is ready. - * - * Can be used only for safekeepers in SS_ACTIVE state. State can be changed - * in case of errors. - */ -static void -SendMessageToNode(Safekeeper *sk) -{ - Assert(sk->state == SS_ACTIVE); - - /* Note: we always send everything to the safekeeper until WOULDBLOCK or nothing left to send */ - HandleActiveState(sk, WL_SOCKET_WRITEABLE); -} - -/* - * Broadcast new message to all caught-up safekeepers - */ -static void -BroadcastAppendRequest() -{ - for (int i = 0; i < n_safekeepers; i++) - if (safekeeper[i].state == SS_ACTIVE) - SendMessageToNode(&safekeeper[i]); -} - -static void -PrepareAppendRequest(AppendRequestHeader *req, XLogRecPtr beginLsn, XLogRecPtr endLsn) -{ - Assert(endLsn >= beginLsn); - req->tag = 'a'; - req->term = propTerm; - req->epochStartLsn = propEpochStartLsn; - req->beginLsn = beginLsn; - req->endLsn = endLsn; - req->commitLsn = GetAcknowledgedByQuorumWALPosition(); - req->truncateLsn = truncateLsn; - req->proposerId = greetRequest.proposerId; -} - -/* - * Process all events happened in SS_ACTIVE state, update event set after that. 
- */ -static void -HandleActiveState(Safekeeper *sk, uint32 events) -{ - uint32 newEvents = WL_SOCKET_READABLE; - - if (events & WL_SOCKET_WRITEABLE) - if (!SendAppendRequests(sk)) - return; - - if (events & WL_SOCKET_READABLE) - if (!RecvAppendResponses(sk)) - return; - - /* - * We should wait for WL_SOCKET_WRITEABLE event if we have unflushed data - * in the buffer. - * - * LSN comparison checks if we have pending unsent messages. This check isn't - * necessary now, because we always send append messages immediately after - * arrival. But it's good to have it here in case we change this behavior - * in the future. - */ - if (sk->streamingAt != availableLsn || sk->flushWrite) - newEvents |= WL_SOCKET_WRITEABLE; - - UpdateEventSet(sk, newEvents); -} - -/* - * Send WAL messages starting from sk->streamingAt until the end or non-writable - * socket, whichever comes first. Caller should take care of updating event set. - * Even if no unsent WAL is available, at least one empty message will be sent - * as a heartbeat, if socket is ready. - * - * Can change state if Async* functions encounter errors and reset connection. - * Returns false in this case, true otherwise. - */ -static bool -SendAppendRequests(Safekeeper *sk) -{ - XLogRecPtr endLsn; - AppendRequestHeader *req; - PGAsyncWriteResult writeResult; - WALReadError errinfo; - bool sentAnything = false; - // FIXME Is it ok to use hardcoded value here? - TimeLineID tli = 1; - - if (sk->flushWrite) - { - if (!AsyncFlush(sk)) - /* - * AsyncFlush failed, that could happen if the socket is closed or - * we have nothing to write and should wait for writeable socket. 
- */ - return sk->state == SS_ACTIVE; - - /* Event set will be updated in the end of HandleActiveState */ - sk->flushWrite = false; - } - - while (sk->streamingAt != availableLsn || !sentAnything) - { - sentAnything = true; - - endLsn = sk->streamingAt; - endLsn += MAX_SEND_SIZE; - - /* if we went beyond available WAL, back off */ - if (endLsn > availableLsn) { - endLsn = availableLsn; - } - - req = &sk->appendRequest; - PrepareAppendRequest(&sk->appendRequest, sk->streamingAt, endLsn); - - ereport(DEBUG2, - (errmsg("sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s", - req->endLsn - req->beginLsn, - LSN_FORMAT_ARGS(req->beginLsn), - LSN_FORMAT_ARGS(req->endLsn), - LSN_FORMAT_ARGS(req->commitLsn), - LSN_FORMAT_ARGS(truncateLsn), sk->host, sk->port))); - - resetStringInfo(&sk->outbuf); - - /* write AppendRequest header */ - appendBinaryStringInfo(&sk->outbuf, (char*) req, sizeof(AppendRequestHeader)); - - /* write the WAL itself */ - enlargeStringInfo(&sk->outbuf, req->endLsn - req->beginLsn); - if (!WALRead(sk->xlogreader, - &sk->outbuf.data[sk->outbuf.len], - req->beginLsn, - req->endLsn - req->beginLsn, - tli, - &errinfo)) - { - WALReadRaiseError(&errinfo); - } - sk->outbuf.len += req->endLsn - req->beginLsn; - - writeResult = walprop_async_write(sk->conn, sk->outbuf.data, sk->outbuf.len); - - /* Mark current message as sent, whatever the result is */ - sk->streamingAt = endLsn; - - switch (writeResult) - { - case PG_ASYNC_WRITE_SUCCESS: - /* Continue writing the next message */ - break; - - case PG_ASYNC_WRITE_TRY_FLUSH: - /* - * We still need to call PQflush some more to finish the job. - * Caller function will handle this by setting right event set. 
- */ - sk->flushWrite = true; - return true; - - case PG_ASYNC_WRITE_FAIL: - elog(WARNING, "Failed to send to node %s:%s in %s state: %s", - sk->host, sk->port, FormatSafekeeperState(sk->state), - walprop_error_message(sk->conn)); - ShutdownConnection(sk); - return false; - default: - Assert(false); - return false; - } - } - - return true; -} - -/* - * Receive and process all available feedback. - * - * Can change state if Async* functions encounter errors and reset connection. - * Returns false in this case, true otherwise. - * - * NB: This function can call SendMessageToNode and produce new messages. - */ -static bool -RecvAppendResponses(Safekeeper *sk) -{ - XLogRecPtr minQuorumLsn; - bool readAnything = false; - - while (true) - { - /* - * If our reading doesn't immediately succeed, any - * necessary error handling or state setting is taken care - * of. We can leave any other work until later. - */ - sk->appendResponse.apm.tag = 'a'; - if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->appendResponse)) - break; - - ereport(DEBUG2, - (errmsg("received message term=" INT64_FORMAT " flushLsn=%X/%X commitLsn=%X/%X from %s:%s", - sk->appendResponse.term, - LSN_FORMAT_ARGS(sk->appendResponse.flushLsn), - LSN_FORMAT_ARGS(sk->appendResponse.commitLsn), - sk->host, sk->port))); - - if (sk->appendResponse.term > propTerm) - { - /* Another compute with higher term is running. */ - elog(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "", - sk->host, sk->port, - sk->appendResponse.term, propTerm); - } - - readAnything = true; - } - - if (!readAnything) - return sk->state == SS_ACTIVE; - - HandleSafekeeperResponse(); - - /* - * Also send the new commit lsn to all the safekeepers. 
- */ - minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); - if (minQuorumLsn > lastSentCommitLsn) - { - BroadcastAppendRequest(); - lastSentCommitLsn = minQuorumLsn; - } - - return sk->state == SS_ACTIVE; -} - -/* Parse a ReplicationFeedback message, or the ReplicationFeedback part of an AppendResponse */ -void -ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback *rf) -{ - uint8 nkeys; - int i; - int32 len; - - /* get number of custom keys */ - nkeys = pq_getmsgbyte(reply_message); - - for (i = 0; i < nkeys; i++) - { - const char *key = pq_getmsgstring(reply_message); - if (strcmp(key, "current_timeline_size") == 0) - { - pq_getmsgint(reply_message, sizeof(int32)); // read value length - rf->currentClusterSize = pq_getmsgint64(reply_message); - elog(DEBUG2, "ParseReplicationFeedbackMessage: current_timeline_size %lu", - rf->currentClusterSize); - } - else if (strcmp(key, "ps_writelsn") == 0) - { - pq_getmsgint(reply_message, sizeof(int32)); // read value length - rf->ps_writelsn = pq_getmsgint64(reply_message); - elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_writelsn %X/%X", - LSN_FORMAT_ARGS(rf->ps_writelsn)); - } - else if (strcmp(key, "ps_flushlsn") == 0) - { - pq_getmsgint(reply_message, sizeof(int32)); // read value length - rf->ps_flushlsn = pq_getmsgint64(reply_message); - elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_flushlsn %X/%X", - LSN_FORMAT_ARGS(rf->ps_flushlsn)); - } - else if (strcmp(key, "ps_applylsn") == 0) - { - pq_getmsgint(reply_message, sizeof(int32)); // read value length - rf->ps_applylsn = pq_getmsgint64(reply_message); - elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_applylsn %X/%X", - LSN_FORMAT_ARGS(rf->ps_applylsn)); - } - else if (strcmp(key, "ps_replytime") == 0) - { - pq_getmsgint(reply_message, sizeof(int32)); // read value length - rf->ps_replytime = pq_getmsgint64(reply_message); - { - char *replyTimeStr; - - /* Copy because timestamptz_to_str returns a static buffer */ - replyTimeStr = 
pstrdup(timestamptz_to_str(rf->ps_replytime)); - elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_replytime %lu reply_time: %s", - rf->ps_replytime, replyTimeStr); - - pfree(replyTimeStr); - } - } - else - { - len = pq_getmsgint(reply_message, sizeof(int32)); // read value length - // Skip unknown keys to support backward compatibile protocol changes - elog(LOG, "ParseReplicationFeedbackMessage: unknown key: %s len %d", key, len); - pq_getmsgbytes(reply_message, len); - }; - } -} - -/* - * Combine hot standby feedbacks from all safekeepers. - */ -static void -CombineHotStanbyFeedbacks(HotStandbyFeedback * hs) -{ - hs->ts = 0; - hs->xmin.value = ~0; /* largest unsigned value */ - hs->catalog_xmin.value = ~0; /* largest unsigned value */ - - for (int i = 0; i < n_safekeepers; i++) - { - if (safekeeper[i].appendResponse.hs.ts != 0) - { - if (FullTransactionIdPrecedes(safekeeper[i].appendResponse.hs.xmin, hs->xmin)) - { - hs->xmin = safekeeper[i].appendResponse.hs.xmin; - hs->ts = safekeeper[i].appendResponse.hs.ts; - } - if (FullTransactionIdPrecedes(safekeeper[i].appendResponse.hs.catalog_xmin, hs->catalog_xmin)) - { - hs->catalog_xmin = safekeeper[i].appendResponse.hs.catalog_xmin; - hs->ts = safekeeper[i].appendResponse.hs.ts; - } - } - } -} - - -/* - * Get minimum of flushed LSNs of all safekeepers, which is the LSN of the - * last WAL record that can be safely discarded. 
- */ -static XLogRecPtr -CalculateMinFlushLsn(void) -{ - XLogRecPtr lsn = UnknownXLogRecPtr; - for (int i = 0; i < n_safekeepers; i++) - { - if (safekeeper[i].appendResponse.flushLsn < lsn) - lsn = safekeeper[i].appendResponse.flushLsn; - } - return lsn; -} - -/* - * Calculate WAL position acknowledged by quorum - */ -static XLogRecPtr -GetAcknowledgedByQuorumWALPosition(void) -{ - XLogRecPtr responses[MAX_SAFEKEEPERS]; - - /* - * Sort acknowledged LSNs - */ - for (int i = 0; i < n_safekeepers; i++) - { - /* - * Like in Raft, we aren't allowed to commit entries from previous - * terms, so ignore reported LSN until it gets to epochStartLsn. - */ - responses[i] = safekeeper[i].appendResponse.flushLsn >= propEpochStartLsn ? - safekeeper[i].appendResponse.flushLsn : 0; - } - qsort(responses, n_safekeepers, sizeof(XLogRecPtr), CompareLsn); - - /* - * Get the smallest LSN committed by quorum - */ - return responses[n_safekeepers - quorum]; -} - -/* - * ReplicationFeedbackShmemSize --- report amount of shared memory space needed - */ -Size -WalproposerShmemSize(void) -{ - return sizeof(WalproposerShmemState); -} - -bool -WalproposerShmemInit(void) -{ - bool found; - - LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); - walprop_shared = ShmemInitStruct("Walproposer shared state", - sizeof(WalproposerShmemState), - &found); - - if (!found) - { - memset(walprop_shared, 0, WalproposerShmemSize()); - SpinLockInit(&walprop_shared->mutex); - } - LWLockRelease(AddinShmemInitLock); - - return found; -} - -void -replication_feedback_set(ReplicationFeedback *rf) -{ - SpinLockAcquire(&walprop_shared->mutex); - memcpy(&walprop_shared->feedback, rf, sizeof(ReplicationFeedback)); - SpinLockRelease(&walprop_shared->mutex); -} - - -void -replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn) -{ - SpinLockAcquire(&walprop_shared->mutex); - *writeLsn = walprop_shared->feedback.ps_writelsn; - *flushLsn = walprop_shared->feedback.ps_flushlsn; - 
*applyLsn = walprop_shared->feedback.ps_applylsn; - SpinLockRelease(&walprop_shared->mutex); -} - - -/* - * Get ReplicationFeedback fields from the most advanced safekeeper - */ -static void -GetLatestZentihFeedback(ReplicationFeedback *rf) -{ - int latest_safekeeper = 0; - XLogRecPtr ps_writelsn = InvalidXLogRecPtr; - for (int i = 0; i < n_safekeepers; i++) - { - if (safekeeper[i].appendResponse.rf.ps_writelsn > ps_writelsn) - { - latest_safekeeper = i; - ps_writelsn = safekeeper[i].appendResponse.rf.ps_writelsn; - } - } - - rf->currentClusterSize = safekeeper[latest_safekeeper].appendResponse.rf.currentClusterSize; - rf->ps_writelsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_writelsn; - rf->ps_flushlsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_flushlsn; - rf->ps_applylsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_applylsn; - rf->ps_replytime = safekeeper[latest_safekeeper].appendResponse.rf.ps_replytime; - - elog(DEBUG2, "GetLatestZentihFeedback: currentClusterSize %lu," - " ps_writelsn %X/%X, ps_flushlsn %X/%X, ps_applylsn %X/%X, ps_replytime %lu", - rf->currentClusterSize, - LSN_FORMAT_ARGS(rf->ps_writelsn), - LSN_FORMAT_ARGS(rf->ps_flushlsn), - LSN_FORMAT_ARGS(rf->ps_applylsn), - rf->ps_replytime); - - replication_feedback_set(rf); -} - -static void -HandleSafekeeperResponse(void) -{ - HotStandbyFeedback hsFeedback; - XLogRecPtr minQuorumLsn; - XLogRecPtr diskConsistentLsn; - XLogRecPtr minFlushLsn; - - - minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); - diskConsistentLsn = quorumFeedback.rf.ps_flushlsn; - - if (!syncSafekeepers) - { - // Get ReplicationFeedback fields from the most advanced safekeeper - GetLatestZentihFeedback(&quorumFeedback.rf); - SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize); - } - - if (minQuorumLsn > quorumFeedback.flushLsn || diskConsistentLsn != quorumFeedback.rf.ps_flushlsn) - { - - if (minQuorumLsn > quorumFeedback.flushLsn) - quorumFeedback.flushLsn = minQuorumLsn; - - /* 
advance the replication slot */ - if (!syncSafekeepers) - ProcessStandbyReply( - // write_lsn - This is what durably stored in WAL service. - quorumFeedback.flushLsn, - //flush_lsn - This is what durably stored in WAL service. - quorumFeedback.flushLsn, - //apply_lsn - This is what processed and durably saved at pageserver. - quorumFeedback.rf.ps_flushlsn, - GetCurrentTimestamp(), false); - } - - CombineHotStanbyFeedbacks(&hsFeedback); - if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &quorumFeedback.hs, sizeof hsFeedback) != 0) - { - quorumFeedback.hs = hsFeedback; - if (!syncSafekeepers) - ProcessStandbyHSFeedback(hsFeedback.ts, - XidFromFullTransactionId(hsFeedback.xmin), - EpochFromFullTransactionId(hsFeedback.xmin), - XidFromFullTransactionId(hsFeedback.catalog_xmin), - EpochFromFullTransactionId(hsFeedback.catalog_xmin)); - } - - /* - * Try to advance truncateLsn to minFlushLsn, which is the last record - * flushed to all safekeepers. We must always start streaming from the - * beginning of the record, which simplifies decoding on the far end. - * - * Advanced truncateLsn should be not further than nearest commitLsn. - * This prevents surprising violation of truncateLsn <= commitLsn - * invariant which might occur because 1) truncateLsn can be advanced - * immediately once chunk is broadcast to all safekeepers, and - * commitLsn generally can't be advanced based on feedback from - * safekeeper who is still in the previous epoch (similar to 'leader - * can't commit entries from previous term' in Raft); 2) chunks we - * read from WAL and send are plain sheets of bytes, but safekeepers - * ack only on record boundaries. - */ - minFlushLsn = CalculateMinFlushLsn(); - if (minFlushLsn > truncateLsn) - { - truncateLsn = minFlushLsn; - - /* - * Advance the replication slot to free up old WAL files. Note - * that slot doesn't exist if we are in syncSafekeepers mode. 
- */ - if (MyReplicationSlot) - PhysicalConfirmReceivedLocation(truncateLsn); - } - - /* - * Generally sync is done when majority switched the epoch so we committed - * epochStartLsn and made the majority aware of it, ensuring they are - * ready to give all WAL to pageserver. It would mean whichever majority - * is alive, there will be at least one safekeeper who is able to stream - * WAL to pageserver to make basebackup possible. However, since at the - * moment we don't have any good mechanism of defining the healthy and - * most advanced safekeeper who should push the wal into pageserver and - * basically the random one gets connected, to prevent hanging basebackup - * (due to pageserver connecting to not-synced-safekeeper) we currently - * wait for all seemingly alive safekeepers to get synced. - */ - if (syncSafekeepers) - { - int n_synced; - - n_synced = 0; - for (int i = 0; i < n_safekeepers; i++) - { - Safekeeper *sk = &safekeeper[i]; - bool synced = sk->appendResponse.commitLsn >= propEpochStartLsn; - - /* alive safekeeper which is not synced yet; wait for it */ - if (sk->state != SS_OFFLINE && !synced) - return; - if (synced) - n_synced++; - } - if (n_synced >= quorum) - { - /* All safekeepers synced! */ - fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(propEpochStartLsn)); - exit(0); - } - } -} - -/* - * Try to read CopyData message from i'th safekeeper, resetting connection on - * failure. 
- */ -static bool -AsyncRead(Safekeeper *sk, char **buf, int *buf_size) -{ - switch (walprop_async_read(sk->conn, buf, buf_size)) - { - case PG_ASYNC_READ_SUCCESS: - return true; - - case PG_ASYNC_READ_TRY_AGAIN: - /* WL_SOCKET_READABLE is always set during copyboth */ - return false; - - case PG_ASYNC_READ_FAIL: - elog(WARNING, "Failed to read from node %s:%s in %s state: %s", sk->host, - sk->port, FormatSafekeeperState(sk->state), - walprop_error_message(sk->conn)); - ShutdownConnection(sk); - return false; - } - Assert(false); - return false; -} - -/* - * Read next message with known type into provided struct, by reading a CopyData - * block from the safekeeper's postgres connection, returning whether the read - * was successful. - * - * If the read needs more polling, we return 'false' and keep the state - * unmodified, waiting until it becomes read-ready to try again. If it fully - * failed, a warning is emitted and the connection is reset. - */ -static bool -AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg) -{ - char *buf; - int buf_size; - uint64 tag; - StringInfoData s; - - if (!(AsyncRead(sk, &buf, &buf_size))) - return false; - - /* parse it */ - s.data = buf; - s.len = buf_size; - s.cursor = 0; - - tag = pq_getmsgint64_le(&s); - if (tag != anymsg->tag) - { - elog(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host, - sk->port, FormatSafekeeperState(sk->state)); - ResetConnection(sk); - return false; - } - - switch (tag) - { - case 'g': - { - AcceptorGreeting *msg = (AcceptorGreeting *) anymsg; - msg->term = pq_getmsgint64_le(&s); - msg->nodeId = pq_getmsgint64_le(&s); - pq_getmsgend(&s); - return true; - } - - case 'v': - { - VoteResponse *msg = (VoteResponse *) anymsg; - - msg->term = pq_getmsgint64_le(&s); - msg->voteGiven = pq_getmsgint64_le(&s); - msg->flushLsn = pq_getmsgint64_le(&s); - msg->truncateLsn = pq_getmsgint64_le(&s); - msg->termHistory.n_entries = pq_getmsgint32_le(&s); - 
msg->termHistory.entries = palloc(sizeof(TermSwitchEntry) * msg->termHistory.n_entries); - for (int i = 0; i < msg->termHistory.n_entries; i++) - { - msg->termHistory.entries[i].term = pq_getmsgint64_le(&s); - msg->termHistory.entries[i].lsn = pq_getmsgint64_le(&s); - } - msg->timelineStartLsn = pq_getmsgint64_le(&s); - pq_getmsgend(&s); - return true; - } - - case 'a': - { - AppendResponse *msg = (AppendResponse *) anymsg; - msg->term = pq_getmsgint64_le(&s); - msg->flushLsn = pq_getmsgint64_le(&s); - msg->commitLsn = pq_getmsgint64_le(&s); - msg->hs.ts = pq_getmsgint64_le(&s); - msg->hs.xmin.value = pq_getmsgint64_le(&s); - msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s); - if (buf_size > APPENDRESPONSE_FIXEDPART_SIZE) - ParseReplicationFeedbackMessage(&s, &msg->rf); - pq_getmsgend(&s); - return true; - } - - default: - { - Assert(false); - return false; - } - } -} - -/* - * Blocking equivalent to AsyncWrite. - * - * We use this everywhere messages are small enough that they should fit in a - * single packet. - */ -static bool -BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState success_state) -{ - uint32 events; - - if (!walprop_blocking_write(sk->conn, msg, msg_size)) - { - elog(WARNING, "Failed to send to node %s:%s in %s state: %s", - sk->host, sk->port, FormatSafekeeperState(sk->state), - walprop_error_message(sk->conn)); - ShutdownConnection(sk); - return false; - } - - sk->state = success_state; - - /* - * If the new state will be waiting for events to happen, update the event - * set to wait for those - */ - events = SafekeeperStateDesiredEvents(success_state); - if (events) - UpdateEventSet(sk, events); - - return true; -} - -/* - * Starts a write into the 'i'th safekeeper's postgres connection, moving to - * flush_state (adjusting eventset) if write still needs flushing. - * - * Returns false if sending is unfinished (requires flushing or conn failed). - * Upon failure, a warning is emitted and the connection is reset. 
- */ -static bool -AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_state) -{ - switch (walprop_async_write(sk->conn, msg, msg_size)) - { - case PG_ASYNC_WRITE_SUCCESS: - return true; - case PG_ASYNC_WRITE_TRY_FLUSH: - - /* - * We still need to call PQflush some more to finish the job; go - * to the appropriate state. Update the event set at the bottom of - * this function - */ - sk->state = flush_state; - UpdateEventSet(sk, WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE); - return false; - case PG_ASYNC_WRITE_FAIL: - elog(WARNING, "Failed to send to node %s:%s in %s state: %s", - sk->host, sk->port, FormatSafekeeperState(sk->state), - walprop_error_message(sk->conn)); - ShutdownConnection(sk); - return false; - default: - Assert(false); - return false; - } -} - -/* - * Flushes a previous call to AsyncWrite. This only needs to be called when the - * socket becomes read or write ready *after* calling AsyncWrite. - * - * If flushing successfully completes returns true, otherwise false. Event set - * is updated only if connection fails, otherwise caller should manually unset - * WL_SOCKET_WRITEABLE. 
- */ -static bool -AsyncFlush(Safekeeper *sk) -{ - /*--- - * PQflush returns: - * 0 if successful [we're good to move on] - * 1 if unable to send everything yet [call PQflush again] - * -1 if it failed [emit an error] - */ - switch (walprop_flush(sk->conn)) - { - case 0: - /* flush is done */ - return true; - case 1: - /* Nothing to do; try again when the socket's ready */ - return false; - case -1: - elog(WARNING, "Failed to flush write to node %s:%s in %s state: %s", - sk->host, sk->port, FormatSafekeeperState(sk->state), - walprop_error_message(sk->conn)); - ResetConnection(sk); - return false; - default: - Assert(false); - return false; - } -} diff --git a/src/backend/replication/walproposer_utils.c b/src/backend/replication/walproposer_utils.c deleted file mode 100644 index 3ad21feff10..00000000000 --- a/src/backend/replication/walproposer_utils.c +++ /dev/null @@ -1,404 +0,0 @@ -#include "postgres.h" - -#include "replication/walproposer.h" -#include "libpq/pqformat.h" -#include "common/logging.h" -#include "common/ip.h" -#include "../interfaces/libpq/libpq-fe.h" -#include -#include - -/* - * These variables are used similarly to openLogFile/SegNo, - * but for walproposer to write the XLOG during recovery. walpropFileTLI is the TimeLineID - * corresponding the filename of walpropFile. - */ -static int walpropFile = -1; -static TimeLineID walpropFileTLI = 0; -static XLogSegNo walpropSegNo = 0; - -int -CompareLsn(const void *a, const void *b) -{ - XLogRecPtr lsn1 = *((const XLogRecPtr *) a); - XLogRecPtr lsn2 = *((const XLogRecPtr *) b); - - if (lsn1 < lsn2) - return -1; - else if (lsn1 == lsn2) - return 0; - else - return 1; -} - -/* Returns a human-readable string corresonding to the SafekeeperState - * - * The string should not be freed. 
- * - * The strings are intended to be used as a prefix to "state", e.g.: - * - * elog(LOG, "currently in %s state", FormatSafekeeperState(sk->state)); - * - * If this sort of phrasing doesn't fit the message, instead use something like: - * - * elog(LOG, "currently in state [%s]", FormatSafekeeperState(sk->state)); - */ -char* -FormatSafekeeperState(SafekeeperState state) -{ - char* return_val = NULL; - - switch (state) - { - case SS_OFFLINE: - return_val = "offline"; - break; - case SS_CONNECTING_READ: - case SS_CONNECTING_WRITE: - return_val = "connecting"; - break; - case SS_WAIT_EXEC_RESULT: - return_val = "receiving query result"; - break; - case SS_HANDSHAKE_RECV: - return_val = "handshake (receiving)"; - break; - case SS_VOTING: - return_val = "voting"; - break; - case SS_WAIT_VERDICT: - return_val = "wait-for-verdict"; - break; - case SS_SEND_ELECTED_FLUSH: - return_val = "send-announcement-flush"; - break; - case SS_IDLE: - return_val = "idle"; - break; - case SS_ACTIVE: - return_val = "active"; - break; - } - - Assert(return_val != NULL); - - return return_val; -} - -/* Asserts that the provided events are expected for given safekeeper's state */ -void -AssertEventsOkForState(uint32 events, Safekeeper* sk) -{ - uint32 expected = SafekeeperStateDesiredEvents(sk->state); - - /* The events are in-line with what we're expecting, under two conditions: - * (a) if we aren't expecting anything, `events` has no read- or - * write-ready component. - * (b) if we are expecting something, there's overlap - * (i.e. `events & expected != 0`) - */ - bool events_ok_for_state; /* long name so the `Assert` is more clear later */ - - if (expected == WL_NO_EVENTS) - events_ok_for_state = ((events & (WL_SOCKET_READABLE|WL_SOCKET_WRITEABLE)) == 0); - else - events_ok_for_state = ((events & expected) != 0); - - if (!events_ok_for_state) - { - /* To give a descriptive message in the case of failure, we use elog and - * then an assertion that's guaranteed to fail. 
*/ - elog(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]", - FormatEvents(events), sk->host, sk->port, FormatSafekeeperState(sk->state)); - Assert(events_ok_for_state); - } -} - -/* Returns the set of events a safekeeper in this state should be waiting on - * - * This will return WL_NO_EVENTS (= 0) for some events. */ -uint32 -SafekeeperStateDesiredEvents(SafekeeperState state) -{ - uint32 result = WL_NO_EVENTS; - - /* If the state doesn't have a modifier, we can check the base state */ - switch (state) - { - /* Connecting states say what they want in the name */ - case SS_CONNECTING_READ: - result = WL_SOCKET_READABLE; - break; - case SS_CONNECTING_WRITE: - result = WL_SOCKET_WRITEABLE; - break; - - /* Reading states need the socket to be read-ready to continue */ - case SS_WAIT_EXEC_RESULT: - case SS_HANDSHAKE_RECV: - case SS_WAIT_VERDICT: - result = WL_SOCKET_READABLE; - break; - - /* Idle states use read-readiness as a sign that the connection has been - * disconnected. */ - case SS_VOTING: - case SS_IDLE: - result = WL_SOCKET_READABLE; - break; - - /* - * Flush states require write-ready for flushing. - * Active state does both reading and writing. - * - * TODO: SS_ACTIVE sometimes doesn't need to be write-ready. We should - * check sk->flushWrite here to set WL_SOCKET_WRITEABLE. - */ - case SS_SEND_ELECTED_FLUSH: - case SS_ACTIVE: - result = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE; - break; - - /* The offline state expects no events. */ - case SS_OFFLINE: - result = WL_NO_EVENTS; - break; - - default: - Assert(false); - break; - } - - return result; -} - -/* Returns a human-readable string corresponding to the event set - * - * If the events do not correspond to something set as the `events` field of a `WaitEvent`, the - * returned string may be meaingless. - * - * The string should not be freed. It should also not be expected to remain the same between - * function calls. 
*/ -char* -FormatEvents(uint32 events) -{ - static char return_str[8]; - - /* Helper variable to check if there's extra bits */ - uint32 all_flags = WL_LATCH_SET - | WL_SOCKET_READABLE - | WL_SOCKET_WRITEABLE - | WL_TIMEOUT - | WL_POSTMASTER_DEATH - | WL_EXIT_ON_PM_DEATH - | WL_SOCKET_CONNECTED; - - /* The formatting here isn't supposed to be *particularly* useful -- it's just to give an - * sense of what events have been triggered without needing to remember your powers of two. */ - - return_str[0] = (events & WL_LATCH_SET ) ? 'L' : '_'; - return_str[1] = (events & WL_SOCKET_READABLE ) ? 'R' : '_'; - return_str[2] = (events & WL_SOCKET_WRITEABLE) ? 'W' : '_'; - return_str[3] = (events & WL_TIMEOUT ) ? 'T' : '_'; - return_str[4] = (events & WL_POSTMASTER_DEATH) ? 'D' : '_'; - return_str[5] = (events & WL_EXIT_ON_PM_DEATH) ? 'E' : '_'; - return_str[5] = (events & WL_SOCKET_CONNECTED) ? 'C' : '_'; - - if (events & (~all_flags)) - { - elog(WARNING, "Event formatting found unexpected component %d", - events & (~all_flags)); - return_str[6] = '*'; - return_str[7] = '\0'; - } - else - return_str[6] = '\0'; - - return (char *) &return_str; -} - -/* - * Convert a character which represents a hexadecimal digit to an integer. - * - * Returns -1 if the character is not a hexadecimal digit. - */ -static int -HexDecodeChar(char c) -{ - if (c >= '0' && c <= '9') - return c - '0'; - if (c >= 'a' && c <= 'f') - return c - 'a' + 10; - if (c >= 'A' && c <= 'F') - return c - 'A' + 10; - - return -1; -} - -/* - * Decode a hex string into a byte string, 2 hex chars per byte. - * - * Returns false if invalid characters are encountered; otherwise true. 
- */ -bool -HexDecodeString(uint8 *result, char *input, int nbytes) -{ - int i; - - for (i = 0; i < nbytes; ++i) - { - int n1 = HexDecodeChar(input[i * 2]); - int n2 = HexDecodeChar(input[i * 2 + 1]); - - if (n1 < 0 || n2 < 0) - return false; - result[i] = n1 * 16 + n2; - } - - return true; -} - -/* -------------------------------- - * pq_getmsgint32_le - get a binary 4-byte int from a message buffer in native (LE) order - * -------------------------------- - */ -uint32 -pq_getmsgint32_le(StringInfo msg) -{ - uint32 n32; - - pq_copymsgbytes(msg, (char *) &n32, sizeof(n32)); - - return n32; -} - -/* -------------------------------- - * pq_getmsgint64 - get a binary 8-byte int from a message buffer in native (LE) order - * -------------------------------- - */ -uint64 -pq_getmsgint64_le(StringInfo msg) -{ - uint64 n64; - - pq_copymsgbytes(msg, (char *) &n64, sizeof(n64)); - - return n64; -} - -/* append a binary [u]int32 to a StringInfo buffer in native (LE) order */ -void -pq_sendint32_le(StringInfo buf, uint32 i) -{ - enlargeStringInfo(buf, sizeof(uint32)); - memcpy(buf->data + buf->len, &i, sizeof(uint32)); - buf->len += sizeof(uint32); -} - -/* append a binary [u]int64 to a StringInfo buffer in native (LE) order */ -void -pq_sendint64_le(StringInfo buf, uint64 i) -{ - enlargeStringInfo(buf, sizeof(uint64)); - memcpy(buf->data + buf->len, &i, sizeof(uint64)); - buf->len += sizeof(uint64); -} - -/* - * Write XLOG data to disk. - */ -void -XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr) -{ - int startoff; - int byteswritten; - - while (nbytes > 0) - { - int segbytes; - - /* Close the current segment if it's completed */ - if (walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size)) - XLogWalPropClose(recptr); - - if (walpropFile < 0) - { - // FIXME Is it ok to use hardcoded value here? 
- TimeLineID tli = 1; - /* Create/use new log file */ - XLByteToSeg(recptr, walpropSegNo, wal_segment_size); - // FIXME Is it ok to call XLogFileInit here? - // In neon we want to open existing file. - walpropFile = XLogFileInit(walpropSegNo, tli); - walpropFileTLI = tli; - } - - /* Calculate the start offset of the received logs */ - startoff = XLogSegmentOffset(recptr, wal_segment_size); - - if (startoff + nbytes > wal_segment_size) - segbytes = wal_segment_size - startoff; - else - segbytes = nbytes; - - /* OK to write the logs */ - errno = 0; - - byteswritten = pg_pwrite(walpropFile, buf, segbytes, (off_t) startoff); - if (byteswritten <= 0) - { - char xlogfname[MAXFNAMELEN]; - int save_errno; - - /* if write didn't set errno, assume no disk space */ - if (errno == 0) - errno = ENOSPC; - - save_errno = errno; - XLogFileName(xlogfname, walpropFileTLI, walpropSegNo, wal_segment_size); - errno = save_errno; - ereport(PANIC, - (errcode_for_file_access(), - errmsg("could not write to log segment %s " - "at offset %u, length %lu: %m", - xlogfname, startoff, (unsigned long) segbytes))); - } - - /* Update state for write */ - recptr += byteswritten; - - nbytes -= byteswritten; - buf += byteswritten; - } - - /* - * Close the current segment if it's fully written up in the last cycle of - * the loop. - */ - if (walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size)) - { - XLogWalPropClose(recptr); - } -} - -/* - * Close the current segment. 
- */ -void -XLogWalPropClose(XLogRecPtr recptr) -{ - Assert(walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size)); - - if (close(walpropFile) != 0) - { - char xlogfname[MAXFNAMELEN]; - XLogFileName(xlogfname, walpropFileTLI, walpropSegNo, wal_segment_size); - - ereport(PANIC, - (errcode_for_file_access(), - errmsg("could not close log segment %s: %m", - xlogfname))); - } - - walpropFile = -1; -} diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index dd115365a13..667f94e888f 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -75,7 +75,7 @@ #include "replication/slot.h" #include "replication/snapbuild.h" #include "replication/syncrep.h" -#include "replication/walproposer.h" +#include "replication/walpropshim.h" #include "replication/walreceiver.h" #include "replication/walsender.h" #include "replication/walsender_private.h" @@ -132,6 +132,11 @@ bool log_replication_commands = false; */ bool wake_wal_senders = false; +/* + * Backpressure hook, detecting how much we should delay. + */ +uint64 (*delay_backend_us)(void) = NULL; + /* * xlogreader used for replication. 
Note that a WAL sender doing physical * replication does not need xlogreader to read WAL, but it needs one to @@ -238,11 +243,10 @@ static void IdentifySystem(void); static void ReadReplicationSlot(ReadReplicationSlotCmd *cmd); static void CreateReplicationSlot(CreateReplicationSlotCmd *cmd); static void DropReplicationSlot(DropReplicationSlotCmd *cmd); -void StartReplication(StartReplicationCmd *cmd); +static void StartReplication(StartReplicationCmd *cmd); static void StartLogicalReplication(StartReplicationCmd *cmd); static void ProcessStandbyMessage(void); static void ProcessStandbyReplyMessage(void); -static void ProcessReplicationFeedbackMessage(void); static void ProcessStandbyHSFeedbackMessage(void); static void ProcessRepliesIfAny(void); static void ProcessPendingWrites(void); @@ -256,8 +260,6 @@ static void WalSndWriteData(LogicalDecodingContext *ctx, XLogRecPtr lsn, Transac static void WalSndUpdateProgress(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid, bool skipped_xact); static XLogRecPtr WalSndWaitForWal(XLogRecPtr loc); -static void LagTrackerWrite(XLogRecPtr lsn, TimestampTz local_flush_time); -static TimeOffset LagTrackerRead(int head, XLogRecPtr lsn, TimestampTz now); static bool TransactionIdInRecentPast(TransactionId xid, uint32 epoch); static void WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo, @@ -684,7 +686,7 @@ SendTimeLineHistory(TimeLineHistoryCmd *cmd) * At the moment, this never returns, but an ereport(ERROR) will take us back * to the main loop. 
*/ -void +static void StartReplication(StartReplicationCmd *cmd) { StringInfoData buf; @@ -819,14 +821,11 @@ StartReplication(StartReplicationCmd *cmd) WalSndSetState(WALSNDSTATE_CATCHUP); /* Send a CopyBothResponse message, and start streaming */ - if (!am_wal_proposer) - { - pq_beginmessage(&buf, 'W'); - pq_sendbyte(&buf, 0); - pq_sendint16(&buf, 0); - pq_endmessage(&buf); - pq_flush(); - } + pq_beginmessage(&buf, 'W'); + pq_sendbyte(&buf, 0); + pq_sendint16(&buf, 0); + pq_endmessage(&buf); + pq_flush(); /* * Don't allow a request to stream from a future point in WAL that @@ -1466,7 +1465,7 @@ ProcessPendingWrites(void) } /* Try to flush pending output to the client */ - if (!am_wal_proposer && pq_flush_if_writable() != 0) + if (pq_flush_if_writable() != 0) WalSndShutdown(); } @@ -1909,9 +1908,6 @@ ProcessRepliesIfAny(void) int r; bool received = false; - if (am_wal_proposer) - return; - last_processing = GetCurrentTimestamp(); /* @@ -2037,10 +2033,6 @@ ProcessStandbyMessage(void) ProcessStandbyHSFeedbackMessage(); break; - case 'z': - ProcessReplicationFeedbackMessage(); - break; - default: ereport(COMMERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), @@ -2113,28 +2105,6 @@ ProcessStandbyReplyMessage(void) LSN_FORMAT_ARGS(applyPtr)); } -// This message is a neon extension of postgres replication protocol -static void -ProcessReplicationFeedbackMessage(void) -{ - ReplicationFeedback rf; - - // consume message length - pq_getmsgint64(&reply_message); - - ParseReplicationFeedbackMessage(&reply_message, &rf); - - replication_feedback_set(&rf); - - SetZenithCurrentClusterSize(rf.currentClusterSize); - - ProcessStandbyReply(rf.ps_writelsn, - rf.ps_flushlsn, - rf.ps_applylsn, - rf.ps_replytime, - false); -} - void ProcessStandbyReply(XLogRecPtr writePtr, XLogRecPtr flushPtr, @@ -2218,13 +2188,6 @@ ProcessStandbyReply(XLogRecPtr writePtr, if (!am_cascading_walsender) SyncRepReleaseWaiters(); - /* - * walproposer use trunclateLsn instead of flushPtr for confirmed - * received 
location, so we shouldn't update restart_lsn here. - */ - if (am_wal_proposer) - return; - /* * Advance our local xmin horizon when the client confirmed a flush. */ @@ -2554,19 +2517,6 @@ WalSndLoop(WalSndSendDataCallback send_data) /* Check for input from the client */ ProcessRepliesIfAny(); - if (am_wal_proposer) - { - send_data(); - if (WalSndCaughtUp) - { - if (MyWalSnd->state == WALSNDSTATE_CATCHUP) - WalSndSetState(WALSNDSTATE_STREAMING); - WalProposerPoll(); - WalSndCaughtUp = false; - } - continue; - } - /* * If we have received CopyDone from the client, sent CopyDone * ourselves, and the output buffer is empty, it's time to exit @@ -3024,83 +2974,77 @@ XLogSendPhysical(void) nbytes = endptr - startptr; Assert(nbytes <= MAX_SEND_SIZE); - if (am_wal_proposer) - { - WalProposerBroadcast(startptr, endptr); - } + /* + * OK to read and send the slice. + */ + if (output_message.data) + resetStringInfo(&output_message); else - { - /* - * OK to read and send the slice. - */ - if (output_message.data) - resetStringInfo(&output_message); - else - initStringInfo(&output_message); - - pq_sendbyte(&output_message, 'w'); - pq_sendint64(&output_message, startptr); /* dataStart */ - pq_sendint64(&output_message, SendRqstPtr); /* walEnd */ - pq_sendint64(&output_message, 0); /* sendtime, filled in last */ + initStringInfo(&output_message); + + pq_sendbyte(&output_message, 'w'); + pq_sendint64(&output_message, startptr); /* dataStart */ + pq_sendint64(&output_message, SendRqstPtr); /* walEnd */ + pq_sendint64(&output_message, 0); /* sendtime, filled in last */ + + /* + * Read the log directly into the output buffer to avoid extra memcpy + * calls. + */ + enlargeStringInfo(&output_message, nbytes); + +retry: + if (!WALRead(xlogreader, + &output_message.data[output_message.len], + startptr, + nbytes, + xlogreader->seg.ws_tli, /* Pass the current TLI because + * only WalSndSegmentOpen controls + * whether new TLI is needed. 
*/ + &errinfo)) + WALReadRaiseError(&errinfo); - /* - * Read the log directly into the output buffer to avoid extra memcpy - * calls. - */ - enlargeStringInfo(&output_message, nbytes); - - retry: - if (!WALRead(xlogreader, - &output_message.data[output_message.len], - startptr, - nbytes, - xlogreader->seg.ws_tli, /* Pass the current TLI because - * only WalSndSegmentOpen controls - * whether new TLI is needed. */ - &errinfo)) - WALReadRaiseError(&errinfo); - - /* See logical_read_xlog_page(). */ - XLByteToSeg(startptr, segno, xlogreader->segcxt.ws_segsize); - CheckXLogRemoved(segno, xlogreader->seg.ws_tli); + /* See logical_read_xlog_page(). */ + XLByteToSeg(startptr, segno, xlogreader->segcxt.ws_segsize); + CheckXLogRemoved(segno, xlogreader->seg.ws_tli); - /* - * During recovery, the currently-open WAL file might be replaced with the - * file of the same name retrieved from archive. So we always need to - * check what we read was valid after reading into the buffer. If it's - * invalid, we try to open and read the file again. - */ - if (am_cascading_walsender) - { - WalSnd *walsnd = MyWalSnd; - bool reload; + /* + * During recovery, the currently-open WAL file might be replaced with the + * file of the same name retrieved from archive. So we always need to + * check what we read was valid after reading into the buffer. If it's + * invalid, we try to open and read the file again. 
+ */ + if (am_cascading_walsender) + { + WalSnd *walsnd = MyWalSnd; + bool reload; - SpinLockAcquire(&walsnd->mutex); - reload = walsnd->needreload; - walsnd->needreload = false; - SpinLockRelease(&walsnd->mutex); + SpinLockAcquire(&walsnd->mutex); + reload = walsnd->needreload; + walsnd->needreload = false; + SpinLockRelease(&walsnd->mutex); - if (reload && xlogreader->seg.ws_file >= 0) - { - wal_segment_close(xlogreader); + if (reload && xlogreader->seg.ws_file >= 0) + { + wal_segment_close(xlogreader); - goto retry; - } + goto retry; } + } - output_message.len += nbytes; - output_message.data[output_message.len] = '\0'; + output_message.len += nbytes; + output_message.data[output_message.len] = '\0'; - /* - * Fill the send timestamp last, so that it is taken as late as possible. - */ - resetStringInfo(&tmpbuf); - pq_sendint64(&tmpbuf, GetCurrentTimestamp()); - memcpy(&output_message.data[1 + sizeof(int64) + sizeof(int64)], - tmpbuf.data, sizeof(int64)); + /* + * Fill the send timestamp last, so that it is taken as late as possible. + */ + resetStringInfo(&tmpbuf); + pq_sendint64(&tmpbuf, GetCurrentTimestamp()); + memcpy(&output_message.data[1 + sizeof(int64) + sizeof(int64)], + tmpbuf.data, sizeof(int64)); + + pq_putmessage_noblock('d', output_message.data, output_message.len); - pq_putmessage_noblock('d', output_message.data, output_message.len); - } sentPtr = endptr; /* Update shared memory status */ @@ -3793,7 +3737,7 @@ WalSndKeepaliveIfNecessary(void) * eventually reported to have been written, flushed and applied by the * standby in a reply message. */ -static void +void LagTrackerWrite(XLogRecPtr lsn, TimestampTz local_flush_time) { bool buffer_full; @@ -3858,7 +3802,7 @@ LagTrackerWrite(XLogRecPtr lsn, TimestampTz local_flush_time) * Return -1 if no new sample data is available, and otherwise the elapsed * time in microseconds. 
*/ -static TimeOffset +TimeOffset LagTrackerRead(int head, XLogRecPtr lsn, TimestampTz now) { TimestampTz time = 0; @@ -3955,79 +3899,3 @@ LagTrackerRead(int head, XLogRecPtr lsn, TimestampTz now) Assert(time != 0); return now - time; } - -/* - * Get minimal write and flush LSN among all live replicas - */ -void -GetMinReplicaLsn(XLogRecPtr* write_lsn, XLogRecPtr* flush_lsn, XLogRecPtr* apply_lsn) -{ - XLogRecPtr min_write_lsn = UnknownXLogRecPtr; - XLogRecPtr min_flush_lsn = UnknownXLogRecPtr; - XLogRecPtr min_apply_lsn = UnknownXLogRecPtr; - for (int i = 0; i < max_wal_senders; i++) - { - WalSnd *walsnd = &WalSndCtl->walsnds[i]; - if (walsnd->state == WALSNDSTATE_STREAMING) - { - /* - * We assume that reads from walsnd->write/flush are atomic - * on all modern x64 systems, as these fields are uint64 and - * should be 8-bytes aligned. - */ - XLogRecPtr written = walsnd->write; - XLogRecPtr flushed = walsnd->flush; - XLogRecPtr applied = walsnd->apply; - min_write_lsn = Min(written, min_write_lsn); - min_flush_lsn = Min(flushed, min_flush_lsn); - min_apply_lsn = Min(applied, min_apply_lsn); - } - } - *write_lsn = min_write_lsn; - *flush_lsn = min_flush_lsn; - *apply_lsn = min_apply_lsn; -} - -// Check if we need to suspend inserts because of lagging replication. 
-uint64 -backpressure_lag(void) -{ - if (max_replication_apply_lag > 0 || max_replication_flush_lag > 0 || max_replication_write_lag > 0) - { - XLogRecPtr writePtr; - XLogRecPtr flushPtr; - XLogRecPtr applyPtr; - XLogRecPtr myFlushLsn = GetFlushRecPtr(NULL); - - replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr); - #define MB ((XLogRecPtr)1024*1024) - - elog(DEBUG2, "current flushLsn %X/%X ReplicationFeedback: write %X/%X flush %X/%X apply %X/%X", - LSN_FORMAT_ARGS(myFlushLsn), - LSN_FORMAT_ARGS(writePtr), - LSN_FORMAT_ARGS(flushPtr), - LSN_FORMAT_ARGS(applyPtr)); - - if ((writePtr != UnknownXLogRecPtr - && max_replication_write_lag > 0 - && myFlushLsn > writePtr + max_replication_write_lag*MB)) - { - return (myFlushLsn - writePtr - max_replication_write_lag*MB); - } - - if ((flushPtr != UnknownXLogRecPtr - && max_replication_flush_lag > 0 - && myFlushLsn > flushPtr + max_replication_flush_lag*MB)) - { - return (myFlushLsn - flushPtr - max_replication_flush_lag*MB); - } - - if ((applyPtr != UnknownXLogRecPtr - && max_replication_apply_lag > 0 - && myFlushLsn > applyPtr + max_replication_apply_lag*MB)) - { - return (myFlushLsn - applyPtr - max_replication_apply_lag*MB); - } - } - return 0; -} diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 808f3426851..464196d803c 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -806,8 +806,7 @@ ReadBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum, bool hit; SMgrRelation smgr = smgropen(rnode, InvalidBackendId, - permanent ? RELPERSISTENCE_PERMANENT : - RELPERSISTENCE_UNLOGGED); + RELPERSISTENCE_PERMANENT); return ReadBuffer_common(smgr, permanent ? 
RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED, forkNum, blockNum, diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index 54bb3cdbcae..1a6f5270518 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -36,7 +36,6 @@ #include "replication/slot.h" #include "replication/walreceiver.h" #include "replication/walsender.h" -#include "replication/walproposer.h" #include "storage/bufmgr.h" #include "storage/dsm.h" #include "storage/ipc.h" @@ -142,7 +141,6 @@ CalculateShmemSize(int *num_semaphores) size = add_size(size, SyncScanShmemSize()); size = add_size(size, AsyncShmemSize()); size = add_size(size, StatsShmemSize()); - size = add_size(size, WalproposerShmemSize()); #ifdef EXEC_BACKEND size = add_size(size, ShmemBackendArraySize()); #endif @@ -296,8 +294,6 @@ CreateSharedMemoryAndSemaphores(void) AsyncShmemInit(); StatsShmemInit(); - WalproposerShmemInit(); - #ifdef EXEC_BACKEND /* diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 20e4d4b0a85..dff17874c2c 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -3426,15 +3426,20 @@ ProcessInterrupts(void) { ProcessInterrupts_pg(); - // Suspend writers until replicas catch up - lag = backpressure_lag(); - if (lag <= 0) - break; + if (delay_backend_us != NULL) + { + // Suspend writers until replicas catch up + lag = delay_backend_us(); + if (lag <= 0) + break; - set_ps_display("backpressure throttling"); + set_ps_display("backpressure throttling"); - elog(DEBUG2, "backpressure throttling: lag %lu", lag); - pg_usleep(BACK_PRESSURE_DELAY); + elog(DEBUG2, "backpressure throttling: lag %lu", lag); + pg_usleep(BACK_PRESSURE_DELAY); + } + else + break; } } diff --git a/src/backend/tcop/zenith_wal_redo.c b/src/backend/tcop/zenith_wal_redo.c index 02ca133d37a..190eeb71fb9 100644 --- a/src/backend/tcop/zenith_wal_redo.c +++ b/src/backend/tcop/zenith_wal_redo.c @@ -72,6 +72,7 @@ #include "access/xlog.h" #include 
"access/xlog_internal.h" #include "access/xlogutils.h" +#include "access/xlogrecovery.h" #include "catalog/pg_class.h" #include "libpq/libpq.h" #include "libpq/pqformat.h" diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 91c4312250d..c06b957a785 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -40,8 +40,8 @@ #include "access/transam.h" #include "access/twophase.h" #include "access/xact.h" -#include "access/xlog_internal.h" #include "access/xloginsert.h" +#include "access/xlog_internal.h" #include "access/xlogprefetcher.h" #include "access/xlogrecovery.h" #include "catalog/namespace.h" @@ -86,7 +86,7 @@ #include "replication/syncrep.h" #include "replication/walreceiver.h" #include "replication/walsender.h" -#include "replication/walproposer.h" +#include "replication/walpropshim.h" #include "storage/bufmgr.h" #include "storage/dsm_impl.h" #include "storage/fd.h" @@ -2361,28 +2361,6 @@ static struct config_int ConfigureNamesInt[] = NULL, NULL, NULL }, - { - {"wal_acceptor_reconnect", PGC_SIGHUP, REPLICATION_STANDBY, - gettext_noop("Timeout for reconnecting to offline wal acceptor."), - NULL, - GUC_UNIT_MS - }, - &wal_acceptor_reconnect_timeout, - 1000, 0, INT_MAX, - NULL, NULL, NULL - }, - - { - {"wal_acceptor_connect_timeout", PGC_SIGHUP, REPLICATION_STANDBY, - gettext_noop("Timeout after which give up connection attempt to safekeeper."), - NULL, - GUC_UNIT_MS - }, - &wal_acceptor_connect_timeout, - 5000, 0, INT_MAX, - NULL, NULL, NULL - }, - { {"max_connections", PGC_POSTMASTER, CONN_AUTH_SETTINGS, gettext_noop("Sets the maximum number of concurrent connections."), @@ -4783,17 +4761,6 @@ static struct config_string ConfigureNamesString[] = check_backtrace_functions, assign_backtrace_functions, NULL }, - { - {"safekeepers", PGC_POSTMASTER, UNGROUPED, - gettext_noop("List of Neon WAL acceptors (host:port)"), - NULL, - GUC_LIST_INPUT | GUC_LIST_QUOTE - }, - &wal_acceptors_list, - "", - NULL, NULL, NULL - }, - 
/* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, NULL, NULL, NULL, NULL diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index d28c20fe0a1..97c7d560db1 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -102,7 +102,6 @@ extern PGDLLIMPORT int wal_level; #define XLogArchivingAlways() \ (AssertMacro(XLogArchiveMode == ARCHIVE_MODE_OFF || wal_level >= WAL_LEVEL_REPLICA), XLogArchiveMode == ARCHIVE_MODE_ALWAYS) - /* * Is WAL-logging necessary for archival or log-shipping, or can we skip * WAL-logging if we fsync() the data before committing instead? @@ -217,7 +216,6 @@ extern void XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn); extern void xlog_redo(XLogReaderState *record); extern void xlog_desc(StringInfo buf, XLogReaderState *record); extern const char *xlog_identify(uint8 info); -extern void xlog_outdesc(StringInfo buf, XLogReaderState *record); extern void issue_xlog_fsync(int fd, XLogSegNo segno, TimeLineID tli); diff --git a/src/include/access/xlogdefs.h b/src/include/access/xlogdefs.h index cc8b5493bb9..a47e3eeb1f5 100644 --- a/src/include/access/xlogdefs.h +++ b/src/include/access/xlogdefs.h @@ -28,14 +28,6 @@ typedef uint64 XLogRecPtr; #define InvalidXLogRecPtr 0 #define XLogRecPtrIsInvalid(r) ((r) == InvalidXLogRecPtr) -/* - * Maximum possible XLogRecPtr value. Currently used by back pressure - * mechanism to distinguish the unknown replica flush/write position. - * This significantly simplifies comparison and checks as we always - * look for the minimal value. - */ -#define UnknownXLogRecPtr ((XLogRecPtr) ~0) - /* * First LSN to use for "fake" LSNs. 
* diff --git a/src/include/replication/walproposer.h b/src/include/replication/walproposer.h deleted file mode 100644 index c5a5b76268e..00000000000 --- a/src/include/replication/walproposer.h +++ /dev/null @@ -1,565 +0,0 @@ -#ifndef __WALPROPOSER_H__ -#define __WALPROPOSER_H__ - -#include "access/xlogdefs.h" -#include "postgres.h" -#include "port.h" -#include "access/xlog_internal.h" -#include "access/transam.h" -#include "nodes/replnodes.h" -#include "utils/uuid.h" -#include "replication/walreceiver.h" - -#define SK_MAGIC 0xCafeCeefu -#define SK_PROTOCOL_VERSION 2 - -#define MAX_SAFEKEEPERS 32 -#define MAX_SEND_SIZE (XLOG_BLCKSZ * 16) /* max size of a single WAL message */ -#define XLOG_HDR_SIZE (1+8*3) /* 'w' + startPos + walEnd + timestamp */ -#define XLOG_HDR_START_POS 1 /* offset of start position in wal sender message header */ -#define XLOG_HDR_END_POS (1+8) /* offset of end position in wal sender message header */ - -/* - * In the spirit of WL_SOCKET_READABLE and others, this corresponds to no events having occured, - * because all WL_* events are given flags equal to some (1 << i), starting from i = 0 - */ -#define WL_NO_EVENTS 0 - -extern char* wal_acceptors_list; -extern int wal_acceptor_reconnect_timeout; -extern int wal_acceptor_connect_timeout; -extern bool am_wal_proposer; - -struct WalProposerConn; /* Defined in libpqwalproposer */ -typedef struct WalProposerConn WalProposerConn; - -struct WalMessage; -typedef struct WalMessage WalMessage; - -extern char *zenith_timeline_walproposer; -extern char *zenith_tenant_walproposer; -extern char *zenith_pageserver_connstring_walproposer; - -/* Possible return values from ReadPGAsync */ -typedef enum -{ - /* The full read was successful. buf now points to the data */ - PG_ASYNC_READ_SUCCESS, - /* The read is ongoing. Wait until the connection is read-ready, then try - * again. */ - PG_ASYNC_READ_TRY_AGAIN, - /* Reading failed. 
Check PQerrorMessage(conn) */ - PG_ASYNC_READ_FAIL, -} PGAsyncReadResult; - -/* Possible return values from WritePGAsync */ -typedef enum -{ - /* The write fully completed */ - PG_ASYNC_WRITE_SUCCESS, - /* The write started, but you'll need to call PQflush some more times - * to finish it off. We just tried, so it's best to wait until the - * connection is read- or write-ready to try again. - * - * If it becomes read-ready, call PQconsumeInput and flush again. If it - * becomes write-ready, just call PQflush. - */ - PG_ASYNC_WRITE_TRY_FLUSH, - /* Writing failed. Check PQerrorMessage(conn) */ - PG_ASYNC_WRITE_FAIL, -} PGAsyncWriteResult; - -/* - * WAL safekeeper state, which is used to wait for some event. - * - * States are listed here in the order that they're executed. - * - * Most states, upon failure, will move back to SS_OFFLINE by calls to - * ResetConnection or ShutdownConnection. - */ -typedef enum -{ - /* - * Does not have an active connection and will stay that way until - * further notice. - * - * Moves to SS_CONNECTING_WRITE by calls to ResetConnection. - */ - SS_OFFLINE, - - /* - * Connecting states. "_READ" waits for the socket to be available for - * reading, "_WRITE" waits for writing. There's no difference in the code - * they execute when polled, but we have this distinction in order to - * recreate the event set in HackyRemoveWalProposerEvent. - * - * After the connection is made, "START_WAL_PUSH" query is sent. - */ - SS_CONNECTING_WRITE, - SS_CONNECTING_READ, - - /* - * Waiting for the result of the "START_WAL_PUSH" command. - * - * After we get a successful result, sends handshake to safekeeper. - */ - SS_WAIT_EXEC_RESULT, - - /* - * Executing the receiving half of the handshake. After receiving, moves to - * SS_VOTING. - */ - SS_HANDSHAKE_RECV, - - /* - * Waiting to participate in voting, but a quorum hasn't yet been reached. - * This is an idle state - we do not expect AdvancePollState to be called. 
- * - * Moved externally by execution of SS_HANDSHAKE_RECV, when we received a - * quorum of handshakes. - */ - SS_VOTING, - - /* - * Already sent voting information, waiting to receive confirmation from the - * node. After receiving, moves to SS_IDLE, if the quorum isn't reached yet. - */ - SS_WAIT_VERDICT, - - /* Need to flush ProposerElected message. */ - SS_SEND_ELECTED_FLUSH, - - /* - * Waiting for quorum to send WAL. Idle state. If the socket becomes - * read-ready, the connection has been closed. - * - * Moves to SS_ACTIVE only by call to StartStreaming. - */ - SS_IDLE, - - /* - * Active phase, when we acquired quorum and have WAL to send or feedback - * to read. - */ - SS_ACTIVE, -} SafekeeperState; - -/* Consensus logical timestamp. */ -typedef uint64 term_t; - -/* neon storage node id */ -typedef uint64 NNodeId; - -/* - * Proposer <-> Acceptor messaging. - */ - -/* Initial Proposer -> Acceptor message */ -typedef struct ProposerGreeting -{ - uint64 tag; /* message tag */ - uint32 protocolVersion; /* proposer-safekeeper protocol version */ - uint32 pgVersion; - pg_uuid_t proposerId; - uint64 systemId; /* Postgres system identifier */ - uint8 ztimelineid[16]; /* Zenith timeline id */ - uint8 ztenantid[16]; - TimeLineID timeline; - uint32 walSegSize; -} ProposerGreeting; - -typedef struct AcceptorProposerMessage -{ - uint64 tag; -} AcceptorProposerMessage; - -/* - * Acceptor -> Proposer initial response: the highest term acceptor voted for. - */ -typedef struct AcceptorGreeting -{ - AcceptorProposerMessage apm; - term_t term; - NNodeId nodeId; -} AcceptorGreeting; - -/* - * Proposer -> Acceptor vote request. - */ -typedef struct VoteRequest -{ - uint64 tag; - term_t term; - pg_uuid_t proposerId; /* for monitoring/debugging */ -} VoteRequest; - -/* Element of term switching chain. 
*/ -typedef struct TermSwitchEntry -{ - term_t term; - XLogRecPtr lsn; -} TermSwitchEntry; - -typedef struct TermHistory -{ - uint32 n_entries; - TermSwitchEntry *entries; -} TermHistory; - -/* Vote itself, sent from safekeeper to proposer */ -typedef struct VoteResponse { - AcceptorProposerMessage apm; - term_t term; - uint64 voteGiven; - /* - * Safekeeper flush_lsn (end of WAL) + history of term switches allow - * proposer to choose the most advanced one. - */ - XLogRecPtr flushLsn; - XLogRecPtr truncateLsn; /* minimal LSN which may be needed for recovery of some safekeeper */ - TermHistory termHistory; - XLogRecPtr timelineStartLsn; /* timeline globally starts at this LSN */ -} VoteResponse; - -/* - * Proposer -> Acceptor message announcing proposer is elected and communicating - * epoch history to it. - */ -typedef struct ProposerElected -{ - uint64 tag; - term_t term; - /* proposer will send since this point */ - XLogRecPtr startStreamingAt; - /* history of term switches up to this proposer */ - TermHistory *termHistory; - /* timeline globally starts at this LSN */ - XLogRecPtr timelineStartLsn; -} ProposerElected; - -/* - * Header of request with WAL message sent from proposer to safekeeper. - */ -typedef struct AppendRequestHeader -{ - uint64 tag; - term_t term; /* term of the proposer */ - /* - * LSN since which current proposer appends WAL (begin_lsn of its first - * record); determines epoch switch point. 
- */ - XLogRecPtr epochStartLsn; - XLogRecPtr beginLsn; /* start position of message in WAL */ - XLogRecPtr endLsn; /* end position of message in WAL */ - XLogRecPtr commitLsn; /* LSN committed by quorum of safekeepers */ - /* - * minimal LSN which may be needed for recovery of some safekeeper (end lsn - * + 1 of last chunk streamed to everyone) - */ - XLogRecPtr truncateLsn; - pg_uuid_t proposerId; /* for monitoring/debugging */ -} AppendRequestHeader; - -/* - * Hot standby feedback received from replica - */ -typedef struct HotStandbyFeedback -{ - TimestampTz ts; - FullTransactionId xmin; - FullTransactionId catalog_xmin; -} HotStandbyFeedback; - - -typedef struct ReplicationFeedback -{ - // current size of the timeline on pageserver - uint64 currentClusterSize; - // standby_status_update fields that safekeeper received from pageserver - XLogRecPtr ps_writelsn; - XLogRecPtr ps_flushlsn; - XLogRecPtr ps_applylsn; - TimestampTz ps_replytime; -} ReplicationFeedback; - - -typedef struct WalproposerShmemState -{ - slock_t mutex; - ReplicationFeedback feedback; - term_t mineLastElectedTerm; -} WalproposerShmemState; - -/* - * Report safekeeper state to proposer - */ -typedef struct AppendResponse -{ - AcceptorProposerMessage apm; - /* - * Current term of the safekeeper; if it is higher than proposer's, the - * compute is out of date. - */ - term_t term; - // TODO: add comment - XLogRecPtr flushLsn; - // Safekeeper reports back his awareness about which WAL is committed, as - // this is a criterion for walproposer --sync mode exit - XLogRecPtr commitLsn; - HotStandbyFeedback hs; - // Feedback recieved from pageserver includes standby_status_update fields - // and custom zenith feedback. - // This part of the message is extensible. 
- ReplicationFeedback rf; -} AppendResponse; - -// ReplicationFeedback is extensible part of the message that is parsed separately -// Other fields are fixed part -#define APPENDRESPONSE_FIXEDPART_SIZE offsetof(AppendResponse, rf) - - -/* - * Descriptor of safekeeper - */ -typedef struct Safekeeper -{ - char const* host; - char const* port; - char conninfo[MAXCONNINFO]; /* connection info for connecting/reconnecting */ - - /* - * postgres protocol connection to the WAL acceptor - * - * Equals NULL only when state = SS_OFFLINE. Nonblocking is set once we - * reach SS_ACTIVE; not before. - */ - WalProposerConn* conn; - /* - * Temporary buffer for the message being sent to the safekeeper. - */ - StringInfoData outbuf; - /* - * WAL reader, allocated for each safekeeper. - */ - XLogReaderState* xlogreader; - - /* - * Streaming will start here; must be record boundary. - */ - XLogRecPtr startStreamingAt; - - bool flushWrite; /* set to true if we need to call AsyncFlush, to flush pending messages */ - XLogRecPtr streamingAt; /* current streaming position */ - AppendRequestHeader appendRequest; /* request for sending to safekeeper */ - - int eventPos; /* position in wait event set. 
Equal to -1 if no event */ - SafekeeperState state; /* safekeeper state machine state */ - TimestampTz startedConnAt; /* when connection attempt started */ - AcceptorGreeting greetResponse; /* acceptor greeting */ - VoteResponse voteResponse; /* the vote */ - AppendResponse appendResponse; /* feedback for master */ -} Safekeeper; - - -int CompareLsn(const void *a, const void *b); -char* FormatSafekeeperState(SafekeeperState state); -void AssertEventsOkForState(uint32 events, Safekeeper* sk); -uint32 SafekeeperStateDesiredEvents(SafekeeperState state); -char* FormatEvents(uint32 events); -void WalProposerMain(Datum main_arg); -void WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos); -bool HexDecodeString(uint8 *result, char *input, int nbytes); -uint32 pq_getmsgint32_le(StringInfo msg); -uint64 pq_getmsgint64_le(StringInfo msg); -void pq_sendint32_le(StringInfo buf, uint32 i); -void pq_sendint64_le(StringInfo buf, uint64 i); -void WalProposerPoll(void); -void WalProposerRegister(void); -void XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr); -void XLogWalPropClose(XLogRecPtr recptr); -void ProcessStandbyReply(XLogRecPtr writePtr, - XLogRecPtr flushPtr, - XLogRecPtr applyPtr, - TimestampTz replyTime, - bool replyRequested); -void PhysicalConfirmReceivedLocation(XLogRecPtr lsn); -void ProcessStandbyHSFeedback(TimestampTz replyTime, - TransactionId feedbackXmin, - uint32 feedbackEpoch, - TransactionId feedbackCatalogXmin, - uint32 feedbackCatalogEpoch); -void ParseReplicationFeedbackMessage(StringInfo reply_message, - ReplicationFeedback *rf); -void StartReplication(StartReplicationCmd *cmd); -void WalProposerSync(int argc, char *argv[]); - -Size WalproposerShmemSize(void); -bool WalproposerShmemInit(void); -void replication_feedback_set(ReplicationFeedback *rf); -void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn); - -/* libpqwalproposer hooks & helper type */ - -/* Re-exported 
PostgresPollingStatusType */ -typedef enum -{ - WP_CONN_POLLING_FAILED = 0, - WP_CONN_POLLING_READING, - WP_CONN_POLLING_WRITING, - WP_CONN_POLLING_OK, - /* - * 'libpq-fe.h' still has PGRES_POLLING_ACTIVE, but says it's unused. - * We've removed it here to avoid clutter. - */ -} WalProposerConnectPollStatusType; - -/* Re-exported and modified ExecStatusType */ -typedef enum -{ - /* We received a single CopyBoth result */ - WP_EXEC_SUCCESS_COPYBOTH, - /* Any success result other than a single CopyBoth was received. The specifics of the result - * were already logged, but it may be useful to provide an error message indicating which - * safekeeper messed up. - * - * Do not expect PQerrorMessage to be appropriately set. */ - WP_EXEC_UNEXPECTED_SUCCESS, - /* No result available at this time. Wait until read-ready, then call again. Internally, this is - * returned when PQisBusy indicates that PQgetResult would block. */ - WP_EXEC_NEEDS_INPUT, - /* Catch-all failure. Check PQerrorMessage. */ - WP_EXEC_FAILED, -} WalProposerExecStatusType; - -/* Re-exported ConnStatusType */ -typedef enum -{ - WP_CONNECTION_OK, - WP_CONNECTION_BAD, - - /* - * The original ConnStatusType has many more tags, but requests that - * they not be relied upon (except for displaying to the user). We - * don't need that extra functionality, so we collect them into a - * single tag here. 
- */ - WP_CONNECTION_IN_PROGRESS, -} WalProposerConnStatusType; - -/* Re-exported PQerrorMessage */ -typedef char* (*walprop_error_message_fn) (WalProposerConn* conn); - -/* Re-exported PQstatus */ -typedef WalProposerConnStatusType (*walprop_status_fn) (WalProposerConn* conn); - -/* Re-exported PQconnectStart */ -typedef WalProposerConn* (*walprop_connect_start_fn) (char* conninfo); - -/* Re-exported PQconectPoll */ -typedef WalProposerConnectPollStatusType (*walprop_connect_poll_fn) (WalProposerConn* conn); - -/* Blocking wrapper around PQsendQuery */ -typedef bool (*walprop_send_query_fn) (WalProposerConn* conn, char* query); - -/* Wrapper around PQconsumeInput + PQisBusy + PQgetResult */ -typedef WalProposerExecStatusType (*walprop_get_query_result_fn) (WalProposerConn* conn); - -/* Re-exported PQsocket */ -typedef pgsocket (*walprop_socket_fn) (WalProposerConn* conn); - -/* Wrapper around PQconsumeInput (if socket's read-ready) + PQflush */ -typedef int (*walprop_flush_fn) (WalProposerConn* conn); - -/* Re-exported PQfinish */ -typedef void (*walprop_finish_fn) (WalProposerConn* conn); - -/* - * Ergonomic wrapper around PGgetCopyData - * - * Reads a CopyData block from a safekeeper, setting *amount to the number - * of bytes returned. - * - * This function is allowed to assume certain properties specific to the - * protocol with the safekeepers, so it should not be used as-is for any - * other purpose. - * - * Note: If possible, using is generally preferred, because it - * performs a bit of extra checking work that's always required and is normally - * somewhat verbose. - */ -typedef PGAsyncReadResult (*walprop_async_read_fn) (WalProposerConn* conn, - char** buf, - int* amount); - -/* - * Ergonomic wrapper around PQputCopyData + PQflush - * - * Starts to write a CopyData block to a safekeeper. - * - * For information on the meaning of return codes, refer to PGAsyncWriteResult. 
- */ -typedef PGAsyncWriteResult (*walprop_async_write_fn) (WalProposerConn* conn, - void const* buf, - size_t size); - -/* - * Blocking equivalent to walprop_async_write_fn - * - * Returns 'true' if successful, 'false' on failure. - */ -typedef bool (*walprop_blocking_write_fn) (WalProposerConn* conn, void const* buf, size_t size); - -/* All libpqwalproposer exported functions collected together. */ -typedef struct WalProposerFunctionsType -{ - walprop_error_message_fn walprop_error_message; - walprop_status_fn walprop_status; - walprop_connect_start_fn walprop_connect_start; - walprop_connect_poll_fn walprop_connect_poll; - walprop_send_query_fn walprop_send_query; - walprop_get_query_result_fn walprop_get_query_result; - walprop_socket_fn walprop_socket; - walprop_flush_fn walprop_flush; - walprop_finish_fn walprop_finish; - walprop_async_read_fn walprop_async_read; - walprop_async_write_fn walprop_async_write; - walprop_blocking_write_fn walprop_blocking_write; -} WalProposerFunctionsType; - -/* Allow the above functions to be "called" with normal syntax */ -#define walprop_error_message(conn) \ - WalProposerFunctions->walprop_error_message(conn) -#define walprop_status(conn) \ - WalProposerFunctions->walprop_status(conn) -#define walprop_connect_start(conninfo) \ - WalProposerFunctions->walprop_connect_start(conninfo) -#define walprop_connect_poll(conn) \ - WalProposerFunctions->walprop_connect_poll(conn) -#define walprop_send_query(conn, query) \ - WalProposerFunctions->walprop_send_query(conn, query) -#define walprop_get_query_result(conn) \ - WalProposerFunctions->walprop_get_query_result(conn) -#define walprop_set_nonblocking(conn, arg) \ - WalProposerFunctions->walprop_set_nonblocking(conn, arg) -#define walprop_socket(conn) \ - WalProposerFunctions->walprop_socket(conn) -#define walprop_flush(conn) \ - WalProposerFunctions->walprop_flush(conn) -#define walprop_finish(conn) \ - WalProposerFunctions->walprop_finish(conn) -#define walprop_async_read(conn, 
buf, amount) \ - WalProposerFunctions->walprop_async_read(conn, buf, amount) -#define walprop_async_write(conn, buf, size) \ - WalProposerFunctions->walprop_async_write(conn, buf, size) -#define walprop_blocking_write(conn, buf, size) \ - WalProposerFunctions->walprop_blocking_write(conn, buf, size) - -/* - * The runtime location of the libpqwalproposer functions. - * - * This pointer is set by the initializer in libpqwalproposer, so that we - * can use it later. - */ -extern PGDLLIMPORT WalProposerFunctionsType *WalProposerFunctions; - -#endif diff --git a/src/include/replication/walpropshim.h b/src/include/replication/walpropshim.h new file mode 100644 index 00000000000..07757580cc9 --- /dev/null +++ b/src/include/replication/walpropshim.h @@ -0,0 +1,19 @@ +/* + * walpropshim.h + * various hooks for the walproposer component of the Neon extension. + */ + +#ifndef __WALPROPOSER_H__ +#define __WALPROPOSER_H__ + +/* + * Set to true only in standalone run of `postgres --sync-safekeepers`. + * See also the top comment in contrib/neon/walproposer.c + */ +extern PGDLLIMPORT bool syncSafekeepers; +extern PGDLLIMPORT void (*WalProposerInit) (XLogRecPtr flushRecPtr, uint64 systemId); +extern PGDLLIMPORT void (*WalProposerStart) (void); + +void WalProposerSync(int argc, char *argv[]); + +#endif diff --git a/src/include/replication/walsender.h b/src/include/replication/walsender.h index 599687ec428..08c3ff62603 100644 --- a/src/include/replication/walsender.h +++ b/src/include/replication/walsender.h @@ -48,8 +48,25 @@ extern void WalSndInitStopping(void); extern void WalSndWaitStopping(void); extern void HandleWalSndInitStopping(void); extern void WalSndRqstFileReload(void); -extern void GetMinReplicaLsn(XLogRecPtr* write, XLogRecPtr* flush, XLogRecPtr* apply); -extern uint64 backpressure_lag(void); + +/* + * Hook to check for WAL receiving backpressure. 
+ * Return value in microseconds */ +extern uint64 (*delay_backend_us)(void); + +/* expose these so that they can be reused by the neon walproposer extension */ +extern void LagTrackerWrite(XLogRecPtr lsn, TimestampTz local_flush_time); +extern TimeOffset LagTrackerRead(int head, XLogRecPtr lsn, TimestampTz now); +extern void ProcessStandbyReply(XLogRecPtr writePtr, XLogRecPtr flushPtr, + XLogRecPtr applyPtr, TimestampTz replyTime, + bool replyRequested); +void PhysicalConfirmReceivedLocation(XLogRecPtr lsn); +void ProcessStandbyHSFeedback(TimestampTz replyTime, + TransactionId feedbackXmin, + uint32 feedbackEpoch, + TransactionId feedbackCatalogXmin, + uint32 feedbackCatalogEpoch); + /* * Remember that we want to wakeup walsenders later * From e06d8c7a3bf240cb1d4f133e012f6221f978097d Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 25 Aug 2022 01:32:51 +0300 Subject: [PATCH 05/56] Prevent access to uninitialized shared memory in InstallXLogFileSegment, which is used for safekeepers-sync --- src/backend/access/transam/xlog.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 6a5f9331b0b..8167f92ad07 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -3314,13 +3314,16 @@ InstallXLogFileSegment(XLogSegNo *segno, char *tmppath, XLogFilePath(path, tli, *segno, wal_segment_size); - LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); - if (!XLogCtl->InstallXLogFileSegmentActive) + if (XLogCtl) { - LWLockRelease(ControlFileLock); - return false; + /* Neon: in case of sync-safekeepers shared memory is not initialized */ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + if (!XLogCtl->InstallXLogFileSegmentActive) + { + LWLockRelease(ControlFileLock); + return false; + } } - if (!find_free) { /* Force installation: get rid of any pre-existing segment file */ @@ -3334,7 +3337,8 @@ InstallXLogFileSegment(XLogSegNo
*segno, char *tmppath, if ((*segno) >= max_segno) { /* Failed to find a free slot within specified range */ - LWLockRelease(ControlFileLock); + if (XLogCtl) + LWLockRelease(ControlFileLock); return false; } (*segno)++; @@ -3348,12 +3352,14 @@ InstallXLogFileSegment(XLogSegNo *segno, char *tmppath, */ if (durable_rename_excl(tmppath, path, LOG) != 0) { - LWLockRelease(ControlFileLock); + if (XLogCtl) + LWLockRelease(ControlFileLock); /* durable_rename_excl already emitted log message */ return false; } - LWLockRelease(ControlFileLock); + if (XLogCtl) + LWLockRelease(ControlFileLock); return true; } From 9b2b57472148db461634177bd71b6eac122317cb Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Mon, 29 Aug 2022 16:44:51 +0300 Subject: [PATCH 06/56] Remove Dockerfile, it's now in the neon repo --- Dockerfile | 73 ------------------------------------------------------ 1 file changed, 73 deletions(-) delete mode 100644 Dockerfile diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index 11681c9cb16..00000000000 --- a/Dockerfile +++ /dev/null @@ -1,73 +0,0 @@ -# Allow specifiyng the different compute-tools tag, so we were able to always use -# the locally built image. -ARG COMPUTE_TOOLS_TAG=latest - -# -# Image with pre-built tools -# -FROM neondatabase/compute-tools:$COMPUTE_TOOLS_TAG AS compute-deps -# Only to get ready compute_ctl binary as deppendency - -# -# Image with Postgres build deps -# -FROM debian:buster-slim AS build-deps - -RUN apt-get update && apt-get -yq install automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \ - libcurl4-openssl-dev libossp-uuid-dev - -# -# Image with built Postgres -# -FROM build-deps AS pg-build - -# Add user postgres -RUN adduser postgres -RUN mkdir /pg && chown postgres:postgres /pg - -# Copy source files -COPY . 
/pg/ - -# Build and install Postgres locally -RUN mkdir /pg/compute_build && cd /pg/compute_build && \ - ../configure CFLAGS='-O2 -g3' --prefix=$(pwd)/postgres_bin --enable-debug --with-uuid=ossp && \ - # Install main binaries and contribs - make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \ - make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \ - make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/neon install && \ - # Install headers - make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install - -USER postgres -WORKDIR /pg - -# -# Final compute node image to be exported -# -FROM debian:buster-slim - -# libreadline-dev is required to run psql -RUN apt-get update && apt-get -yq install libreadline-dev libossp-uuid-dev - -# Add user postgres -RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ - echo "postgres:test_console_pass" | chpasswd && \ - mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \ - chown -R postgres:postgres /var/db/postgres && \ - chmod 0750 /var/db/postgres/compute - -# Copy ready Postgres binaries -COPY --from=pg-build /pg/compute_build/postgres_bin /usr/local - -# Copy binaries from compute-tools -COPY --from=compute-deps /usr/local/bin/compute_ctl /usr/local/bin/compute_ctl - -# XXX: temporary symlink for compatibility with old control-plane -RUN ln -s /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl - -# Add postgres shared objects to the search path -RUN echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig - -USER postgres - -ENTRYPOINT ["/usr/local/bin/compute_ctl"] From 261fc36fd01ea285bb3bbc2482a78c5cd8b9888a Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 2 Sep 2022 22:14:35 +0300 Subject: [PATCH 07/56] Move backpressure throttling implementation to neon extension (#203) * Move backpressure throttling implementation to neon extension and function for monitoring throttling time * Update src/include/miscadmin.h 
Co-authored-by: Heikki Linnakangas Co-authored-by: Heikki Linnakangas --- src/backend/tcop/postgres.c | 45 ++++++++----------------------------- src/include/miscadmin.h | 4 ++++ 2 files changed, 13 insertions(+), 36 deletions(-) diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index dff17874c2c..65bb69ca3e5 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -180,6 +180,8 @@ static ProcSignalReason RecoveryConflictReason; static MemoryContext row_description_context = NULL; static StringInfoData row_description_buf; +process_interrupts_callback_t ProcessInterruptsCallback; + /* ---------------------------------------------------------------- * decls for routines only used in this file * ---------------------------------------------------------------- @@ -3152,14 +3154,15 @@ RecoveryConflictInterrupt(ProcSignalReason reason) * return; another interrupt could have arrived. But we promise that * any pre-existing one will have been serviced.) */ -static void -ProcessInterrupts_pg(void) +void +ProcessInterrupts(void) { /* OK to accept any interrupts now? 
*/ if (InterruptHoldoffCount != 0 || CritSectionCount != 0) return; InterruptPending = false; + Retry: if (ProcDiePending) { ProcDiePending = false; @@ -3404,42 +3407,12 @@ ProcessInterrupts_pg(void) if (LogMemoryContextPending) ProcessLogMemoryContextInterrupt(); -} - -void -ProcessInterrupts(void) -{ - uint64 lag; - - if (InterruptHoldoffCount != 0 || CritSectionCount != 0) - return; - // Don't throttle read only transactions and wal sender - if (am_walsender || !TransactionIdIsValid(GetCurrentTransactionIdIfAny())) + /* Call registered callback if any */ + if (ProcessInterruptsCallback) { - ProcessInterrupts_pg(); - return; - } - - #define BACK_PRESSURE_DELAY 10000L // 0.01 sec - while(true) - { - ProcessInterrupts_pg(); - - if (delay_backend_us != NULL) - { - // Suspend writers until replicas catch up - lag = delay_backend_us(); - if (lag <= 0) - break; - - set_ps_display("backpressure throttling"); - - elog(DEBUG2, "backpressure throttling: lag %lu", lag); - pg_usleep(BACK_PRESSURE_DELAY); - } - else - break; + if (ProcessInterruptsCallback()) + goto Retry; } } diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index cf3698fb141..3a33e404e4a 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -107,6 +107,10 @@ extern PGDLLIMPORT volatile uint32 CritSectionCount; /* in tcop/postgres.c */ extern void ProcessInterrupts(void); +/* Callback called by ProcessInterrupts in the loop while it is returning true. 
*/ +typedef bool (*process_interrupts_callback_t)(void); +extern process_interrupts_callback_t ProcessInterruptsCallback; + /* Test whether an interrupt is pending */ #ifndef WIN32 #define INTERRUPTS_PENDING_CONDITION() \ From f5cb05beb3d387d85f429368b0b760883cceeea8 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sun, 4 Sep 2022 21:37:51 +0300 Subject: [PATCH 08/56] Merge last written cache lsn with new main branch (#201) --- src/backend/access/gin/gininsert.c | 3 +- src/backend/access/gist/gistbuild.c | 10 +- src/backend/access/spgist/spginsert.c | 4 +- src/backend/access/transam/xlog.c | 200 ++++++++++++++++++++--- src/backend/commands/dbcommands.c | 29 +++- src/backend/storage/lmgr/lwlocknames.txt | 1 + src/backend/utils/misc/guc.c | 10 ++ src/include/access/xlog.h | 15 +- 8 files changed, 244 insertions(+), 28 deletions(-) diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index 37804a7852d..f2e6147e307 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -421,8 +421,9 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) log_newpage_range(index, MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index), true); + SetLastWrittenLSNForBlockRange(XactLastRecEnd, index->rd_smgr->smgr_rnode.node, MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index)); + SetLastWrittenLSNForRelation(XactLastRecEnd, index->rd_smgr->smgr_rnode.node, MAIN_FORKNUM); } - SetLastWrittenPageLSN(XactLastRecEnd); smgr_end_unlogged_build(index->rd_smgr); diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c index 6afc711f8fe..aaa7ab8acc0 100644 --- a/src/backend/access/gist/gistbuild.c +++ b/src/backend/access/gist/gistbuild.c @@ -342,9 +342,11 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) log_newpage_range(index, MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index), true); + SetLastWrittenLSNForBlockRange(XactLastRecEnd, + index->rd_smgr->smgr_rnode.node, + 
MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index)); + SetLastWrittenLSNForRelation(XactLastRecEnd, index->rd_smgr->smgr_rnode.node, MAIN_FORKNUM); } - SetLastWrittenPageLSN(XactLastRecEnd); - smgr_end_unlogged_build(index->rd_smgr); } @@ -475,7 +477,9 @@ gist_indexsortbuild(GISTBuildState *state) lsn = log_newpage(&state->indexrel->rd_node, MAIN_FORKNUM, GIST_ROOT_BLKNO, levelstate->pages[0], true); - SetLastWrittenPageLSN(lsn); + SetLastWrittenLSNForBlock(lsn, state->indexrel->rd_smgr->smgr_rnode.node, + MAIN_FORKNUM, GIST_ROOT_BLKNO); + SetLastWrittenLSNForRelation(lsn, state->indexrel->rd_smgr->smgr_rnode.node, MAIN_FORKNUM); } pfree(levelstate->pages[0]); diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c index 542eb5c4d0b..8262a2e4e67 100644 --- a/src/backend/access/spgist/spginsert.c +++ b/src/backend/access/spgist/spginsert.c @@ -143,8 +143,10 @@ spgbuild(Relation heap, Relation index, IndexInfo *indexInfo) log_newpage_range(index, MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index), true); + SetLastWrittenLSNForBlockRange(XactLastRecEnd, index->rd_smgr->smgr_rnode.node, + MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index)); + SetLastWrittenLSNForRelation(XactLastRecEnd, index->rd_smgr->smgr_rnode.node, MAIN_FORKNUM); } - SetLastWrittenPageLSN(XactLastRecEnd); smgr_end_unlogged_build(index->rd_smgr); diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 8167f92ad07..07466bc19d7 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -85,6 +85,7 @@ #include "replication/walreceiver.h" #include "replication/walsender.h" #include "storage/bufmgr.h" +#include "storage/buf_internals.h" #include "storage/fd.h" #include "storage/ipc.h" #include "storage/large_object.h" @@ -137,6 +138,7 @@ int max_slot_wal_keep_size_mb = -1; int wal_decode_buffer_size = 512 * 1024; bool track_wal_io_timing = false; uint64 predefined_sysidentifier; +int lastWrittenLsnCacheSize; 
#ifdef WAL_DEBUG bool XLOG_DEBUG = false; @@ -199,6 +201,25 @@ const struct config_enum_entry archive_mode_options[] = { {NULL, 0, false} }; +typedef struct LastWrittenLsnCacheEntry +{ + BufferTag key; + XLogRecPtr lsn; + /* double linked list for LRU replacement algorithm */ + dlist_node lru_node; +} LastWrittenLsnCacheEntry; + + +/* + * Cache of last written LSN for each relation chunk (hash bucket). + * Also to provide request LSN for smgrnblocks, smgrexists there is pseudokey=InvalidBlockId which stores LSN of last + * relation metadata update. + * Size of the cache is limited by GUC variable lastWrittenLsnCacheSize ("lsn_cache_size"), + * pages are replaced using LRU algorithm, based on L2-list. + * Access to this cache is protected by 'LastWrittenLsnLock'. + */ +static HTAB *lastWrittenLsnCache; + /* * Statistics for current checkpoint are collected in this global struct. * Because only the checkpointer or a stand-alone backend can perform @@ -552,7 +573,17 @@ typedef struct XLogCtlData * XLOG_FPW_CHANGE record that instructs full_page_writes is disabled. */ XLogRecPtr lastFpwDisableRecPtr; - XLogRecPtr lastWrittenPageLSN; + + /* + * Maximal last written LSN for pages not present in lastWrittenLsnCache + */ + XLogRecPtr maxLastWrittenLsn; + + /* + * Double linked list to implement LRU replacement policy for last written LSN cache. + * Access to this list as well as to last written LSN cache is protected by 'LastWrittenLsnLock'. + */ + dlist_head lastWrittenLsnLRU; /* neon: copy of startup's RedoStartLSN for walproposer's use */ XLogRecPtr RedoStartLSN; @@ -575,6 +606,8 @@ static WALInsertLockPadded *WALInsertLocks = NULL; */ static ControlFileData *ControlFile = NULL; +#define LAST_WRITTEN_LSN_CACHE_BUCKET 1024 /* blocks = 8Mb */ + /* * Calculate the amount of space left on the page after 'endptr'. Beware * multiple evaluation! 
@@ -4355,11 +4388,8 @@ LocalProcessControlFile(bool reset) ReadControlFile(); } -/* - * Initialization of shared memory for XLOG - */ -Size -XLOGShmemSize(void) +static Size +XLOGCtlShmemSize(void) { Size size; @@ -4408,6 +4438,16 @@ XLOGShmemSize(void) return size; } +/* + * Initialization of shared memory for XLOG + */ +Size +XLOGShmemSize(void) +{ + return XLOGCtlShmemSize() + + hash_estimate_size(lastWrittenLsnCacheSize, sizeof(LastWrittenLsnCacheEntry)); +} + void XLOGShmemInit(void) { @@ -4437,6 +4477,15 @@ XLOGShmemInit(void) XLogCtl = (XLogCtlData *) ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog); + { + static HASHCTL info; + info.keysize = sizeof(BufferTag); + info.entrysize = sizeof(LastWrittenLsnCacheEntry); + lastWrittenLsnCache = ShmemInitHash("last_written_lsn_cache", + lastWrittenLsnCacheSize, lastWrittenLsnCacheSize, + &info, + HASH_ELEM | HASH_BLOBS); + } localControlFile = ControlFile; ControlFile = (ControlFileData *) ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile); @@ -5623,7 +5672,8 @@ StartupXLOG(void) XLogCtl->LogwrtRqst.Write = EndOfLog; XLogCtl->LogwrtRqst.Flush = EndOfLog; - XLogCtl->lastWrittenPageLSN = EndOfLog; + XLogCtl->maxLastWrittenLsn = EndOfLog; + dlist_init(&XLogCtl->lastWrittenLsnLRU); /* * Preallocate additional log files, if wanted. */ @@ -6051,29 +6101,141 @@ GetInsertRecPtr(void) } /* - * GetLastWrittenPageLSN -- Returns maximal LSN of written page + * GetLastWrittenLSN -- Returns maximal LSN of written page. + * It returns an upper bound for the last written LSN of a given page, + * either from a cached last written LSN or a global maximum last written LSN. + * If rnode is InvalidOid then we calculate maximum among all cached LSN and maxLastWrittenLsn. + * If cache is large enough, iterating through all hash items may be rather expensive. + * But GetLastWrittenLSN(InvalidOid) is used only by zenith_dbsize which is not performance critical.
*/ XLogRecPtr -GetLastWrittenPageLSN(void) +GetLastWrittenLSN(RelFileNode rnode, ForkNumber forknum, BlockNumber blkno) { XLogRecPtr lsn; - SpinLockAcquire(&XLogCtl->info_lck); - lsn = XLogCtl->lastWrittenPageLSN; - SpinLockRelease(&XLogCtl->info_lck); + LastWrittenLsnCacheEntry* entry; + + LWLockAcquire(LastWrittenLsnLock, LW_SHARED); + + /* Maximal last written LSN among all non-cached pages */ + lsn = XLogCtl->maxLastWrittenLsn; + + if (rnode.relNode != InvalidOid) + { + BufferTag key; + key.rnode = rnode; + key.forkNum = forknum; + key.blockNum = blkno / LAST_WRITTEN_LSN_CACHE_BUCKET; + entry = hash_search(lastWrittenLsnCache, &key, HASH_FIND, NULL); + if (entry != NULL) + lsn = entry->lsn; + } + else + { + HASH_SEQ_STATUS seq; + /* Find maximum of all cached LSNs */ + hash_seq_init(&seq, lastWrittenLsnCache); + while ((entry = (LastWrittenLsnCacheEntry *) hash_seq_search(&seq)) != NULL) + { + if (entry->lsn > lsn) + lsn = entry->lsn; + } + } + LWLockRelease(LastWrittenLsnLock); return lsn; } /* - * SetLastWrittenPageLSN -- Set maximal LSN of written page + * SetLastWrittenLSNForBlockRange -- Set maximal LSN of written page range. + * We maintain cache of last written LSNs with limited size and LRU replacement + * policy. To reduce cache size we store max LSN not for each page, but for + * bucket (1024 blocks). This cache allows to use old LSN when + * requesting pages of unchanged or appended relations. + * + * rnode.relNode can be InvalidOid, in this case maxLastWrittenLsn is updated. + * SetLastWrittenLsn with dummy rnode is used by createdb and dbase_redo functions. 
*/ void -SetLastWrittenPageLSN(XLogRecPtr lsn) +SetLastWrittenLSNForBlockRange(XLogRecPtr lsn, RelFileNode rnode, ForkNumber forknum, BlockNumber from, BlockNumber till) { - SpinLockAcquire(&XLogCtl->info_lck); - if (lsn > XLogCtl->lastWrittenPageLSN) - XLogCtl->lastWrittenPageLSN = lsn; - SpinLockRelease(&XLogCtl->info_lck); + if (lsn == InvalidXLogRecPtr) + return; + + LWLockAcquire(LastWrittenLsnLock, LW_EXCLUSIVE); + if (rnode.relNode == InvalidOid) + { + if (lsn > XLogCtl->maxLastWrittenLsn) + XLogCtl->maxLastWrittenLsn = lsn; + } + else + { + LastWrittenLsnCacheEntry* entry; + BufferTag key; + bool found; + BlockNumber bucket; + + key.rnode = rnode; + key.forkNum = forknum; + for (bucket = from / LAST_WRITTEN_LSN_CACHE_BUCKET; + bucket <= till / LAST_WRITTEN_LSN_CACHE_BUCKET; + bucket++) + { + key.blockNum = bucket; + entry = hash_search(lastWrittenLsnCache, &key, HASH_ENTER, &found); + if (found) + { + if (lsn > entry->lsn) + entry->lsn = lsn; + /* Unlink from LRU list */ + dlist_delete(&entry->lru_node); + } + else + { + entry->lsn = lsn; + if (hash_get_num_entries(lastWrittenLsnCache) > lastWrittenLsnCacheSize) + { + /* Replace least recently used entry */ + LastWrittenLsnCacheEntry* victim = dlist_container(LastWrittenLsnCacheEntry, lru_node, dlist_pop_head_node(&XLogCtl->lastWrittenLsnLRU)); + /* Adjust max LSN for not cached relations/chunks if needed */ + if (victim->lsn > XLogCtl->maxLastWrittenLsn) + XLogCtl->maxLastWrittenLsn = victim->lsn; + + hash_search(lastWrittenLsnCache, victim, HASH_REMOVE, NULL); + } + } + /* Link to the end of LRU list */ + dlist_push_tail(&XLogCtl->lastWrittenLsnLRU, &entry->lru_node); + } + } + LWLockRelease(LastWrittenLsnLock); +} + +/* + * SetLastWrittenLSNForBlock -- Set maximal LSN for block + */ +void +SetLastWrittenLSNForBlock(XLogRecPtr lsn, RelFileNode rnode, ForkNumber forknum, BlockNumber blkno) +{ + SetLastWrittenLSNForBlockRange(lsn, rnode, forknum, blkno, blkno); +} + +/* + * SetLastWrittenLSNForRelation -- 
Set maximal LSN for relation metadata + */ +void +SetLastWrittenLSNForRelation(XLogRecPtr lsn, RelFileNode rnode, ForkNumber forknum) +{ + SetLastWrittenLSNForBlock(lsn, rnode, forknum, REL_METADATA_PSEUDO_BLOCKNO); +} + +/* + * SetLastWrittenLSNForDatabase -- Set maximal LSN for the whole database + */ +void +SetLastWrittenLSNForDatabase(XLogRecPtr lsn) +{ + RelFileNode dummyNode = {InvalidOid, InvalidOid, InvalidOid}; + SetLastWrittenLSNForBlock(lsn, dummyNode, MAIN_FORKNUM, 0); } void @@ -6324,7 +6486,7 @@ LogCheckpointEnd(bool restartpoint) average_sync_time = 0; if (CheckpointStats.ckpt_sync_rels > 0) average_sync_time = CheckpointStats.ckpt_agg_sync_time / - CheckpointStats.ckpt_sync_rels; + CheckpointStats.ckpt_sync_rels; average_msecs = (long) ((average_sync_time + 999) / 1000); if (restartpoint) diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 71510007476..061be47f3a5 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -486,6 +486,8 @@ CreateDirAndVersionFile(char *dbpath, Oid dbid, Oid tsid, bool isRedo) lsn = XLogInsert(RM_DBASE_ID, XLOG_DBASE_CREATE_WAL_LOG); + SetLastWrittenLSNForDatabase(lsn); + /* As always, WAL must hit the disk before the data update does. 
*/ XLogFlush(lsn); } @@ -613,6 +615,7 @@ CreateDatabaseUsingFileCopy(Oid src_dboid, Oid dst_dboid, Oid src_tsid, /* Record the filesystem change in XLOG */ { xl_dbase_create_file_copy_rec xlrec; + XLogRecPtr lsn; xlrec.db_id = dst_dboid; xlrec.tablespace_id = dsttablespace; @@ -623,8 +626,10 @@ CreateDatabaseUsingFileCopy(Oid src_dboid, Oid dst_dboid, Oid src_tsid, XLogRegisterData((char *) &xlrec, sizeof(xl_dbase_create_file_copy_rec)); - (void) XLogInsert(RM_DBASE_ID, + lsn = XLogInsert(RM_DBASE_ID, XLOG_DBASE_CREATE_FILE_COPY | XLR_SPECIAL_REL_UPDATE); + + SetLastWrittenLSNForDatabase(lsn); } pfree(srcpath); pfree(dstpath); @@ -2038,6 +2043,7 @@ movedb(const char *dbname, const char *tblspcname) */ { xl_dbase_create_file_copy_rec xlrec; + XLogRecPtr lsn; xlrec.db_id = db_id; xlrec.tablespace_id = dst_tblspcoid; @@ -2048,8 +2054,10 @@ movedb(const char *dbname, const char *tblspcname) XLogRegisterData((char *) &xlrec, sizeof(xl_dbase_create_file_copy_rec)); - (void) XLogInsert(RM_DBASE_ID, + lsn = XLogInsert(RM_DBASE_ID, XLOG_DBASE_CREATE_FILE_COPY | XLR_SPECIAL_REL_UPDATE); + // TODO: Do we really need to set the LSN here? + SetLastWrittenLSNForDatabase(lsn); } /* @@ -3197,6 +3205,15 @@ dbase_redo(XLogReaderState *record) */ copydir(src_path, dst_path, false); + /* + * Make sure any future requests to the page server see the new + * database. + */ + { + XLogRecPtr lsn = record->EndRecPtr; + SetLastWrittenLSNForDatabase(lsn); + } + pfree(src_path); pfree(dst_path); } @@ -3218,6 +3235,14 @@ dbase_redo(XLogReaderState *record) CreateDirAndVersionFile(dbpath, xlrec->db_id, xlrec->tablespace_id, true); pfree(dbpath); + /* + * Make sure any future requests to the page server see the new + * database. 
+ */ + { + XLogRecPtr lsn = record->EndRecPtr; + SetLastWrittenLSNForDatabase(lsn); + } } else if (info == XLOG_DBASE_DROP) { diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt index 6c7cf6c2956..b4652c33ff6 100644 --- a/src/backend/storage/lmgr/lwlocknames.txt +++ b/src/backend/storage/lmgr/lwlocknames.txt @@ -53,3 +53,4 @@ XactTruncationLock 44 # 45 was XactTruncationLock until removal of BackendRandomLock WrapLimitsVacuumLock 46 NotifyQueueTailLock 47 +LastWrittenLsnLock 48 diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index c06b957a785..68cf5da3c4e 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2430,6 +2430,16 @@ static struct config_int ConfigureNamesInt[] = NULL, NULL, NULL }, + { + {"lsn_cache_size", PGC_POSTMASTER, UNGROUPED, + gettext_noop("Size of last written LSN cache used by Neon."), + NULL + }, + &lastWrittenLsnCacheSize, + 1024, 10, 1000000, /* 1024 is enough to hold 10GB database with 8Mb bucket */ + NULL, NULL, NULL + }, + { {"temp_buffers", PGC_USERSET, RESOURCES_MEM, gettext_noop("Sets the maximum number of temporary buffers used by each session."), diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 97c7d560db1..dc3325f9050 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -26,11 +26,17 @@ #define SYNC_METHOD_OPEN_DSYNC 4 /* for O_DSYNC */ extern PGDLLIMPORT int sync_method; + extern PGDLLIMPORT XLogRecPtr ProcLastRecPtr; extern PGDLLIMPORT XLogRecPtr XactLastRecEnd; extern PGDLLIMPORT XLogRecPtr XactLastCommitEnd; +/* + * Pseudo block number used to associate LSN with relation metadata (relation size) + */ +#define REL_METADATA_PSEUDO_BLOCKNO InvalidBlockNumber + extern bool ZenithRecoveryRequested; extern XLogRecPtr zenithLastRec; extern bool zenithWriteOk; @@ -58,6 +64,8 @@ extern PGDLLIMPORT bool track_wal_io_timing; extern PGDLLIMPORT int
CheckPointSegments; +extern int lastWrittenLsnCacheSize; + /* Archive modes */ typedef enum ArchiveMode @@ -250,8 +258,11 @@ extern XLogRecPtr GetLastImportantRecPtr(void); /* neon specifics */ -extern void SetLastWrittenPageLSN(XLogRecPtr lsn); -extern XLogRecPtr GetLastWrittenPageLSN(void); +extern void SetLastWrittenLSNForBlock(XLogRecPtr lsn, RelFileNode relfilenode, ForkNumber forknum, BlockNumber blkno); +extern void SetLastWrittenLSNForBlockRange(XLogRecPtr lsn, RelFileNode relfilenode, ForkNumber forknum, BlockNumber from, BlockNumber till); +extern void SetLastWrittenLSNForDatabase(XLogRecPtr lsn); +extern void SetLastWrittenLSNForRelation(XLogRecPtr lsn, RelFileNode relfilenode, ForkNumber forknum); +extern XLogRecPtr GetLastWrittenLSN(RelFileNode relfilenode, ForkNumber forknum, BlockNumber blkno); extern void SetRedoStartLsn(XLogRecPtr RedoStartLSN); extern XLogRecPtr GetRedoStartLsn(void); From 80512f1be04bb45ba783f7736a1bb417ec5a2710 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 13 Sep 2022 22:48:39 +0300 Subject: [PATCH 09/56] Local prefetch implementation for Postgres 15 Disabled by default. The plan is to merge this now, so that we can do performance testing quickly, and if it helps, rewrite and review it properly. 
Author: Konstantin Knizhnik --- src/backend/access/heap/heapam.c | 16 ++++++++++ src/backend/access/heap/vacuumlazy.c | 16 ++++++++++ src/backend/optimizer/path/costsize.c | 2 ++ src/backend/storage/smgr/md.c | 8 +++++ src/backend/storage/smgr/smgr.c | 10 +++++++ src/backend/utils/misc/guc.c | 22 ++++++++++++++ src/include/optimizer/cost.h | 2 ++ src/include/storage/md.h | 1 + src/include/storage/smgr.h | 2 ++ src/test/regress/expected/sysviews.out | 41 +++++++++++++------------- 10 files changed, 100 insertions(+), 20 deletions(-) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index c1ebffced89..3aeab9dd335 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -53,6 +53,7 @@ #include "access/xlogutils.h" #include "catalog/catalog.h" #include "miscadmin.h" +#include "optimizer/cost.h" #include "pgstat.h" #include "port/atomics.h" #include "port/pg_bitutils.h" @@ -398,6 +399,21 @@ heapgetpage(TableScanDesc sscan, BlockNumber page) */ CHECK_FOR_INTERRUPTS(); + /* Prefetch next block */ + if (enable_seqscan_prefetch) + { + int prefetch_limit = seqscan_prefetch_buffers; + ParallelBlockTableScanWorker pbscanwork = scan->rs_parallelworkerdata; + if (pbscanwork != NULL && pbscanwork->phsw_chunk_remaining < prefetch_limit) + prefetch_limit = pbscanwork->phsw_chunk_remaining; + if (page + prefetch_limit >= scan->rs_nblocks) + prefetch_limit = scan->rs_nblocks - page - 1; + + smgr_reset_prefetch(RelationGetSmgr(scan->rs_base.rs_rd)); + for (int i = 1; i <= prefetch_limit; i++) + PrefetchBuffer(scan->rs_base.rs_rd, MAIN_FORKNUM, page+i); + } + /* read page using selected strategy */ scan->rs_cbuf = ReadBufferExtended(scan->rs_base.rs_rd, MAIN_FORKNUM, page, RBM_NORMAL, scan->rs_strategy); diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index 033f2baab3f..1af07d135d7 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -52,6 
+52,7 @@ #include "commands/vacuum.h" #include "executor/instrument.h" #include "miscadmin.h" +#include "optimizer/cost.h" #include "optimizer/paths.h" #include "pgstat.h" #include "portability/instr_time.h" @@ -972,6 +973,14 @@ lazy_scan_heap(LVRelState *vacrel) */ visibilitymap_pin(vacrel->rel, blkno, &vmbuffer); + if (enable_seqscan_prefetch) + { + int prefetch_limit = Min(rel_pages - blkno - 1, seqscan_prefetch_buffers); + smgr_reset_prefetch(RelationGetSmgr(vacrel->rel)); + for (int i = 1; i <= prefetch_limit; i++) + PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, blkno+i); + } + /* Finished preparatory checks. Actually scan the page. */ buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, blkno, RBM_NORMAL, vacrel->bstrategy); @@ -2429,6 +2438,13 @@ lazy_vacuum_heap_rel(LVRelState *vacrel) vacuum_delay_point(); tblk = ItemPointerGetBlockNumber(&vacrel->dead_items->items[index]); + if (enable_seqscan_prefetch) + { + int prefetch_limit = Min(vacrel->dead_items->num_items - index - 1, seqscan_prefetch_buffers); + smgr_reset_prefetch(RelationGetSmgr(vacrel->rel)); + for (int i = 1; i <= prefetch_limit; i++) + PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, ItemPointerGetBlockNumber(&vacrel->dead_items->items[index + i])); + } vacrel->blkno = tblk; buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, tblk, RBM_NORMAL, vacrel->bstrategy); diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 0ba26b207b0..ae37ac0644b 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -152,6 +152,8 @@ bool enable_parallel_append = true; bool enable_parallel_hash = true; bool enable_partition_pruning = true; bool enable_async_append = true; +bool enable_seqscan_prefetch = true; +int seqscan_prefetch_buffers = 0; typedef struct { diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 002eb6fbcd8..34dfe7ff97e 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c 
@@ -612,6 +612,14 @@ mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) return true; } +/* + * md_reset_prefetch() -- Cancel all previous prefetch requests + */ +void +md_reset_prefetch(SMgrRelation reln) +{ +} + /* * mdwriteback() -- Tell the kernel to write pages back to storage. * diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index 87260673bc8..d8a9ffae34b 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -38,6 +38,7 @@ static const f_smgr smgr_md = { .smgr_unlink = mdunlink, .smgr_extend = mdextend, .smgr_prefetch = mdprefetch, + .smgr_reset_prefetch = md_reset_prefetch, .smgr_read = mdread, .smgr_write = mdwrite, .smgr_writeback = mdwriteback, @@ -531,6 +532,15 @@ smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) return (*reln->smgr).smgr_prefetch(reln, forknum, blocknum); } +/* + * smgr_reset_prefetch() -- Cancel all previous prefetch requests + */ +void +smgr_reset_prefetch(SMgrRelation reln) +{ + (*reln->smgr).smgr_reset_prefetch(reln); +} + /* * smgrread() -- read a particular block from a relation into the supplied * buffer.
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 68cf5da3c4e..d55b5dbff04 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -1016,6 +1016,17 @@ static const unit_conversion time_unit_conversion_table[] = static struct config_bool ConfigureNamesBool[] = { + { + {"enable_seqscan_prefetch", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Enables the sequence scan next page prefetching."), + NULL, + GUC_EXPLAIN + }, + &enable_seqscan_prefetch, + false, /* temporary disable to be able to merge in main */ + /* true, */ + NULL, NULL, NULL + }, { {"enable_seqscan", PGC_USERSET, QUERY_TUNING_METHOD, gettext_noop("Enables the planner's use of sequential-scan plans."), @@ -2195,6 +2206,17 @@ static struct config_bool ConfigureNamesBool[] = static struct config_int ConfigureNamesInt[] = { + { + {"seqscan_prefetch_buffers", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Number of subsequent buffers to be prefetched during sequential scan."), + NULL, + GUC_EXPLAIN + }, + &seqscan_prefetch_buffers, + /* 8, 0, 1000, */ + 0, 0, 1000, /* temporary disable to be able to merge in main */ + NULL, NULL, NULL + }, { {"archive_timeout", PGC_SIGHUP, WAL_ARCHIVING, gettext_noop("Sets the amount of time to wait before forcing a " diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index bc12071af6e..b002c5ff027 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -69,6 +69,8 @@ extern PGDLLIMPORT bool enable_parallel_append; extern PGDLLIMPORT bool enable_parallel_hash; extern PGDLLIMPORT bool enable_partition_pruning; extern PGDLLIMPORT bool enable_async_append; +extern PGDLLIMPORT bool enable_seqscan_prefetch; +extern PGDLLIMPORT int seqscan_prefetch_buffers; extern PGDLLIMPORT int constraint_exclusion; extern double index_pages_fetched(double tuples_fetched, BlockNumber pages, diff --git a/src/include/storage/md.h b/src/include/storage/md.h index ffffa40db71..34fc0f2c0ff 100644
--- a/src/include/storage/md.h +++ b/src/include/storage/md.h @@ -30,6 +30,7 @@ extern void mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync); extern bool mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); +extern void md_reset_prefetch(SMgrRelation reln); extern void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer); extern void mdwrite(SMgrRelation reln, ForkNumber forknum, diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index 6e9fdd72367..bd957785f1b 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -119,6 +119,7 @@ typedef struct f_smgr BlockNumber nblocks); void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum); + void (*smgr_reset_prefetch) (SMgrRelation reln); void (*smgr_start_unlogged_build) (SMgrRelation reln); void (*smgr_finish_unlogged_build_phase_1) (SMgrRelation reln); void (*smgr_end_unlogged_build) (SMgrRelation reln); @@ -158,6 +159,7 @@ extern void smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync); extern bool smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); +extern void smgr_reset_prefetch(SMgrRelation reln); extern void smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer); extern void smgrwrite(SMgrRelation reln, ForkNumber forknum, diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index 579b861d84f..a18be0e689f 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -8,13 +8,13 @@ -- but even a trivial check of count(*) will exercise the normal code path -- through the SRF. 
select count(*) >= 0 as ok from pg_available_extension_versions; - ok + ok ---- t (1 row) select count(*) >= 0 as ok from pg_available_extensions; - ok + ok ---- t (1 row) @@ -23,27 +23,27 @@ select count(*) >= 0 as ok from pg_available_extensions; -- we test only the existence and basic condition of TopMemoryContext. select name, ident, parent, level, total_bytes >= free_bytes from pg_backend_memory_contexts where level = 0; - name | ident | parent | level | ?column? + name | ident | parent | level | ?column? ------------------+-------+--------+-------+---------- TopMemoryContext | | | 0 | t (1 row) -- At introduction, pg_config had 23 entries; it may grow select count(*) > 20 as ok from pg_config; - ok + ok ---- t (1 row) -- We expect no cursors in this test; see also portals.sql select count(*) = 0 as ok from pg_cursors; - ok + ok ---- t (1 row) select count(*) >= 0 as ok from pg_file_settings; - ok + ok ---- t (1 row) @@ -51,7 +51,7 @@ select count(*) >= 0 as ok from pg_file_settings; -- There will surely be at least one rule, with no errors. select count(*) > 0 as ok, count(*) FILTER (WHERE error IS NOT NULL) = 0 AS no_err from pg_hba_file_rules; - ok | no_err + ok | no_err ----+-------- t | t (1 row) @@ -59,49 +59,49 @@ select count(*) > 0 as ok, count(*) FILTER (WHERE error IS NOT NULL) = 0 AS no_e -- There may be no rules, and there should be no errors. 
select count(*) >= 0 as ok, count(*) FILTER (WHERE error IS NOT NULL) = 0 AS no_err from pg_ident_file_mappings; - ok | no_err + ok | no_err ----+-------- t | t (1 row) -- There will surely be at least one active lock select count(*) > 0 as ok from pg_locks; - ok + ok ---- t (1 row) -- We expect no prepared statements in this test; see also prepare.sql select count(*) = 0 as ok from pg_prepared_statements; - ok + ok ---- t (1 row) -- See also prepared_xacts.sql select count(*) >= 0 as ok from pg_prepared_xacts; - ok + ok ---- t (1 row) -- There will surely be at least one SLRU cache select count(*) > 0 as ok from pg_stat_slru; - ok + ok ---- t (1 row) -- There must be only one record select count(*) = 1 as ok from pg_stat_wal; - ok + ok ---- t (1 row) -- We expect no walreceiver running in this test select count(*) = 0 as ok from pg_stat_wal_receiver; - ok + ok ---- t (1 row) @@ -109,7 +109,7 @@ select count(*) = 0 as ok from pg_stat_wal_receiver; -- This is to record the prevailing planner enable_foo settings during -- a regression test run. select name, setting from pg_settings where name like 'enable%'; - name | setting + name | setting --------------------------------+--------- enable_async_append | on enable_bitmapscan | on @@ -129,9 +129,10 @@ select name, setting from pg_settings where name like 'enable%'; enable_partitionwise_aggregate | off enable_partitionwise_join | off enable_seqscan | on + enable_seqscan_prefetch | on enable_sort | on enable_tidscan | on -(20 rows) +(21 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. We can't test their contents in any great detail @@ -140,13 +141,13 @@ select name, setting from pg_settings where name like 'enable%'; -- (At the time of writing, the actual counts are around 38 because of -- zones using fractional GMT offsets, so this is a pretty loose test.) 
select count(distinct utc_offset) >= 24 as ok from pg_timezone_names; - ok + ok ---- t (1 row) select count(distinct utc_offset) >= 24 as ok from pg_timezone_abbrevs; - ok + ok ---- t (1 row) @@ -154,14 +155,14 @@ select count(distinct utc_offset) >= 24 as ok from pg_timezone_abbrevs; -- Let's check the non-default timezone abbreviation sets, too set timezone_abbreviations = 'Australia'; select count(distinct utc_offset) >= 24 as ok from pg_timezone_abbrevs; - ok + ok ---- t (1 row) set timezone_abbreviations = 'India'; select count(distinct utc_offset) >= 24 as ok from pg_timezone_abbrevs; - ok + ok ---- t (1 row) From 54f74addfce244ee21f7e2be0c7f835a3112f8b2 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 16 Sep 2022 11:43:33 +0300 Subject: [PATCH 10/56] Set last written LSN for the created relation (#212) Co-authored-by: Konstantin Knizhnik --- src/backend/catalog/storage.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c index d73854cdd44..530bc3b6e1f 100644 --- a/src/backend/catalog/storage.c +++ b/src/backend/catalog/storage.c @@ -185,6 +185,7 @@ void log_smgrcreate(const RelFileNode *rnode, ForkNumber forkNum) { xl_smgr_create xlrec; + XLogRecPtr lsn; /* * Make an XLOG entry reporting the file creation. @@ -194,7 +195,8 @@ log_smgrcreate(const RelFileNode *rnode, ForkNumber forkNum) XLogBeginInsert(); XLogRegisterData((char *) &xlrec, sizeof(xlrec)); - XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE | XLR_SPECIAL_REL_UPDATE); + lsn = XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE | XLR_SPECIAL_REL_UPDATE); + SetLastWrittenLSNForRelation(lsn, *rnode, forkNum); } /* From fb19b2cd396d359aa9da84a1adbd303fcb4417a7 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 12 Sep 2022 22:33:59 +0300 Subject: [PATCH 11/56] Use normal install program to install server headers. Commit a7032690f9 replaced $(INSTALL) with plain "cp" for installing the server header files. 
It sped up "make install" significantly, because the old logic called $(INSTALL) separately for every header file, whereas plain "cp" could copy all the files in one command. However, we have long since made it a requirement that $(INSTALL) can also install multiple files in one command, see commit f1c5247563. Switch back to $(INSTALL). Discussion: https://www.postgresql.org/message-id/200503252305.j2PN52m23610%40candle.pha.pa.us Discussion: https://www.postgresql.org/message-id/2415283.1641852217%40sss.pgh.pa.us --- src/include/Makefile | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/src/include/Makefile b/src/include/Makefile index 5f257a958c8..ac9ccf71d19 100644 --- a/src/include/Makefile +++ b/src/include/Makefile @@ -47,22 +47,15 @@ install: all installdirs $(INSTALL_DATA) utils/errcodes.h '$(DESTDIR)$(includedir_server)/utils' $(INSTALL_DATA) utils/fmgroids.h '$(DESTDIR)$(includedir_server)/utils' $(INSTALL_DATA) utils/fmgrprotos.h '$(DESTDIR)$(includedir_server)/utils' -# We don't use INSTALL_DATA for performance reasons --- there are a lot of files -# (in fact, we have to take some pains to avoid overlength shell commands here) - cp $(srcdir)/*.h '$(DESTDIR)$(includedir_server)'/ + $(INSTALL_DATA) $(srcdir)/*.h '$(DESTDIR)$(includedir_server)' for dir in $(SUBDIRS); do \ - cp $(srcdir)/$$dir/*.h '$(DESTDIR)$(includedir_server)'/$$dir/ || exit; \ + $(INSTALL_DATA) $(srcdir)/$$dir/*.h '$(DESTDIR)$(includedir_server)'/$$dir || exit; \ done ifeq ($(vpath_build),yes) for file in catalog/schemapg.h catalog/system_fk_info.h catalog/pg_*_d.h parser/gram.h storage/lwlocknames.h utils/probes.h; do \ - cp $$file '$(DESTDIR)$(includedir_server)'/$$file || exit; \ + $(INSTALL_DATA) $$file '$(DESTDIR)$(includedir_server)'/$$file || exit; \ done endif - cd '$(DESTDIR)$(includedir_server)' && chmod $(INSTALL_DATA_MODE) *.h - for dir in $(SUBDIRS); do \ - cd '$(DESTDIR)$(includedir_server)'/$$dir || exit; \ - chmod $(INSTALL_DATA_MODE) *.h 
|| exit; \ - done installdirs: $(MKDIR_P) '$(DESTDIR)$(includedir)/libpq' '$(DESTDIR)$(includedir_internal)/libpq' From 507917810e1b5d525caa1dba7f8ae37d4bc2565e Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 12 Sep 2022 21:44:48 +0300 Subject: [PATCH 12/56] Update expected output for sysviews test because of changed default value of enable_seqscan_prefetch --- src/test/regress/expected/sysviews.out | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index a18be0e689f..861d37ed407 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -129,7 +129,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_partitionwise_aggregate | off enable_partitionwise_join | off enable_seqscan | on - enable_seqscan_prefetch | on + enable_seqscan_prefetch | off enable_sort | on enable_tidscan | on (21 rows) From 95f69562f433444584e9019bc0dbbfbfeb1e4259 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 20 Sep 2022 10:14:27 +0300 Subject: [PATCH 13/56] Undo diasming VM check warning in vacuumlazy.c (#214) --- src/backend/access/heap/vacuumlazy.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index 1af07d135d7..2d859777bb5 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -1159,10 +1159,7 @@ lazy_scan_heap(LVRelState *vacrel) else if (all_visible_according_to_vm && !PageIsAllVisible(page) && VM_ALL_VISIBLE(vacrel->rel, blkno, &vmbuffer)) { - /* ZENITH-XXX: all visible hint is not wal-logged - * FIXME: Replay visibilitymap changes in pageserver - */ - elog(DEBUG1, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u", + elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u", 
vacrel->relname, blkno); visibilitymap_clear(vacrel->rel, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS); From 561af182f2bfbbae537ff16a33c7bfcb9655a56d Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 21 Sep 2022 15:35:03 +0300 Subject: [PATCH 14/56] Set Neon-specific FMGR_ABI_EXTRA to support only extensions that were built against Neon PostgreSQL --- src/include/pg_config_manual.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/include/pg_config_manual.h b/src/include/pg_config_manual.h index 8d2e3e3a57d..b0e5b6be896 100644 --- a/src/include/pg_config_manual.h +++ b/src/include/pg_config_manual.h @@ -57,7 +57,7 @@ * version. Example: "ACME Postgres/1.2". Note that the string will appear * in a user-facing error message if an ABI mismatch is detected. */ -#define FMGR_ABI_EXTRA "PostgreSQL" +#define FMGR_ABI_EXTRA "Neon Postgres" /* * Maximum number of columns in an index. There is little point in making From 4ac76c3419cf92c77410d6a7b15c7c6c1edc4ad0 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 21 Sep 2022 15:37:44 +0300 Subject: [PATCH 15/56] Don't use newline in PG_VERSION file. Neon generates PG_VERSION files in one format - just major version number without newline. Be consistent with it --- src/backend/commands/dbcommands.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 061be47f3a5..bd139e65e2a 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -466,8 +466,8 @@ CreateDirAndVersionFile(char *dbpath, Oid dbid, Oid tsid, bool isRedo) * Note that we don't have to copy this from the source database; there's * only one legal value. */ - sprintf(buf, "%s\n", PG_MAJORVERSION); - nbytes = strlen(PG_MAJORVERSION) + 1; + sprintf(buf, "%s", PG_MAJORVERSION); + nbytes = strlen(PG_MAJORVERSION); /* If we are not in WAL replay then write the WAL. 
*/ if (!isRedo) From a77147474ae8860acdd0861f74853395d82e273c Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 21 Sep 2022 16:56:30 +0300 Subject: [PATCH 16/56] Unset ArchiveRecoveryRequested for Neon code path. No need to perform WAL recovery in Neon Co-authored-by: Konstantin Knizhnik --- src/backend/access/transam/xlogrecovery.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index 0708a9e4c68..e0807db4a7a 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -1075,7 +1075,7 @@ readRecoverySignalFile(void) if (standby_signal_file_found) { StandbyModeRequested = true; - ArchiveRecoveryRequested = true; + ArchiveRecoveryRequested = XLogRecPtrIsInvalid(zenithLastRec); /* no need to perform WAL recovery in Neon */ } else if (recovery_signal_file_found) { From faa68d0e4dbc92ef890f9618effcf52ad5cd8145 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 21 Sep 2022 17:56:47 +0300 Subject: [PATCH 17/56] Fix memory leak in ApplyRecord --- src/backend/access/transam/xlogreader.c | 2 +- src/backend/tcop/zenith_wal_redo.c | 6 +++--- src/include/access/xlogreader.h | 2 ++ 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c index 9f95e3c1e1c..f148e21ff5a 100644 --- a/src/backend/access/transam/xlogreader.c +++ b/src/backend/access/transam/xlogreader.c @@ -475,7 +475,7 @@ XLogReadRecord(XLogReaderState *state, char **errormsg) * Return NULL if there is no space in the decode buffer and allow_oversized * is false, or if memory allocation fails for an oversized buffer. 
*/ -static DecodedXLogRecord * +DecodedXLogRecord * XLogReadRecordAlloc(XLogReaderState *state, size_t xl_tot_len, bool allow_oversized) { size_t required_space = DecodeXLogRecordRequiredSpace(xl_tot_len); diff --git a/src/backend/tcop/zenith_wal_redo.c b/src/backend/tcop/zenith_wal_redo.c index 190eeb71fb9..5578e506077 100644 --- a/src/backend/tcop/zenith_wal_redo.c +++ b/src/backend/tcop/zenith_wal_redo.c @@ -589,9 +589,7 @@ ApplyRecord(StringInfo input_message) XLogBeginRead(reader_state, lsn); reader_state->ReadRecPtr = lsn; - //FIXME Should we use XLogReadRecordAlloc instead? - decoded = (DecodedXLogRecord *) - palloc(DecodeXLogRecordRequiredSpace(record->xl_tot_len)); + decoded = (DecodedXLogRecord *)XLogReadRecordAlloc(reader_state, record->xl_tot_len, true); if (!DecodeXLogRecord(reader_state, decoded, record, lsn, &errormsg)) elog(ERROR, "failed to decode WAL record: %s", errormsg); @@ -648,6 +646,8 @@ ApplyRecord(StringInfo input_message) elog(TRACE, "applied WAL record with LSN %X/%X", (uint32) (lsn >> 32), (uint32) lsn); + if (decoded && decoded->oversized) + pfree(decoded); } /* diff --git a/src/include/access/xlogreader.h b/src/include/access/xlogreader.h index fe3dea47eae..8206baf6caf 100644 --- a/src/include/access/xlogreader.h +++ b/src/include/access/xlogreader.h @@ -443,5 +443,7 @@ extern bool XLogRecGetBlockTagExtended(XLogReaderState *record, uint8 block_id, RelFileNode *rnode, ForkNumber *forknum, BlockNumber *blknum, Buffer *prefetch_buffer); +extern DecodedXLogRecord * +XLogReadRecordAlloc(XLogReaderState *state, size_t xl_tot_len, bool allow_oversized); #endif /* XLOGREADER_H */ From 808318e946edb6cbf7c55f8f3282bed7399fe031 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 5 Oct 2022 11:43:21 +0300 Subject: [PATCH 18/56] Rebase to Stamp 15.0 --- src/backend/storage/buffer/bufmgr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 
464196d803c..8fbe65742a2 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -3775,7 +3775,8 @@ RelationCopyStorageUsingBuffer(RelFileNode srcnode, * relation before starting to copy block by block. */ memset(buf.data, 0, BLCKSZ); - smgrextend(smgropen(dstnode, InvalidBackendId), forkNum, nblocks - 1, + smgrextend(smgropen(dstnode, InvalidBackendId, permanent ? RELPERSISTENCE_PERMANENT + :RELPERSISTENCE_UNLOGGED), forkNum, nblocks - 1, buf.data, true); /* This is a bulk operation, so use buffer access strategies. */ From 52da928a0169b6f0a0fb8619812ae633ed95802b Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 20 Oct 2022 20:01:27 +0300 Subject: [PATCH 19/56] Pin pages with speculative insert tuples to prevent their reconstruction because spec_token is not wal logged (#223) * Pin pages with speculative insert tuples to prevent their reconstruction because spec_token is not wal logged refer ##2587 * Update src/backend/access/heap/heapam.c Co-authored-by: Heikki Linnakangas Co-authored-by: Heikki Linnakangas --- src/backend/access/heap/heapam.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 3aeab9dd335..251b1e2829b 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -2195,7 +2195,18 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, END_CRIT_SECTION(); - UnlockReleaseBuffer(buffer); + if (options & HEAP_INSERT_SPECULATIVE) + { + /* + * NEON: speculative token is not stored in WAL, so if the page is evicted + * from the buffer cache, the token will be lost. To prevent that, we keep the + * buffer pinned. It will be unpinned in heapam_tuple_finish/abort_speculative. 
+ */ + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + } + else + UnlockReleaseBuffer(buffer); + if (vmbuffer != InvalidBuffer) ReleaseBuffer(vmbuffer); @@ -5841,6 +5852,7 @@ heap_finish_speculative(Relation relation, ItemPointer tid) END_CRIT_SECTION(); + ReleaseBuffer(buffer); /* NEON: release buffer pinned by heap_insert */ UnlockReleaseBuffer(buffer); } @@ -5913,6 +5925,16 @@ heap_abort_speculative(Relation relation, ItemPointer tid) elog(ERROR, "attempted to kill a non-speculative tuple"); Assert(!HeapTupleHeaderIsHeapOnly(tp.t_data)); + /* + * NEON: release buffer pinned by heap_insert + * + * This function is also used on the toast tuples of an aborted speculative + * insertion. For those, there is no token on the tuple, and we didn' t keep + * the pin. + */ + if (HeapTupleHeaderIsSpeculative(tp.t_data)) + ReleaseBuffer(buffer); + /* * No need to check for serializable conflicts here. There is never a * need for a combo CID, either. No need to extract replica identity, or @@ -9127,7 +9149,7 @@ heap_xlog_insert(XLogReaderState *record) XLogRecGetBlockTag(record, 0, &target_node, NULL, &blkno); ItemPointerSetBlockNumber(&target_tid, blkno); - ItemPointerSetOffsetNumber(&target_tid, (xlrec->flags & XLH_INSERT_IS_SPECULATIVE) ? 
SpecTokenOffsetNumber : xlrec->offnum); + ItemPointerSetOffsetNumber(&target_tid, xlrec->offnum); /* * The visibility map may need to be fixed even if the heap page is From 534b38a0c78454ed6b0b8b9741c260214731a002 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 24 Oct 2022 12:14:01 +0300 Subject: [PATCH 20/56] Fix shared memory initialization for last written LSN cache (#226) * Fix shared memory initialization for last written LSN cache Replace (from,till) with (from,n_blocks) for SetLastWrittenLSNForBlockRange function * Fast exit from SetLastWrittenLSNForBlockRange for n_blocks == 0 --- src/backend/access/transam/xlog.c | 17 ++++++++++------- src/include/access/xlog.h | 2 +- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 07466bc19d7..1efdbf09f3a 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -4475,7 +4475,7 @@ XLOGShmemInit(void) XLogCtl = (XLogCtlData *) - ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog); + ShmemInitStruct("XLOG Ctl", XLOGCtlShmemSize(), &foundXLog); { static HASHCTL info; @@ -6156,9 +6156,9 @@ GetLastWrittenLSN(RelFileNode rnode, ForkNumber forknum, BlockNumber blkno) * SetLastWrittenLsn with dummy rnode is used by createdb and dbase_redo functions. 
*/ void -SetLastWrittenLSNForBlockRange(XLogRecPtr lsn, RelFileNode rnode, ForkNumber forknum, BlockNumber from, BlockNumber till) +SetLastWrittenLSNForBlockRange(XLogRecPtr lsn, RelFileNode rnode, ForkNumber forknum, BlockNumber from, BlockNumber n_blocks) { - if (lsn == InvalidXLogRecPtr) + if (lsn == InvalidXLogRecPtr || n_blocks == 0) return; LWLockAcquire(LastWrittenLsnLock, LW_EXCLUSIVE); @@ -6173,12 +6173,15 @@ SetLastWrittenLSNForBlockRange(XLogRecPtr lsn, RelFileNode rnode, ForkNumber for BufferTag key; bool found; BlockNumber bucket; + BlockNumber start_bucket; /* inclusive */ + BlockNumber end_bucket; /* exclusive */ + + start_bucket = from / LAST_WRITTEN_LSN_CACHE_BUCKET; + end_bucket = (from + n_blocks + LAST_WRITTEN_LSN_CACHE_BUCKET - 1) / LAST_WRITTEN_LSN_CACHE_BUCKET; key.rnode = rnode; key.forkNum = forknum; - for (bucket = from / LAST_WRITTEN_LSN_CACHE_BUCKET; - bucket <= till / LAST_WRITTEN_LSN_CACHE_BUCKET; - bucket++) + for (bucket = start_bucket; bucket < end_bucket; bucket++) { key.blockNum = bucket; entry = hash_search(lastWrittenLsnCache, &key, HASH_ENTER, &found); @@ -6216,7 +6219,7 @@ SetLastWrittenLSNForBlockRange(XLogRecPtr lsn, RelFileNode rnode, ForkNumber for void SetLastWrittenLSNForBlock(XLogRecPtr lsn, RelFileNode rnode, ForkNumber forknum, BlockNumber blkno) { - SetLastWrittenLSNForBlockRange(lsn, rnode, forknum, blkno, blkno); + SetLastWrittenLSNForBlockRange(lsn, rnode, forknum, blkno, 1); } /* diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index dc3325f9050..82af40f437e 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -259,7 +259,7 @@ extern XLogRecPtr GetLastImportantRecPtr(void); /* neon specifics */ extern void SetLastWrittenLSNForBlock(XLogRecPtr lsn, RelFileNode relfilenode, ForkNumber forknum, BlockNumber blkno); -extern void SetLastWrittenLSNForBlockRange(XLogRecPtr lsn, RelFileNode relfilenode, ForkNumber forknum, BlockNumber from, BlockNumber till); +extern void 
SetLastWrittenLSNForBlockRange(XLogRecPtr lsn, RelFileNode relfilenode, ForkNumber forknum, BlockNumber from, BlockNumber n_blocks); extern void SetLastWrittenLSNForDatabase(XLogRecPtr lsn); extern void SetLastWrittenLSNForRelation(XLogRecPtr lsn, RelFileNode relfilenode, ForkNumber forknum); extern XLogRecPtr GetLastWrittenLSN(RelFileNode relfilenode, ForkNumber forknum, BlockNumber blkno); From 887fd35e87d3b62461880be5edb22c3bcb421d8e Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 28 Oct 2022 10:00:36 +0300 Subject: [PATCH 21/56] Fix upper boundary caculation in the chunks loop in SetLastWrittenLSNForBlockRange (#229) --- src/backend/access/transam/xlog.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 1efdbf09f3a..6bfbb896cf7 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -6177,7 +6177,8 @@ SetLastWrittenLSNForBlockRange(XLogRecPtr lsn, RelFileNode rnode, ForkNumber for BlockNumber end_bucket; /* exclusive */ start_bucket = from / LAST_WRITTEN_LSN_CACHE_BUCKET; - end_bucket = (from + n_blocks + LAST_WRITTEN_LSN_CACHE_BUCKET - 1) / LAST_WRITTEN_LSN_CACHE_BUCKET; + end_bucket = from == REL_METADATA_PSEUDO_BLOCKNO + ? start_bucket + 1 : (from + n_blocks + LAST_WRITTEN_LSN_CACHE_BUCKET - 1) / LAST_WRITTEN_LSN_CACHE_BUCKET; key.rnode = rnode; key.forkNum = forknum; From e336bc9e1b90ff90bcdbb4c82f8b6e84cde9e804 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 28 Oct 2022 16:27:23 +0400 Subject: [PATCH 22/56] Stamp XLP_FIRST_IS_CONTRECORD only if we start writing with page offset. Without this patch, on bootstrap XLP_FIRST_IS_CONTRECORD has been always put on header of a page where WAL writing continues. 
This confuses WAL decoding on safekeepers, making it think decoding starts in the middle of a record, leading to 2022-08-12T17:48:13.816665Z ERROR {tid=37}: query handler for 'START_WAL_PUSH postgresql://no_user:@localhost:15050' failed: failed to run ReceiveWalConn Caused by: 0: failed to process ProposerAcceptorMessage 1: invalid xlog page header: unexpected XLP_FIRST_IS_CONTRECORD at 0/2CF8000 Rebase of a1af529d08497f for v14. --- src/backend/access/transam/xlogrecovery.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index e0807db4a7a..2b3fe682a5e 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -1544,17 +1544,23 @@ FinishWalRecovery(void) } else { - int len = endOfLog % XLOG_BLCKSZ; - char *page = palloc0(len); - XLogRecPtr pageBeginPtr = endOfLog - (endOfLog % XLOG_BLCKSZ); + int offs = endOfLog % XLOG_BLCKSZ; + char *page = palloc0(offs); + XLogRecPtr pageBeginPtr = endOfLog - offs; + int lastPageSize = ((pageBeginPtr % wal_segment_size) == 0) ? SizeOfXLogLongPHD : SizeOfXLogShortPHD; XLogPageHeader xlogPageHdr = (XLogPageHeader) (page); xlogPageHdr->xlp_pageaddr = pageBeginPtr; xlogPageHdr->xlp_magic = XLOG_PAGE_MAGIC; xlogPageHdr->xlp_tli = recoveryTargetTLI; - xlogPageHdr->xlp_info = XLP_FIRST_IS_CONTRECORD; // FIXME - xlogPageHdr->xlp_rem_len = (endOfLog % XLOG_BLCKSZ) - SizeOfXLogShortPHD; + /* + * If we start writing with offset from page beginning, pretend in + * page header there is a record ending where actual data will + * start. + */ + xlogPageHdr->xlp_rem_len = offs - lastPageSize; + xlogPageHdr->xlp_info = (xlogPageHdr->xlp_rem_len > 0) ? 
XLP_FIRST_IS_CONTRECORD : 0; readOff = XLogSegmentOffset(pageBeginPtr, wal_segment_size); result->lastPageBeginPtr = pageBeginPtr; From a1066f4ad8d4af60a353dd0a1e270797eade033a Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 31 Oct 2022 01:11:35 +0100 Subject: [PATCH 23/56] Move walredo process code under pgxn in the main 'neon' repository. - Refactor the way the WalProposerMain function is called when started with --sync-safekeepers. The postgres binary now explicitly loads the 'neon.so' library and calls the WalProposerMain in it. This is simpler than the global function callback "hook" we previously used. - Move the WAL redo process code to a new library, neon_walredo.so, and use the same mechanism as for --sync-safekeepers to call the WalRedoMain function, when launched with --walredo argument. - Also move the seccomp code to neon_walredo.so library. I kept the configure check in the postgres side for now, though. --- src/backend/main/main.c | 43 +- src/backend/postmaster/Makefile | 5 - src/backend/postmaster/bgworker.c | 1 - src/backend/postmaster/postmaster.c | 1 - src/backend/postmaster/seccomp.c | 249 -------- src/backend/replication/Makefile | 3 +- src/backend/replication/walpropcompat.c | 95 --- src/backend/replication/walsender.c | 1 - src/backend/storage/buffer/buf_init.c | 8 + src/backend/tcop/Makefile | 2 - src/backend/tcop/zenith_wal_redo.c | 814 ------------------------ src/backend/utils/misc/guc.c | 1 - src/include/miscadmin.h | 2 +- src/include/postmaster/seccomp.h | 26 - src/include/replication/walpropshim.h | 19 - src/include/storage/buf_internals.h | 2 + src/include/tcop/tcopprot.h | 4 - 17 files changed, 50 insertions(+), 1226 deletions(-) delete mode 100644 src/backend/postmaster/seccomp.c delete mode 100644 src/backend/replication/walpropcompat.c delete mode 100644 src/backend/tcop/zenith_wal_redo.c delete mode 100644 src/include/postmaster/seccomp.h delete mode 100644 src/include/replication/walpropshim.h diff --git 
a/src/backend/main/main.c b/src/backend/main/main.c index 37fa0548cff..2b908cb3cc6 100644 --- a/src/backend/main/main.c +++ b/src/backend/main/main.c @@ -37,9 +37,9 @@ #include "bootstrap/bootstrap.h" #include "common/username.h" +#include "miscadmin.h" #include "port/atomics.h" #include "postmaster/postmaster.h" -#include "replication/walpropshim.h" #include "storage/spin.h" #include "tcop/tcopprot.h" #include "utils/help_config.h" @@ -56,6 +56,41 @@ static void init_locale(const char *categoryname, int category, const char *loca static void help(const char *progname); static void check_root(const char *progname); +typedef int (*MainFunc) (int argc, char *argv[]); + +static int +CallExtMain(char *library_name, char *main_func_name, int argc, char *argv[]) +{ + MainFunc main_func; + + /* + * Perform just enough initialization that we can load external libraries + */ + InitStandaloneProcess(argv[0]); + + SetProcessingMode(InitProcessing); + + /* + * Set default values for command-line options. + */ + InitializeGUCOptions(); + + /* Acquire configuration parameters */ + if (!SelectConfigFiles(NULL, progname)) + exit(1); + + /* + * Imitate we are early in bootstrap loading shared_preload_libraries; + * neon extension sets PGC_POSTMASTER gucs requiring this. + */ + process_shared_preload_libraries_in_progress = true; + + main_func = load_external_function(library_name, main_func_name, true, NULL); + + process_shared_preload_libraries_in_progress = false; + + return main_func(argc, argv); +} /* * Any Postgres server process begins execution here. 
@@ -200,11 +235,9 @@ main(int argc, char *argv[]) PostgresSingleUserMain(argc, argv, strdup(get_user_name_or_exit(progname))); else if (argc > 1 && strcmp(argv[1], "--wal-redo") == 0) - WalRedoMain(argc, argv, - NULL, /* no dbname */ - strdup(get_user_name_or_exit(progname))); /* does not return */ + CallExtMain("neon_walredo", "WalRedoMain", argc, argv); else if (argc > 1 && strcmp(argv[1], "--sync-safekeepers") == 0) - WalProposerSync(argc, argv); + CallExtMain("neon", "WalProposerSync", argc, argv); else PostmasterMain(argc, argv); /* the functions above should not return */ diff --git a/src/backend/postmaster/Makefile b/src/backend/postmaster/Makefile index a4f49350690..3a794e54d60 100644 --- a/src/backend/postmaster/Makefile +++ b/src/backend/postmaster/Makefile @@ -27,9 +27,4 @@ OBJS = \ syslogger.o \ walwriter.o -ifeq ($(with_libseccomp),yes) -OBJS += \ - seccomp.o -endif - include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/postmaster/bgworker.c b/src/backend/postmaster/bgworker.c index 6afcc10e4fc..8dd7d64630c 100644 --- a/src/backend/postmaster/bgworker.c +++ b/src/backend/postmaster/bgworker.c @@ -22,7 +22,6 @@ #include "postmaster/postmaster.h" #include "replication/logicallauncher.h" #include "replication/logicalworker.h" -#include "replication/walpropshim.h" #include "storage/dsm.h" #include "storage/ipc.h" #include "storage/latch.h" diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 5933bc7ff49..892d42c63ee 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -119,7 +119,6 @@ #include "postmaster/syslogger.h" #include "replication/logicallauncher.h" #include "replication/walsender.h" -#include "replication/walpropshim.h" #include "storage/fd.h" #include "storage/ipc.h" #include "storage/pg_shmem.h" diff --git a/src/backend/postmaster/seccomp.c b/src/backend/postmaster/seccomp.c deleted file mode 100644 index 3ac21b02983..00000000000 --- 
a/src/backend/postmaster/seccomp.c +++ /dev/null @@ -1,249 +0,0 @@ -/*------------------------------------------------------------------------- - * - * seccomp.c - * Secure Computing BPF API wrapper. - * - * Pageserver delegates complex WAL decoding duties to postgres, - * which means that the latter might fall victim to carefully designed - * malicious WAL records and start doing harmful things to the system. - * To prevent this, it has been decided to limit possible interactions - * with the outside world using the Secure Computing BPF mode. - * - * We use this mode to disable all syscalls not in the allowlist. This - * approach has its pros & cons: - * - * - We have to carefully handpick and maintain the set of syscalls - * required for the WAL redo process. Core dumps help with that. - * The method of trial and error seems to work reasonably well, - * but it would be nice to find a proper way to "prove" that - * the set in question is both necessary and sufficient. - * - * - Once we enter the seccomp bpf mode, it's impossible to lift those - * restrictions (otherwise, what kind of "protection" would that be?). - * Thus, we have to either enable extra syscalls for the clean shutdown, - * or exit the process immediately via _exit() instead of proc_exit(). - * - * - Should we simply use SCMP_ACT_KILL_PROCESS, or implement a custom - * facility to deal with the forbidden syscalls? If we'd like to embed - * a startup security test, we should go with the latter; In that - * case, which one of the following options is preferable? - * - * * Catch the denied syscalls with a signal handler using SCMP_ACT_TRAP. - * Provide a common signal handler with a static switch to override - * its behavior for the test case. This would undermine the whole - * purpose of such protection, so we'd have to go further and remap - * the memory backing the switch as readonly, then ban mprotect(). - * Ugly and fragile, to say the least. 
- * - * * Yet again, catch the denied syscalls using SCMP_ACT_TRAP. - * Provide 2 different signal handlers: one for a test case, - * another for the main processing loop. Install the first one, - * enable seccomp, perform the test, switch to the second one, - * finally ban sigaction(), presto! - * - * * Spoof the result of a syscall using SECCOMP_RET_ERRNO for the - * test, then ban it altogether with another filter. The downside - * of this solution is that we don't actually check that - * SCMP_ACT_KILL_PROCESS/SCMP_ACT_TRAP works. - * - * Either approach seems to require two eBPF filter programs, - * which is unfortunate: the man page tells this is uncommon. - * Maybe I (@funbringer) am missing something, though; I encourage - * any reader to get familiar with it and scrutinize my conclusions. - * - * TODOs and ideas in no particular order: - * - * - Do something about mmap() in musl's malloc(). - * Definitely not a priority if we don't care about musl. - * - * - See if we can untangle PG's shutdown sequence (involving unlink()): - * - * * Simplify (or rather get rid of) shmem setup in PG's WAL redo mode. - * * Investigate chroot() or mount namespaces for better FS isolation. - * * (Per Heikki) Simply call _exit(), no big deal. - * * Come up with a better idea? - * - * - Make use of seccomp's argument inspection (for what?). - * Unfortunately, it views all syscall arguments as scalars, - * so it won't work for e.g. string comparison in unlink(). - * - * - Benchmark with bpf jit on/off, try seccomp_syscall_priority(). - * - * - Test against various linux distros & glibc versions. - * I suspect that certain libc functions might involve slightly - * different syscalls, e.g. select/pselect6/pselect6_time64/whatever. - * - * - Test on any arch other than amd64 to see if it works there. 
- * - * - * IDENTIFICATION - * src/backend/postmaster/seccomp.c - * - *------------------------------------------------------------------------- - */ - -#include "postgres.h" -#include "miscadmin.h" -#include "postmaster/seccomp.h" - -#include -#include - -static void die(int code, const char *str); - -static bool seccomp_test_sighandler_done = false; -static void seccomp_test_sighandler(int signum, siginfo_t *info, void *cxt); -static void seccomp_deny_sighandler(int signum, siginfo_t *info, void *cxt); - -static int do_seccomp_load_rules(PgSeccompRule *rules, int count, uint32 def_action); - -void seccomp_load_rules(PgSeccompRule *rules, int count) -{ - struct sigaction action = { .sa_flags = SA_SIGINFO }; - PgSeccompRule rule; - long fd; - - /* - * Install a test signal handler. - * XXX: pqsignal() is too restrictive for our purposes, - * since we'd like to examine the contents of siginfo_t. - */ - action.sa_sigaction = seccomp_test_sighandler; - if (sigaction(SIGSYS, &action, NULL) != 0) - ereport(FATAL, - (errcode(ERRCODE_SYSTEM_ERROR), - errmsg("seccomp: could not install test SIGSYS handler"))); - - /* - * First, check that open of a well-known file works. - * XXX: We use raw syscall() to call the very open(). 
- */ - fd = syscall(SCMP_SYS(open), "/dev/null", O_RDONLY, 0); - if (seccomp_test_sighandler_done) - ereport(FATAL, - (errcode(ERRCODE_SYSTEM_ERROR), - errmsg("seccomp: signal handler test flag was set unexpectedly"))); - if (fd < 0) - ereport(FATAL, - (errcode(ERRCODE_SYSTEM_ERROR), - errmsg("seccomp: could not open /dev/null for seccomp testing: %m"))); - close((int) fd); - - /* Set a trap on open() to test seccomp bpf */ - rule = PG_SCMP(open, SCMP_ACT_TRAP); - if (do_seccomp_load_rules(&rule, 1, SCMP_ACT_ALLOW) != 0) - ereport(FATAL, - (errcode(ERRCODE_SYSTEM_ERROR), - errmsg("seccomp: could not load test trap"))); - - /* Finally, check that open() now raises SIGSYS */ - (void) syscall(SCMP_SYS(open), "/dev/null", O_RDONLY, 0); - if (!seccomp_test_sighandler_done) - ereport(FATAL, - (errcode(ERRCODE_SYSTEM_ERROR), - errmsg("seccomp: SIGSYS handler doesn't seem to work"))); - - /* Now that everything seems to work, install a proper handler */ - action.sa_sigaction = seccomp_deny_sighandler; - if (sigaction(SIGSYS, &action, NULL) != 0) - ereport(FATAL, - (errcode(ERRCODE_SYSTEM_ERROR), - errmsg("seccomp: could not install SIGSYS handler"))); - - /* If this succeeds, any syscall not in the list will crash the process */ - if (do_seccomp_load_rules(rules, count, SCMP_ACT_TRAP) != 0) - ereport(FATAL, - (errcode(ERRCODE_SYSTEM_ERROR), - errmsg("seccomp: could not enter seccomp mode"))); -} - -/* - * Enter seccomp mode with a BPF filter that will only allow - * certain syscalls to proceed. 
- */ -static int -do_seccomp_load_rules(PgSeccompRule *rules, int count, uint32 def_action) -{ - scmp_filter_ctx ctx; - int rc = -1; - - /* Create a context with a default action for syscalls not in the list */ - if ((ctx = seccomp_init(def_action)) == NULL) - goto cleanup; - - for (int i = 0; i < count; i++) - { - PgSeccompRule *rule = &rules[i]; - if ((rc = seccomp_rule_add(ctx, rule->psr_action, rule->psr_syscall, 0)) != 0) - goto cleanup; - } - - /* Try building & loading the program into the kernel */ - if ((rc = seccomp_load(ctx)) != 0) - goto cleanup; - -cleanup: - /* - * We don't need the context anymore regardless of the result, - * since either we failed or the eBPF program has already been - * loaded into the linux kernel. - */ - seccomp_release(ctx); - return rc; -} - -static void -die(int code, const char *str) -{ - /* work around gcc ignoring that it shouldn't warn on (void) result being unused */ - ssize_t _unused pg_attribute_unused(); - /* Best effort write to stderr */ - _unused = write(fileno(stderr), str, strlen(str)); - - /* XXX: we don't want to run any atexit callbacks */ - _exit(code); -} - -static void -seccomp_test_sighandler(int signum, siginfo_t *info, void *cxt pg_attribute_unused()) -{ -#define DIE_PREFIX "seccomp test signal handler: " - - /* Check that this signal handler is used only for a single test case */ - if (seccomp_test_sighandler_done) - die(1, DIE_PREFIX "test handler should only be used for 1 test\n"); - seccomp_test_sighandler_done = true; - - if (signum != SIGSYS) - die(1, DIE_PREFIX "bad signal number\n"); - - /* TODO: maybe somehow extract the hardcoded syscall number */ - if (info->si_syscall != SCMP_SYS(open)) - die(1, DIE_PREFIX "bad syscall number\n"); - -#undef DIE_PREFIX -} - -static void -seccomp_deny_sighandler(int signum, siginfo_t *info, void *cxt pg_attribute_unused()) -{ - /* - * Unfortunately, we can't use seccomp_syscall_resolve_num_arch() - * to resolve the syscall's name, since it calls strdup() - * 
under the hood (wtf!). - */ - char buffer[128]; - (void)snprintf(buffer, lengthof(buffer), - "---------------------------------------\n" - "seccomp: bad syscall %d\n" - "---------------------------------------\n", - info->si_syscall); - - /* - * Instead of silently crashing the process with - * a fake SIGSYS caused by SCMP_ACT_KILL_PROCESS, - * we'd like to receive a real SIGSYS to print the - * message and *then* immediately exit. - */ - die(1, buffer); -} diff --git a/src/backend/replication/Makefile b/src/backend/replication/Makefile index 938b1858127..2bffac58c0d 100644 --- a/src/backend/replication/Makefile +++ b/src/backend/replication/Makefile @@ -22,8 +22,7 @@ OBJS = \ syncrep_gram.o \ walreceiver.o \ walreceiverfuncs.o \ - walsender.o \ - walpropcompat.o + walsender.o SUBDIRS = logical diff --git a/src/backend/replication/walpropcompat.c b/src/backend/replication/walpropcompat.c deleted file mode 100644 index 49711c45b4c..00000000000 --- a/src/backend/replication/walpropcompat.c +++ /dev/null @@ -1,95 +0,0 @@ -#include "postgres.h" - -#include -#include -#include - -#include "access/xlog.h" -#include "access/xlog_internal.h" -#include "access/xlogdefs.h" -#include "miscadmin.h" -#include "postmaster/bgworker.h" -#include "postmaster/postmaster.h" -#include "storage/fd.h" -#include "utils/guc.h" -#include "replication/walpropshim.h" - -bool syncSafekeepers = false; -void (*WalProposerInit) (XLogRecPtr flushRecPtr, uint64 systemId) = NULL; -void (*WalProposerStart) (void) = NULL; - -/* - * Entry point for `postgres --sync-safekeepers`. - */ -void -WalProposerSync(int argc, char *argv[]) -{ - struct stat stat_buf; - - syncSafekeepers = true; - - InitStandaloneProcess(argv[0]); - - SetProcessingMode(InitProcessing); - - /* - * Set default values for command-line options. 
- */ - InitializeGUCOptions(); - - /* Acquire configuration parameters */ - if (!SelectConfigFiles(NULL, progname)) - exit(1); - - /* - * Imitate we are early in bootstrap loading shared_preload_libraries; - * zenith extension sets PGC_POSTMASTER gucs requiring this. - */ - process_shared_preload_libraries_in_progress = true; - - /* - * Initialize postmaster_alive_fds as WaitEventSet checks them. - * - * Copied from InitPostmasterDeathWatchHandle() - */ - if (pipe(postmaster_alive_fds) < 0) - ereport(FATAL, - (errcode_for_file_access(), - errmsg_internal("could not create pipe to monitor postmaster death: %m"))); - if (fcntl(postmaster_alive_fds[POSTMASTER_FD_WATCH], F_SETFL, O_NONBLOCK) == -1) - ereport(FATAL, - (errcode_for_socket_access(), - errmsg_internal("could not set postmaster death monitoring pipe to nonblocking mode: %m"))); - - ChangeToDataDir(); - - /* Create pg_wal directory, if it doesn't exist */ - if (stat(XLOGDIR, &stat_buf) != 0) - { - ereport(LOG, (errmsg("creating missing WAL directory \"%s\"", XLOGDIR))); - if (MakePGDirectory(XLOGDIR) < 0) - { - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not create directory \"%s\": %m", - XLOGDIR))); - exit(1); - } - } - - load_file("neon", false); - - if (NULL == WalProposerInit) - elog(ERROR, "Neon failed to register WalProposerInit"); - - if (NULL == WalProposerStart) - elog(ERROR, "Neon failed to register WalProposerStart"); - - WalProposerInit(0, 0); - - process_shared_preload_libraries_in_progress = false; - - BackgroundWorkerUnblockSignals(); - - WalProposerStart(); -} diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 667f94e888f..3fb70ae8712 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -75,7 +75,6 @@ #include "replication/slot.h" #include "replication/snapbuild.h" #include "replication/syncrep.h" -#include "replication/walpropshim.h" #include "replication/walreceiver.h" #include 
"replication/walsender.h" #include "replication/walsender_private.h" diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c index 2862e9e412c..bd772e3ff05 100644 --- a/src/backend/storage/buffer/buf_init.c +++ b/src/backend/storage/buffer/buf_init.c @@ -24,6 +24,14 @@ ConditionVariableMinimallyPadded *BufferIOCVArray; WritebackContext BackendWritebackContext; CkptSortItem *CkptBufferIds; +/* + * Buffer with target WAL redo page. + * We must not evict this page from the buffer pool, but we cannot just keep it pinned because + * some WAL redo functions expect the page to not be pinned. So we have a special check in + * localbuf.c to prevent this buffer from being evicted. + */ +Buffer wal_redo_buffer; +bool am_wal_redo_postgres = false; /* * Data Structures: diff --git a/src/backend/tcop/Makefile b/src/backend/tcop/Makefile index 84f027436a4..f662a7dd1cf 100644 --- a/src/backend/tcop/Makefile +++ b/src/backend/tcop/Makefile @@ -20,6 +20,4 @@ OBJS = \ pquery.o \ utility.o -OBJS += zenith_wal_redo.o - include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/tcop/zenith_wal_redo.c b/src/backend/tcop/zenith_wal_redo.c deleted file mode 100644 index 5578e506077..00000000000 --- a/src/backend/tcop/zenith_wal_redo.c +++ /dev/null @@ -1,814 +0,0 @@ -/*------------------------------------------------------------------------- - * - * zenith_wal_redo.c - * Entry point for WAL redo helper - * - * - * This file contains an alternative main() function for the 'postgres' - * binary. In the special mode, we go into a special mode that's similar - * to the single user mode. We don't launch postmaster or any auxiliary - * processes. Instead, we wait for command from 'stdin', and respond to - * 'stdout'. - * - * The protocol through stdin/stdout is loosely based on the libpq protocol. 
- * The process accepts messages through stdin, and each message has the format: - * - * char msgtype; - * int32 length; // length of message including 'length' but excluding - * // 'msgtype', in network byte order - * - * - * There are three message types: - * - * BeginRedoForBlock ('B'): Prepare for WAL replay for given block - * PushPage ('P'): Copy a page image (in the payload) to buffer cache - * ApplyRecord ('A'): Apply a WAL record (in the payload) - * GetPage ('G'): Return a page image from buffer cache. - * - * Currently, you only get a response to GetPage requests; the response is - * simply a 8k page, without any headers. Errors are logged to stderr. - * - * FIXME: - * - this currently requires a valid PGDATA, and creates a lock file there - * like a normal postmaster. There's no fundamental reason for that, though. - * - should have EndRedoForBlock, and flush page cache, to allow using this - * mechanism for more than one block without restarting the process. - * - * - * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group - * Portions Copyright (c) 1994, Regents of the University of California - * - * - * IDENTIFICATION - * src/backend/tcop/zenith_wal_redo.c - * - *------------------------------------------------------------------------- - */ - -#include "postgres.h" - -#include -#include -#include -#include -#include -#ifdef HAVE_SYS_SELECT_H -#include -#endif -#ifdef HAVE_SYS_RESOURCE_H -#include -#include -#endif - -#if defined(HAVE_LIBSECCOMP) && defined(__GLIBC__) -#define MALLOC_NO_MMAP -#include -#endif - -#ifndef HAVE_GETRUSAGE -#include "rusagestub.h" -#endif - -#include "access/xlog.h" -#include "access/xlog_internal.h" -#include "access/xlogutils.h" -#include "access/xlogrecovery.h" -#include "catalog/pg_class.h" -#include "libpq/libpq.h" -#include "libpq/pqformat.h" -#include "miscadmin.h" -#include "postmaster/postmaster.h" -#include "postmaster/seccomp.h" -#include "storage/buf_internals.h" -#include "storage/bufmgr.h" 
-#include "storage/ipc.h" -#include "storage/proc.h" -#include "storage/smgr.h" -#include "tcop/tcopprot.h" -#include "utils/memutils.h" -#include "utils/ps_status.h" - -static int ReadRedoCommand(StringInfo inBuf); -static void BeginRedoForBlock(StringInfo input_message); -static void PushPage(StringInfo input_message); -static void ApplyRecord(StringInfo input_message); -static void apply_error_callback(void *arg); -static bool redo_block_filter(XLogReaderState *record, uint8 block_id); -static void GetPage(StringInfo input_message); -static ssize_t buffered_read(void *buf, size_t count); - -static BufferTag target_redo_tag; - -Buffer wal_redo_buffer; -bool am_wal_redo_postgres; - -static XLogReaderState *reader_state; - -#define TRACE DEBUG5 - -#ifdef HAVE_LIBSECCOMP -static void -enter_seccomp_mode(void) -{ - PgSeccompRule syscalls[] = - { - /* Hard requirements */ - PG_SCMP_ALLOW(exit_group), - PG_SCMP_ALLOW(pselect6), - PG_SCMP_ALLOW(read), - PG_SCMP_ALLOW(select), - PG_SCMP_ALLOW(write), - - /* Memory allocation */ - PG_SCMP_ALLOW(brk), -#ifndef MALLOC_NO_MMAP - /* TODO: musl doesn't have mallopt */ - PG_SCMP_ALLOW(mmap), - PG_SCMP_ALLOW(munmap), -#endif - /* - * getpid() is called on assertion failure, in ExceptionalCondition. - * It's not really needed, but seems pointless to hide it either. The - * system call unlikely to expose a kernel vulnerability, and the PID - * is stored in MyProcPid anyway. - */ - PG_SCMP_ALLOW(getpid), - - /* Enable those for a proper shutdown. 
- PG_SCMP_ALLOW(munmap), - PG_SCMP_ALLOW(shmctl), - PG_SCMP_ALLOW(shmdt), - PG_SCMP_ALLOW(unlink), // shm_unlink - */ - }; - -#ifdef MALLOC_NO_MMAP - /* Ask glibc not to use mmap() */ - mallopt(M_MMAP_MAX, 0); -#endif - - seccomp_load_rules(syscalls, lengthof(syscalls)); -} -#endif - -/* ---------------------------------------------------------------- - * FIXME comment - * PostgresMain - * postgres main loop -- all backends, interactive or otherwise start here - * - * argc/argv are the command line arguments to be used. (When being forked - * by the postmaster, these are not the original argv array of the process.) - * dbname is the name of the database to connect to, or NULL if the database - * name should be extracted from the command line arguments or defaulted. - * username is the PostgreSQL user name to be used for the session. - * ---------------------------------------------------------------- - */ -void -WalRedoMain(int argc, char *argv[], - const char *dbname, - const char *username) -{ - int firstchar; - StringInfoData input_message; -#ifdef HAVE_LIBSECCOMP - bool enable_seccomp; -#endif - - /* Initialize startup process environment if necessary. */ - InitStandaloneProcess(argv[0]); - - am_wal_redo_postgres = true; - - /* - * Set default values for command-line options. - */ - InitializeGUCOptions(); - - /* - * WAL redo does not need a large number of buffers. And speed of - * DropRelFileNodeAllLocalBuffers() is proportional to the number of - * buffers. So let's keep it small (default value is 1024) - */ - num_temp_buffers = 4; - - /* - * Parse command-line options. - * TODO - */ - //process_postgres_switches(argc, argv, PGC_POSTMASTER, &dbname); - - /* Acquire configuration parameters */ - if (!SelectConfigFiles(NULL, progname)) - proc_exit(1); - - /* - * Validate we have been given a reasonable-looking DataDir and change into it. - */ - checkDataDir(); - ChangeToDataDir(); - - /* - * Create lockfile for data directory. 
- */ - CreateDataDirLockFile(false); - - /* read control file (error checking and contains config ) */ - LocalProcessControlFile(false); - - /* - * process any libraries that should be preloaded at postmaster start - */ - process_shared_preload_libraries(); - - /* Initialize MaxBackends (if under postmaster, was done already) */ - InitializeMaxBackends(); - - /* - * Give preloaded libraries a chance to request additional shared memory. - */ - process_shmem_requests(); - - /* - * Now that loadable modules have had their chance to request additional - * shared memory, determine the value of any runtime-computed GUCs that - * depend on the amount of shared memory required. - */ - InitializeShmemGUCs(); - - /* - * Now that modules have been loaded, we can process any custom resource - * managers specified in the wal_consistency_checking GUC. - */ - InitializeWalConsistencyChecking(); - - CreateSharedMemoryAndSemaphores(); - - /* - * Remember stand-alone backend startup time,roughly at the same point - * during startup that postmaster does so. - */ - PgStartTime = GetCurrentTimestamp(); - - /* - * Create a per-backend PGPROC struct in shared memory. We must do this - * before we can use LWLocks. - */ - InitProcess(); - - SetProcessingMode(InitProcessing); - - /* Early initialization */ - BaseInit(); - - SetProcessingMode(NormalProcessing); - - /* Redo routines won't work if we're not "in recovery" */ - InRecovery = true; - - /* - * Create the memory context we will use in the main loop. - * - * MessageContext is reset once per iteration of the main loop, ie, upon - * completion of processing of each command message from the client. 
- */ - MessageContext = AllocSetContextCreate(TopMemoryContext, - "MessageContext", - ALLOCSET_DEFAULT_SIZES); - - /* we need a ResourceOwner to hold buffer pins */ - Assert(CurrentResourceOwner == NULL); - CurrentResourceOwner = ResourceOwnerCreate(NULL, "wal redo"); - - /* Initialize resource managers */ - for (int rmid = 0; rmid <= RM_MAX_ID; rmid++) - { - if (RmgrTable[rmid].rm_startup != NULL) - RmgrTable[rmid].rm_startup(); - } - reader_state = XLogReaderAllocate(wal_segment_size, NULL, XL_ROUTINE(), NULL); - -#ifdef HAVE_LIBSECCOMP - /* We prefer opt-out to opt-in for greater security */ - enable_seccomp = true; - for (int i = 1; i < argc; i++) - if (strcmp(argv[i], "--disable-seccomp") == 0) - enable_seccomp = false; - - /* - * We deliberately delay the transition to the seccomp mode - * until it's time to enter the main processing loop; - * else we'd have to add a lot more syscalls to the allowlist. - */ - if (enable_seccomp) - enter_seccomp_mode(); -#endif - - /* - * Main processing loop - */ - MemoryContextSwitchTo(MessageContext); - initStringInfo(&input_message); - - for (;;) - { - /* Release memory left over from prior query cycle. */ - resetStringInfo(&input_message); - - set_ps_display("idle"); - - /* - * (3) read a command (loop blocks here) - */ - firstchar = ReadRedoCommand(&input_message); - switch (firstchar) - { - case 'B': /* BeginRedoForBlock */ - BeginRedoForBlock(&input_message); - break; - - case 'P': /* PushPage */ - PushPage(&input_message); - break; - - case 'A': /* ApplyRecord */ - ApplyRecord(&input_message); - break; - - case 'G': /* GetPage */ - GetPage(&input_message); - break; - - /* - * EOF means we're done. Perform normal shutdown. - */ - case EOF: - ereport(LOG, - (errmsg("received EOF on stdin, shutting down"))); - -#ifdef HAVE_LIBSECCOMP - /* - * Skip the shutdown sequence, leaving some garbage behind. - * Hopefully, postgres will clean it up in the next run. 
- * This way we don't have to enable extra syscalls, which is nice. - * See enter_seccomp_mode() above. - */ - if (enable_seccomp) - _exit(0); -#endif - /* - * NOTE: if you are tempted to add more code here, DON'T! - * Whatever you had in mind to do should be set up as an - * on_proc_exit or on_shmem_exit callback, instead. Otherwise - * it will fail to be called during other backend-shutdown - * scenarios. - */ - proc_exit(0); - - default: - ereport(FATAL, - (errcode(ERRCODE_PROTOCOL_VIOLATION), - errmsg("invalid frontend message type %d", - firstchar))); - } - } /* end of input-reading loop */ -} - -/* - * Some debug function that may be handy for now. - */ -pg_attribute_unused() -static char * -pprint_buffer(char *data, int len) -{ - StringInfoData s; - initStringInfo(&s); - appendStringInfo(&s, "\n"); - for (int i = 0; i < len; i++) { - - appendStringInfo(&s, "%02x ", (*(((char *) data) + i) & 0xff) ); - if (i % 32 == 31) { - appendStringInfo(&s, "\n"); - } - } - appendStringInfo(&s, "\n"); - - return s.data; -} - -/* ---------------------------------------------------------------- - * routines to obtain user input - * ---------------------------------------------------------------- - */ - -/* - * Read next command from the client. - * - * the string entered by the user is placed in its parameter inBuf, - * and we act like a Q message was received. - * - * EOF is returned if end-of-file input is seen; time to shut down. 
- * ---------------- - */ -static int -ReadRedoCommand(StringInfo inBuf) -{ - ssize_t ret; - char hdr[1 + sizeof(int32)]; - int qtype; - int32 len; - - /* Read message type and message length */ - ret = buffered_read(hdr, sizeof(hdr)); - if (ret != sizeof(hdr)) - { - if (ret == 0) - return EOF; - else if (ret < 0) - ereport(ERROR, - (errcode(ERRCODE_CONNECTION_FAILURE), - errmsg("could not read message header: %m"))); - else - ereport(ERROR, - (errcode(ERRCODE_PROTOCOL_VIOLATION), - errmsg("unexpected EOF"))); - } - - qtype = hdr[0]; - memcpy(&len, &hdr[1], sizeof(int32)); - len = pg_ntoh32(len); - - if (len < 4) - ereport(ERROR, - (errcode(ERRCODE_PROTOCOL_VIOLATION), - errmsg("invalid message length"))); - - len -= 4; /* discount length itself */ - - /* Read the message payload */ - enlargeStringInfo(inBuf, len); - ret = buffered_read(inBuf->data, len); - if (ret != len) - { - if (ret < 0) - ereport(ERROR, - (errcode(ERRCODE_CONNECTION_FAILURE), - errmsg("could not read message: %m"))); - else - ereport(ERROR, - (errcode(ERRCODE_PROTOCOL_VIOLATION), - errmsg("unexpected EOF"))); - } - inBuf->len = len; - inBuf->data[len] = '\0'; - - return qtype; -} - -/* - * Prepare for WAL replay on given block - */ -static void -BeginRedoForBlock(StringInfo input_message) -{ - RelFileNode rnode; - ForkNumber forknum; - BlockNumber blknum; - SMgrRelation reln; - - /* - * message format: - * - * spcNode - * dbNode - * relNode - * ForkNumber - * BlockNumber - */ - forknum = pq_getmsgbyte(input_message); - rnode.spcNode = pq_getmsgint(input_message, 4); - rnode.dbNode = pq_getmsgint(input_message, 4); - rnode.relNode = pq_getmsgint(input_message, 4); - blknum = pq_getmsgint(input_message, 4); - - INIT_BUFFERTAG(target_redo_tag, rnode, forknum, blknum); - - elog(TRACE, "BeginRedoForBlock %u/%u/%u.%d blk %u", - target_redo_tag.rnode.spcNode, - target_redo_tag.rnode.dbNode, - target_redo_tag.rnode.relNode, - target_redo_tag.forkNum, - target_redo_tag.blockNum); - - reln = 
smgropen(rnode, InvalidBackendId, RELPERSISTENCE_PERMANENT); - if (reln->smgr_cached_nblocks[forknum] == InvalidBlockNumber || - reln->smgr_cached_nblocks[forknum] < blknum + 1) - { - reln->smgr_cached_nblocks[forknum] = blknum + 1; - } -} - -/* - * Receive a page given by the client, and put it into buffer cache. - */ -static void -PushPage(StringInfo input_message) -{ - RelFileNode rnode; - ForkNumber forknum; - BlockNumber blknum; - const char *content; - Buffer buf; - Page page; - - /* - * message format: - * - * spcNode - * dbNode - * relNode - * ForkNumber - * BlockNumber - * 8k page content - */ - forknum = pq_getmsgbyte(input_message); - rnode.spcNode = pq_getmsgint(input_message, 4); - rnode.dbNode = pq_getmsgint(input_message, 4); - rnode.relNode = pq_getmsgint(input_message, 4); - blknum = pq_getmsgint(input_message, 4); - content = pq_getmsgbytes(input_message, BLCKSZ); - - //FIXME assume relpersistence permanent. Is it always true? - buf = ReadBufferWithoutRelcache(rnode, forknum, blknum, RBM_ZERO_AND_LOCK, NULL, true); - wal_redo_buffer = buf; - page = BufferGetPage(buf); - - memcpy(page, content, BLCKSZ); - MarkBufferDirty(buf); /* pro forma */ - UnlockReleaseBuffer(buf); -} - -/* - * Receive a WAL record, and apply it. - * - * All the pages should be loaded into the buffer cache by PushPage calls already. 
- */ -static void -ApplyRecord(StringInfo input_message) -{ - char *errormsg; - XLogRecPtr lsn; - XLogRecord *record; - int nleft; - ErrorContextCallback errcallback; - DecodedXLogRecord *decoded = NULL; - - /* - * message format: - * - * LSN (the *end* of the record) - * record - */ - lsn = pq_getmsgint64(input_message); - - smgrinit(); /* reset inmem smgr state */ - - nleft = input_message->len - input_message->cursor; - /* note: the input must be aligned here */ - record = (XLogRecord *) pq_getmsgbytes(input_message, nleft); - - if (record->xl_tot_len != nleft) - elog(ERROR, "mismatch between record (%d) and message size (%d)", - record->xl_tot_len, nleft); - - /* Setup error traceback support for ereport() */ - errcallback.callback = apply_error_callback; - errcallback.arg = (void *) reader_state; - errcallback.previous = error_context_stack; - error_context_stack = &errcallback; - - XLogBeginRead(reader_state, lsn); - reader_state->ReadRecPtr = lsn; - - decoded = (DecodedXLogRecord *)XLogReadRecordAlloc(reader_state, record->xl_tot_len, true); - - if (!DecodeXLogRecord(reader_state, decoded, record, lsn, &errormsg)) - elog(ERROR, "failed to decode WAL record: %s", errormsg); - else - { - /* Record the location of the next record. */ - decoded->next_lsn = reader_state->NextRecPtr; - - /* - * If it's in the decode buffer, mark the decode buffer space as - * occupied. - */ - if (!decoded->oversized) - { - /* The new decode buffer head must be MAXALIGNed. */ - Assert(decoded->size == MAXALIGN(decoded->size)); - if ((char *) decoded == reader_state->decode_buffer) - reader_state->decode_buffer_tail = reader_state->decode_buffer + decoded->size; - else - reader_state->decode_buffer_tail += decoded->size; - } - - /* Insert it into the queue of decoded records. 
*/ - Assert(reader_state->decode_queue_tail != decoded); - if (reader_state->decode_queue_tail) - reader_state->decode_queue_tail->next = decoded; - reader_state->decode_queue_tail = decoded; - if (!reader_state->decode_queue_head) - reader_state->decode_queue_head = decoded; - - - /* - * Update the pointers to the beginning and one-past-the-end of this - * record, again for the benefit of historical code that expected the - * decoder to track this rather than accessing these fields of the record - * itself. - */ - reader_state->record = reader_state->decode_queue_head; - reader_state->ReadRecPtr = reader_state->record->lsn; - reader_state->EndRecPtr = reader_state->record->next_lsn; - - } - - - /* Ignore any other blocks than the ones the caller is interested in */ - redo_read_buffer_filter = redo_block_filter; - - RmgrTable[record->xl_rmid].rm_redo(reader_state); - - redo_read_buffer_filter = NULL; - - /* Pop the error context stack */ - error_context_stack = errcallback.previous; - - elog(TRACE, "applied WAL record with LSN %X/%X", - (uint32) (lsn >> 32), (uint32) lsn); - if (decoded && decoded->oversized) - pfree(decoded); -} - -/* - * Error context callback for errors occurring during ApplyRecord - */ -static void -apply_error_callback(void *arg) -{ - XLogReaderState *record = (XLogReaderState *) arg; - StringInfoData buf; - - - initStringInfo(&buf); - xlog_outdesc(&buf, record); - - /* translator: %s is a WAL record description */ - errcontext("WAL redo at %X/%X for %s", - LSN_FORMAT_ARGS(record->ReadRecPtr), - buf.data); - - - pfree(buf.data); -} - -static bool -redo_block_filter(XLogReaderState *record, uint8 block_id) -{ - BufferTag target_tag; - - XLogRecGetBlockTag(record, block_id, - &target_tag.rnode, &target_tag.forkNum, &target_tag.blockNum); - - /* - * Can a WAL redo function ever access a relation other than the one that - * it modifies? I don't see why it would. 
- */ - if (!RelFileNodeEquals(target_tag.rnode, target_redo_tag.rnode)) - elog(WARNING, "REDO accessing unexpected page: %u/%u/%u.%u blk %u", - target_tag.rnode.spcNode, target_tag.rnode.dbNode, target_tag.rnode.relNode, target_tag.forkNum, target_tag.blockNum); - - /* - * If this block isn't one we are currently restoring, then return 'true' - * so that this gets ignored - */ - return !BUFFERTAGS_EQUAL(target_tag, target_redo_tag); -} - -/* - * Get a page image back from buffer cache. - * - * After applying some records. - */ -static void -GetPage(StringInfo input_message) -{ - RelFileNode rnode; - ForkNumber forknum; - BlockNumber blknum; - Buffer buf; - Page page; - int tot_written; - - /* - * message format: - * - * spcNode - * dbNode - * relNode - * ForkNumber - * BlockNumber - */ - forknum = pq_getmsgbyte(input_message); - rnode.spcNode = pq_getmsgint(input_message, 4); - rnode.dbNode = pq_getmsgint(input_message, 4); - rnode.relNode = pq_getmsgint(input_message, 4); - blknum = pq_getmsgint(input_message, 4); - - /* FIXME: check that we got a BeginRedoForBlock message or this earlier */ - - - //FIXME assume relpersistence permanent. Is it always true? 
- buf = ReadBufferWithoutRelcache(rnode, forknum, blknum, RBM_NORMAL, NULL, true); - page = BufferGetPage(buf); - /* single thread, so don't bother locking the page */ - - /* Response: Page content */ - tot_written = 0; - do { - ssize_t rc; - - rc = write(STDOUT_FILENO, &page[tot_written], BLCKSZ - tot_written); - if (rc < 0) { - /* If interrupted by signal, just retry */ - if (errno == EINTR) - continue; - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not write to stdout: %m"))); - } - tot_written += rc; - } while (tot_written < BLCKSZ); - - ReleaseBuffer(buf); - DropRelFileNodeAllLocalBuffers(rnode); - - elog(TRACE, "Page sent back for block %u", blknum); -} - - -/* Buffer used by buffered_read() */ -static char stdin_buf[16 * 1024]; -static size_t stdin_len = 0; /* # of bytes in buffer */ -static size_t stdin_ptr = 0; /* # of bytes already consumed */ - -/* - * Like read() on stdin, but buffered. - * - * We cannot use libc's buffered fread(), because it uses syscalls that we - * have disabled with seccomp(). Depending on the platform, it can call - * 'fstat' or 'newfstatat'. 'fstat' is probably harmless, but 'newfstatat' - * seems problematic because it allows interrogating files by path name. - * - * The return value is the number of bytes read. On error, -1 is returned, and - * errno is set appropriately. Unlike read(), this fills the buffer completely - * unless an error happens or EOF is reached. 
- */ -static ssize_t -buffered_read(void *buf, size_t count) -{ - char *dst = buf; - - while (count > 0) - { - size_t nthis; - - if (stdin_ptr == stdin_len) - { - ssize_t ret; - - ret = read(STDIN_FILENO, stdin_buf, sizeof(stdin_buf)); - if (ret < 0) - { - /* don't do anything here that could set 'errno' */ - return ret; - } - if (ret == 0) - { - /* EOF */ - break; - } - stdin_len = (size_t) ret; - stdin_ptr = 0; - } - nthis = Min(stdin_len - stdin_ptr, count); - - memcpy(dst, &stdin_buf[stdin_ptr], nthis); - - stdin_ptr += nthis; - count -= nthis; - dst += nthis; - } - - return (dst - (char *) buf); -} diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index d55b5dbff04..3e1115b1b16 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -86,7 +86,6 @@ #include "replication/syncrep.h" #include "replication/walreceiver.h" #include "replication/walsender.h" -#include "replication/walpropshim.h" #include "storage/bufmgr.h" #include "storage/dsm_impl.h" #include "storage/fd.h" diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 3a33e404e4a..026d7e64786 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -496,7 +496,7 @@ extern PGDLLIMPORT shmem_request_hook_type shmem_request_hook; /* in executor/nodeHash.c */ extern size_t get_hash_memory_limit(void); -/* in src/backend/tcop/zenith_wal_redo.c */ +/* in storage/buffer/buf_init.c */ extern bool am_wal_redo_postgres; #endif /* MISCADMIN_H */ diff --git a/src/include/postmaster/seccomp.h b/src/include/postmaster/seccomp.h deleted file mode 100644 index 1613d34bd47..00000000000 --- a/src/include/postmaster/seccomp.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef PG_SECCOMP_H -#define PG_SECCOMP_H - -#include "postgres.h" - -#ifdef HAVE_LIBSECCOMP -#include -#endif - -typedef struct { - int psr_syscall; /* syscall number */ - uint32 psr_action; /* libseccomp action, e.g. 
SCMP_ACT_ALLOW */ -} PgSeccompRule; - -#define PG_SCMP(syscall, action) \ - (PgSeccompRule) { \ - .psr_syscall = SCMP_SYS(syscall), \ - .psr_action = (action), \ - } - -#define PG_SCMP_ALLOW(syscall) \ - PG_SCMP(syscall, SCMP_ACT_ALLOW) - -void seccomp_load_rules(PgSeccompRule *syscalls, int count); - -#endif /* PG_SECCOMP_H */ diff --git a/src/include/replication/walpropshim.h b/src/include/replication/walpropshim.h deleted file mode 100644 index 07757580cc9..00000000000 --- a/src/include/replication/walpropshim.h +++ /dev/null @@ -1,19 +0,0 @@ -/* - * walpropshim.h - * various hooks for the walproposer component of the Neon extension. - */ - -#ifndef __WALPROPOSER_H__ -#define __WALPROPOSER_H__ - -/* - * Set to true only in standalone run of `postgres --sync-safekeepers`. - * See also the top comment in contrib/neon/walproposer.c - */ -extern PGDLLIMPORT bool syncSafekeepers; -extern PGDLLIMPORT void (*WalProposerInit) (XLogRecPtr flushRecPtr, uint64 systemId); -extern PGDLLIMPORT void (*WalProposerStart) (void); - -void WalProposerSync(int argc, char *argv[]); - -#endif diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index a17e7b28a53..14bf8ca3df4 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -278,6 +278,8 @@ typedef struct WritebackContext extern PGDLLIMPORT BufferDescPadded *BufferDescriptors; extern PGDLLIMPORT WritebackContext BackendWritebackContext; +extern Buffer wal_redo_buffer; + /* in localbuf.c */ extern PGDLLIMPORT BufferDesc *LocalBufferDescriptors; diff --git a/src/include/tcop/tcopprot.h b/src/include/tcop/tcopprot.h index d29674cd93d..70d9dab25b8 100644 --- a/src/include/tcop/tcopprot.h +++ b/src/include/tcop/tcopprot.h @@ -94,8 +94,4 @@ extern bool set_plan_disabling_options(const char *arg, GucContext context, GucSource source); extern const char *get_stats_option_name(const char *arg); -extern void WalRedoMain(int argc, char *argv[], - const char *dbname, - 
const char *username); - #endif /* TCOPPROT_H */ From f34afc95e43d9a7d0ca3adef06cde7a1d6d41621 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 31 Oct 2022 14:00:00 +0100 Subject: [PATCH 24/56] Misc cleanup, mostly to reduce unnecessary differences with upstream. Fix indentation, remove unused definitions, resolve some FIXMEs. --- src/backend/access/spgist/spginsert.c | 1 + src/backend/access/transam/xlog.c | 3 +- src/backend/access/transam/xloginsert.c | 3 -- src/backend/access/transam/xlogprefetcher.c | 3 +- src/backend/access/transam/xlogreader.c | 2 - src/backend/access/transam/xlogrecovery.c | 2 - src/backend/access/transam/xlogutils.c | 1 - src/backend/replication/walsender.c | 44 ++++++++++----------- src/backend/storage/buffer/bufmgr.c | 4 +- src/backend/storage/page/bufpage.c | 1 + src/backend/utils/misc/guc.c | 2 - src/include/access/xlog.h | 2 - src/include/access/xlogreader.h | 2 +- src/include/replication/walsender.h | 15 +++---- src/include/storage/bufmgr.h | 2 - 15 files changed, 39 insertions(+), 48 deletions(-) diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c index 8262a2e4e67..6628958c522 100644 --- a/src/backend/access/spgist/spginsert.c +++ b/src/backend/access/spgist/spginsert.c @@ -107,6 +107,7 @@ spgbuild(Relation heap, Relation index, IndexInfo *indexInfo) SpGistInitBuffer(nullbuffer, SPGIST_LEAF | SPGIST_NULLS); MarkBufferDirty(nullbuffer); + END_CRIT_SECTION(); UnlockReleaseBuffer(metabuffer); diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 6bfbb896cf7..1f8ba364827 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -4740,7 +4740,6 @@ BootStrapXLOG(void) ReadControlFile(); } - static char * str_time(pg_time_t tnow) { @@ -6490,7 +6489,7 @@ LogCheckpointEnd(bool restartpoint) average_sync_time = 0; if (CheckpointStats.ckpt_sync_rels > 0) average_sync_time = CheckpointStats.ckpt_agg_sync_time / - 
CheckpointStats.ckpt_sync_rels; + CheckpointStats.ckpt_sync_rels; average_msecs = (long) ((average_sync_time + 999) / 1000); if (restartpoint) diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index 185c8f59b55..030e4de7123 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -149,9 +149,6 @@ static XLogRecData *XLogRecordAssemble(RmgrId rmid, uint8 info, static bool XLogCompressBackupBlock(char *page, uint16 hole_offset, uint16 hole_length, char *dest, uint16 *dlen); -/* Timeout in milliseconds for delaying WAL inserts to avoid WAL overflow */ -#define MB ((XLogRecPtr)1024*1024) - /* * Begin constructing a WAL record. This must be called before the * XLogRegister* functions and XLogInsert(). diff --git a/src/backend/access/transam/xlogprefetcher.c b/src/backend/access/transam/xlogprefetcher.c index 96404405b93..88b470d6f23 100644 --- a/src/backend/access/transam/xlogprefetcher.c +++ b/src/backend/access/transam/xlogprefetcher.c @@ -720,8 +720,9 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn) * We could try to have a fast path for repeated references to the * same relation (with some scheme to handle invalidations * safely), but for now we'll call smgropen() every time. + * + * Only permanent relations are WAL-logged, so RELPERSISTENCE_PERMANENT. */ - //FIXME what relpersistence should we use here? reln = smgropen(block->rnode, InvalidBackendId, RELPERSISTENCE_PERMANENT); /* diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c index f148e21ff5a..d39f746a177 100644 --- a/src/backend/access/transam/xlogreader.c +++ b/src/backend/access/transam/xlogreader.c @@ -1842,7 +1842,6 @@ DecodeXLogRecord(XLogReaderState *state, } else blk->hole_length = BLCKSZ - blk->bimg_len; - datatotal += blk->bimg_len; /* @@ -1988,7 +1987,6 @@ DecodeXLogRecord(XLogReaderState *state, /* Report the actual size we used. 
*/ decoded->size = MAXALIGN(out - (char *) decoded); - Assert(DecodeXLogRecordRequiredSpace(record->xl_tot_len) >= decoded->size); diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index 2b3fe682a5e..d986384ff02 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -1471,8 +1471,6 @@ FinishWalRecovery(void) * An important side-effect of this is to load the last page into * xlogreader. The caller uses it to initialize the WAL for writing. */ - - if (!InRecovery) { lastRec = CheckPointLoc; diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index 53ac78b4f69..1d4e9992956 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -378,7 +378,6 @@ XLogReadBufferForRedoExtended(XLogReaderState *record, { if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) { - //FIXME assume relpersistence permanent. Is it always true? *buf = ReadBufferWithoutRelcache(rnode, forknum, blkno, mode, NULL, true); return BLK_DONE; diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 3fb70ae8712..d06582346dd 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -2879,7 +2879,7 @@ XLogSendPhysical(void) * * In theory we could make XLogFlush() record a time in shmem whenever WAL * is flushed and we could get that time as well as the LSN when we call - * GetFlushRecPtr(NULL) above (and likewise for the cascading standby + * GetFlushRecPtr() above (and likewise for the cascading standby * equivalent), but rather than putting any new code into the hot WAL path * it seems good enough to capture the time here. We should reach this * after XLogFlush() runs WalSndWakeupProcessRequests(), and although that @@ -2974,8 +2974,8 @@ XLogSendPhysical(void) Assert(nbytes <= MAX_SEND_SIZE); /* - * OK to read and send the slice. 
- */ + * OK to read and send the slice. + */ if (output_message.data) resetStringInfo(&output_message); else @@ -2987,20 +2987,20 @@ XLogSendPhysical(void) pq_sendint64(&output_message, 0); /* sendtime, filled in last */ /* - * Read the log directly into the output buffer to avoid extra memcpy - * calls. - */ + * Read the log directly into the output buffer to avoid extra memcpy + * calls. + */ enlargeStringInfo(&output_message, nbytes); retry: if (!WALRead(xlogreader, - &output_message.data[output_message.len], - startptr, - nbytes, - xlogreader->seg.ws_tli, /* Pass the current TLI because - * only WalSndSegmentOpen controls - * whether new TLI is needed. */ - &errinfo)) + &output_message.data[output_message.len], + startptr, + nbytes, + xlogreader->seg.ws_tli, /* Pass the current TLI because + * only WalSndSegmentOpen controls + * whether new TLI is needed. */ + &errinfo)) WALReadRaiseError(&errinfo); /* See logical_read_xlog_page(). */ @@ -3008,11 +3008,11 @@ XLogSendPhysical(void) CheckXLogRemoved(segno, xlogreader->seg.ws_tli); /* - * During recovery, the currently-open WAL file might be replaced with the - * file of the same name retrieved from archive. So we always need to - * check what we read was valid after reading into the buffer. If it's - * invalid, we try to open and read the file again. - */ + * During recovery, the currently-open WAL file might be replaced with the + * file of the same name retrieved from archive. So we always need to + * check what we read was valid after reading into the buffer. If it's + * invalid, we try to open and read the file again. + */ if (am_cascading_walsender) { WalSnd *walsnd = MyWalSnd; @@ -3035,12 +3035,12 @@ XLogSendPhysical(void) output_message.data[output_message.len] = '\0'; /* - * Fill the send timestamp last, so that it is taken as late as possible. - */ + * Fill the send timestamp last, so that it is taken as late as possible. 
+ */ resetStringInfo(&tmpbuf); pq_sendint64(&tmpbuf, GetCurrentTimestamp()); memcpy(&output_message.data[1 + sizeof(int64) + sizeof(int64)], - tmpbuf.data, sizeof(int64)); + tmpbuf.data, sizeof(int64)); pq_putmessage_noblock('d', output_message.data, output_message.len); @@ -3078,7 +3078,7 @@ XLogSendLogical(void) /* * We'll use the current flush point to determine whether we've caught up. * This variable is static in order to cache it across calls. Caching is - * helpful because GetFlushRecPtr(NULL) needs to acquire a heavily-contended + * helpful because GetFlushRecPtr() needs to acquire a heavily-contended * spinlock. */ static XLogRecPtr flushPtr = InvalidXLogRecPtr; diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 8fbe65742a2..3b83b109f15 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -58,6 +58,7 @@ #include "utils/timestamp.h" #include "replication/walsender.h" + /* Note: these two macros only work on shared buffers, not local ones! */ #define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ)) #define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr))) @@ -806,13 +807,14 @@ ReadBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum, bool hit; SMgrRelation smgr = smgropen(rnode, InvalidBackendId, - RELPERSISTENCE_PERMANENT); + RELPERSISTENCE_PERMANENT); return ReadBuffer_common(smgr, permanent ? 
RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED, forkNum, blockNum, mode, strategy, &hit); } + /* * ReadBuffer_common -- common logic for all ReadBuffer variants * diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c index da63605e9e1..a3d367db511 100644 --- a/src/backend/storage/page/bufpage.c +++ b/src/backend/storage/page/bufpage.c @@ -427,6 +427,7 @@ PageRestoreTempPage(Page tempPage, Page oldPage) pageSize = PageGetPageSize(tempPage); memcpy((char *) oldPage, (char *) tempPage, pageSize); + pfree(tempPage); } diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 3e1115b1b16..cea86f3e6ad 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -193,7 +193,6 @@ static int syslog_facility = 0; static void assign_syslog_facility(int newval, void *extra); static void assign_syslog_ident(const char *newval, void *extra); static void assign_session_replication_role(int newval, void *extra); - static bool check_temp_buffers(int *newval, void **extra, GucSource source); static bool check_bonjour(bool *newval, void **extra, GucSource source); static bool check_ssl(bool *newval, void **extra, GucSource source); @@ -12310,7 +12309,6 @@ assign_session_replication_role(int newval, void *extra) ResetPlanCache(); } - static bool check_temp_buffers(int *newval, void **extra, GucSource source) { diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 82af40f437e..727f674b8bf 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -26,10 +26,8 @@ #define SYNC_METHOD_OPEN_DSYNC 4 /* for O_DSYNC */ extern PGDLLIMPORT int sync_method; - extern PGDLLIMPORT XLogRecPtr ProcLastRecPtr; extern PGDLLIMPORT XLogRecPtr XactLastRecEnd; - extern PGDLLIMPORT XLogRecPtr XactLastCommitEnd; /* diff --git a/src/include/access/xlogreader.h b/src/include/access/xlogreader.h index 8206baf6caf..835428fafd9 100644 --- a/src/include/access/xlogreader.h +++ 
b/src/include/access/xlogreader.h @@ -216,7 +216,7 @@ struct XLogReaderState /* Set when XLP_FIRST_IS_OVERWRITE_CONTRECORD is found */ XLogRecPtr overwrittenRecPtr; - /* Disable validation to allow dumpng corrupt WAL */ + /* Disable validation to allow dumping corrupt WAL */ bool skip_page_validation; bool skip_invalid_records; bool skip_lsn_checks; diff --git a/src/include/replication/walsender.h b/src/include/replication/walsender.h index 08c3ff62603..96ed5ede4b5 100644 --- a/src/include/replication/walsender.h +++ b/src/include/replication/walsender.h @@ -51,7 +51,8 @@ extern void WalSndRqstFileReload(void); /* * Hook to check for WAL receiving backpressure. - * Return value in microseconds */ + * Return value in microseconds + */ extern uint64 (*delay_backend_us)(void); /* expose these so that they can be reused by the neon walproposer extension */ @@ -60,12 +61,12 @@ extern TimeOffset LagTrackerRead(int head, XLogRecPtr lsn, TimestampTz now); extern void ProcessStandbyReply(XLogRecPtr writePtr, XLogRecPtr flushPtr, XLogRecPtr applyPtr, TimestampTz replyTime, bool replyRequested); -void PhysicalConfirmReceivedLocation(XLogRecPtr lsn); -void ProcessStandbyHSFeedback(TimestampTz replyTime, - TransactionId feedbackXmin, - uint32 feedbackEpoch, - TransactionId feedbackCatalogXmin, - uint32 feedbackCatalogEpoch); +extern void PhysicalConfirmReceivedLocation(XLogRecPtr lsn); +extern void ProcessStandbyHSFeedback(TimestampTz replyTime, + TransactionId feedbackXmin, + uint32 feedbackEpoch, + TransactionId feedbackCatalogXmin, + uint32 feedbackCatalogEpoch); /* * Remember that we want to wakeup walsenders later diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index d6a1c6f26fd..708884aa3db 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -230,8 +230,6 @@ extern void BufferGetTag(Buffer buffer, RelFileNode *rnode, extern void MarkBufferDirtyHint(Buffer buffer, bool buffer_std); -extern void MarkBufferPermanent(Buffer 
buffer); - extern void UnlockBuffers(void); extern void LockBuffer(Buffer buffer, int mode); extern bool ConditionalLockBuffer(Buffer buffer); From 87c5bb6438005dfbd02a32e1ca83df2345aa684a Mon Sep 17 00:00:00 2001 From: MMeent Date: Mon, 7 Nov 2022 14:40:32 +0100 Subject: [PATCH 25/56] Optimize prefetch patterns in both heap seqscan and vacuum scans. (#228) Previously, we called PrefetchBuffer [NBlkScanned * seqscan_prefetch_buffers] times in each of those situations, but now only NBlkScanned. In addition, the prefetch mechanism for the vacuum scans is now based on blocks instead of tuples - improving the efficiency. --- src/backend/access/heap/heapam.c | 27 ++++++++--- src/backend/access/heap/vacuumlazy.c | 70 +++++++++++++++++++++++----- src/backend/storage/smgr/smgr.c | 10 ---- src/include/storage/smgr.h | 1 - 4 files changed, 79 insertions(+), 29 deletions(-) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 251b1e2829b..ffc384b4a40 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -399,19 +399,32 @@ heapgetpage(TableScanDesc sscan, BlockNumber page) */ CHECK_FOR_INTERRUPTS(); - /* Prefetch next block */ - if (enable_seqscan_prefetch) + /* Prefetch up to seqscan_prefetch_buffers blocks ahead */ + if (enable_seqscan_prefetch && seqscan_prefetch_buffers > 0) { - int prefetch_limit = seqscan_prefetch_buffers; + uint32 prefetch_limit = seqscan_prefetch_buffers; + BlockNumber prefetch_start = page; ParallelBlockTableScanWorker pbscanwork = scan->rs_parallelworkerdata; + if (pbscanwork != NULL && pbscanwork->phsw_chunk_remaining < prefetch_limit) prefetch_limit = pbscanwork->phsw_chunk_remaining; - if (page + prefetch_limit >= scan->rs_nblocks) - prefetch_limit = scan->rs_nblocks - page - 1; - smgr_reset_prefetch(RelationGetSmgr(scan->rs_base.rs_rd)); + /* + * If this is the first page, initiate prefetch of pages page..page + n. 
+ * On each subsequent call, prefetch the next page that we haven't + * prefetched yet, at page + n. + */ + if (scan->rs_startblock != page) + { + prefetch_start = (page + prefetch_limit - 1) % scan->rs_nblocks; + prefetch_limit = 1; + } + else + prefetch_start = page; + for (int i = 1; i <= prefetch_limit; i++) - PrefetchBuffer(scan->rs_base.rs_rd, MAIN_FORKNUM, page+i); + PrefetchBuffer(scan->rs_base.rs_rd, MAIN_FORKNUM, + (prefetch_start+i) % scan->rs_nblocks); } /* read page using selected strategy */ diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index 2d859777bb5..2f99c446e04 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -973,12 +973,27 @@ lazy_scan_heap(LVRelState *vacrel) */ visibilitymap_pin(vacrel->rel, blkno, &vmbuffer); - if (enable_seqscan_prefetch) + if (enable_seqscan_prefetch && seqscan_prefetch_buffers > 0) { - int prefetch_limit = Min(rel_pages - blkno - 1, seqscan_prefetch_buffers); - smgr_reset_prefetch(RelationGetSmgr(vacrel->rel)); - for (int i = 1; i <= prefetch_limit; i++) - PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, blkno+i); + /* + * If we're starting the scan, we need to prefetch the first N pages. + * If not, we need to only prefetch page blkno+n. + */ + if (blkno == 0) + { + int prefetch_limit = Min(rel_pages - blkno - 1, + seqscan_prefetch_buffers); + + for (int i = 1; i <= prefetch_limit; i++) + PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, blkno+i); + } + else + { + /* No need to prefetch past the end of the relation */ + if (blkno + seqscan_prefetch_buffers < rel_pages) + PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, + blkno + seqscan_prefetch_buffers); + } } /* Finished preparatory checks. Actually scan the page. 
*/ @@ -2404,7 +2419,8 @@ lazy_vacuum_all_indexes(LVRelState *vacrel) static void lazy_vacuum_heap_rel(LVRelState *vacrel) { - int index; + int index, + pindex; BlockNumber vacuumed_pages; Buffer vmbuffer = InvalidBuffer; LVSavedErrInfo saved_err_info; @@ -2425,6 +2441,7 @@ lazy_vacuum_heap_rel(LVRelState *vacrel) vacuumed_pages = 0; index = 0; + pindex = 0; while (index < vacrel->dead_items->num_items) { BlockNumber tblk; @@ -2435,13 +2452,44 @@ lazy_vacuum_heap_rel(LVRelState *vacrel) vacuum_delay_point(); tblk = ItemPointerGetBlockNumber(&vacrel->dead_items->items[index]); - if (enable_seqscan_prefetch) + + if (enable_seqscan_prefetch && seqscan_prefetch_buffers > 0) { - int prefetch_limit = Min(vacrel->dead_items->num_items - index - 1, seqscan_prefetch_buffers); - smgr_reset_prefetch(RelationGetSmgr(vacrel->rel)); - for (int i = 1; i <= prefetch_limit; i++) - PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, ItemPointerGetBlockNumber(&vacrel->dead_items->items[index + i])); + /* + * If we're just starting out, prefetch N consecutive blocks. 
+ * If not, only the next 1 block + */ + if (index == 0) + { + int prefetch_limit = Min(vacrel->dead_items->num_items - 1, + Min(vacrel->rel_pages, + seqscan_prefetch_buffers)); + BlockNumber prev_prefetch = 0; + + while (++pindex < vacrel->dead_items->num_items && + prefetch_limit > 0) + { + ItemPointer ptr = &vacrel->dead_items->items[pindex]; + if (ItemPointerGetBlockNumber(ptr) != prev_prefetch) + { + prev_prefetch = ItemPointerGetBlockNumber(ptr); + prefetch_limit -= 1; + PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, prev_prefetch); + } + } + } + else + { + BlockNumber toPrefetch = ItemPointerGetBlockNumber(&vacrel->dead_items->items[pindex]); + while (pindex < vacrel->dead_items->num_items) + { + if (toPrefetch != ItemPointerGetBlockNumber(&vacrel->dead_items->items[pindex])) + break; + } + PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, toPrefetch); + } } + vacrel->blkno = tblk; buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, tblk, RBM_NORMAL, vacrel->bstrategy); diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index d8a9ffae34b..87260673bc8 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -38,7 +38,6 @@ static const f_smgr smgr_md = { .smgr_unlink = mdunlink, .smgr_extend = mdextend, .smgr_prefetch = mdprefetch, - .smgr_reset_prefetch = md_reset_prefetch, .smgr_read = mdread, .smgr_write = mdwrite, .smgr_writeback = mdwriteback, @@ -532,15 +531,6 @@ smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) return (*reln->smgr).smgr_prefetch(reln, forknum, blocknum); } -/* - * smgr_reset_prefetch() -- Cancel all previos prefetch requests - */ -void -smgr_reset_prefetch(SMgrRelation reln) -{ - (*reln->smgr).smgr_reset_prefetch(reln); -} - /* * smgrread() -- read a particular block from a relation into the supplied * buffer. 
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index bd957785f1b..cf1492a5f99 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -159,7 +159,6 @@ extern void smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync); extern bool smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); -extern void smgr_reset_prefetch(SMgrRelation reln); extern void smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer); extern void smgrwrite(SMgrRelation reln, ForkNumber forknum, From ddd669fde381346b80a2c64ae060bed4f2e2b303 Mon Sep 17 00:00:00 2001 From: MMeent Date: Fri, 11 Nov 2022 14:24:16 +0100 Subject: [PATCH 26/56] Fix prefetch issues in parallel scans and vacuum's cleanup scan (#235) Parallel seqscans didn't take their parallelism into account when determining which block to prefetch, and vacuum's cleanup scan didn't correctly determine which blocks would need to be prefetched, and could get into an infinite loop. 
--- src/backend/access/heap/heapam.c | 74 +++++++++++++++++++++++----- src/backend/access/heap/vacuumlazy.c | 66 ++++++++++++++----------- 2 files changed, 99 insertions(+), 41 deletions(-) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index ffc384b4a40..06cb1ce49e9 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -402,29 +402,77 @@ heapgetpage(TableScanDesc sscan, BlockNumber page) /* Prefetch up to seqscan_prefetch_buffers blocks ahead */ if (enable_seqscan_prefetch && seqscan_prefetch_buffers > 0) { - uint32 prefetch_limit = seqscan_prefetch_buffers; - BlockNumber prefetch_start = page; + int64 nblocks; + int64 rel_scan_start; + int64 rel_scan_end; /* blockno of end of scan (mod scan->rs_nblocks) */ + + int64 prefetch_start; /* start block of prefetch requests this iteration */ + int64 prefetch_end; /* end block of prefetch requests this iteration, if applicable */ ParallelBlockTableScanWorker pbscanwork = scan->rs_parallelworkerdata; - if (pbscanwork != NULL && pbscanwork->phsw_chunk_remaining < prefetch_limit) - prefetch_limit = pbscanwork->phsw_chunk_remaining; + Assert(seqscan_prefetch_buffers > 0); /* - * If this is the first page, initiate prefetch of pages page..page + n. - * On each subsequent call, prefetch the next page that we haven't - * prefetched yet, at page + n. 
+ * Parallel scans look like repeated sequential table scans for + * prefetching; with a scan start at nalloc + ch_remaining - ch_size */ - if (scan->rs_startblock != page) + if (pbscanwork != NULL) { - prefetch_start = (page + prefetch_limit - 1) % scan->rs_nblocks; - prefetch_limit = 1; + rel_scan_start = (BlockNumber) pbscanwork->phsw_nallocated + 1 + + pbscanwork->phsw_chunk_remaining + - pbscanwork->phsw_chunk_size; + rel_scan_end = Min(pbscanwork->phsw_nallocated + pbscanwork->phsw_chunk_remaining, + scan->rs_nblocks); + nblocks = pbscanwork->phsw_nallocated + pbscanwork->phsw_chunk_remaining; } else + { + rel_scan_start = scan->rs_startblock; + rel_scan_end = scan->rs_startblock + scan->rs_nblocks; + nblocks = scan->rs_nblocks; + } + + Assert(rel_scan_start <= page && page <= rel_scan_end); + + /* + * If this is the first page of this seqscan, initiate prefetch of + * pages page..page + n. On each subsequent call, prefetch the next + * page that we haven't prefetched yet, at page + n. 
+ * If this is the last page of the prefetch, + */ + if (rel_scan_start != page) + { + prefetch_start = (page + seqscan_prefetch_buffers - 1); + + prefetch_end = prefetch_start + 1; + + /* If we've wrapped around, add nblocks to get the block number in the [start, end] range */ + if (page < rel_scan_start) + prefetch_start += nblocks; + } + else + { + /* first block we're fetching, cannot have wrapped around yet */ prefetch_start = page; - for (int i = 1; i <= prefetch_limit; i++) - PrefetchBuffer(scan->rs_base.rs_rd, MAIN_FORKNUM, - (prefetch_start+i) % scan->rs_nblocks); + prefetch_end = rel_scan_end; + } + + /* do not prefetch if the only page we're trying to prefetch is past the end of our scan window */ + if (prefetch_start > rel_scan_end) + prefetch_end = 0; + + if (prefetch_end > prefetch_start + seqscan_prefetch_buffers) + prefetch_end = prefetch_start + seqscan_prefetch_buffers; + + while (prefetch_start < prefetch_end) + { + BlockNumber blckno = (prefetch_start % nblocks); + Assert(blckno < nblocks); + Assert(blckno < INT_MAX); + PrefetchBuffer(scan->rs_base.rs_rd, MAIN_FORKNUM, blckno); + prefetch_start += 1; + } } /* read page using selected strategy */ diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index 2f99c446e04..b050efba431 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -848,6 +848,7 @@ lazy_scan_heap(LVRelState *vacrel) BlockNumber rel_pages = vacrel->rel_pages, blkno, next_unskippable_block, + next_prefetch_block, next_failsafe_block = 0, next_fsm_block_to_vacuum = 0; VacDeadItems *dead_items = vacrel->dead_items; @@ -871,6 +872,7 @@ lazy_scan_heap(LVRelState *vacrel) next_unskippable_block = lazy_scan_skip(vacrel, &vmbuffer, 0, &next_unskippable_allvis, &skipping_current_range); + next_prefetch_block = 0; for (blkno = 0; blkno < rel_pages; blkno++) { Buffer buf; @@ -976,24 +978,28 @@ lazy_scan_heap(LVRelState *vacrel) if (enable_seqscan_prefetch && 
seqscan_prefetch_buffers > 0) { /* - * If we're starting the scan, we need to prefetch the first N pages. - * If not, we need to only prefetch page blkno+n. + * Prefetch seqscan_prefetch_buffers blocks ahead */ - if (blkno == 0) - { - int prefetch_limit = Min(rel_pages - blkno - 1, - seqscan_prefetch_buffers); + uint32 prefetch_budget = seqscan_prefetch_buffers; - for (int i = 1; i <= prefetch_limit; i++) - PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, blkno+i); - } - else - { - /* No need to prefetch past the end of the relation */ - if (blkno + seqscan_prefetch_buffers < rel_pages) - PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, - blkno + seqscan_prefetch_buffers); - } + /* never trail behind the current scan */ + if (next_prefetch_block < blkno) + next_prefetch_block = blkno; + + /* but only up to the end of the relation */ + if (prefetch_budget > rel_pages - next_prefetch_block) + prefetch_budget = rel_pages - next_prefetch_block; + + /* And only up to seqscan_prefetch_buffers ahead of the current vacuum scan */ + if (next_prefetch_block + prefetch_budget > blkno + seqscan_prefetch_buffers) + prefetch_budget = blkno + seqscan_prefetch_buffers - next_prefetch_block; + + /* And only up to the next unskippable block */ + if (next_prefetch_block + prefetch_budget > next_unskippable_block) + prefetch_budget = next_unskippable_block - next_prefetch_block; + + for (; prefetch_budget-- > 0; next_prefetch_block++) + PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, next_prefetch_block); } /* Finished preparatory checks. Actually scan the page. */ @@ -2459,34 +2465,38 @@ lazy_vacuum_heap_rel(LVRelState *vacrel) * If we're just starting out, prefetch N consecutive blocks. 
* If not, only the next 1 block */ - if (index == 0) + if (pindex == 0) { - int prefetch_limit = Min(vacrel->dead_items->num_items - 1, - Min(vacrel->rel_pages, - seqscan_prefetch_buffers)); - BlockNumber prev_prefetch = 0; + int prefetch_budget = Min(vacrel->dead_items->num_items, + Min(vacrel->rel_pages, + seqscan_prefetch_buffers)); + BlockNumber prev_prefetch = ItemPointerGetBlockNumber(&vacrel->dead_items->items[pindex]); + PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, prev_prefetch); while (++pindex < vacrel->dead_items->num_items && - prefetch_limit > 0) + prefetch_budget > 0) { ItemPointer ptr = &vacrel->dead_items->items[pindex]; if (ItemPointerGetBlockNumber(ptr) != prev_prefetch) { prev_prefetch = ItemPointerGetBlockNumber(ptr); - prefetch_limit -= 1; + prefetch_budget -= 1; PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, prev_prefetch); } } } - else + else if (pindex < vacrel->dead_items->num_items) { - BlockNumber toPrefetch = ItemPointerGetBlockNumber(&vacrel->dead_items->items[pindex]); - while (pindex < vacrel->dead_items->num_items) + BlockNumber previous = ItemPointerGetBlockNumber(&vacrel->dead_items->items[pindex]); + while (++pindex < vacrel->dead_items->num_items) { - if (toPrefetch != ItemPointerGetBlockNumber(&vacrel->dead_items->items[pindex])) + BlockNumber toPrefetch = ItemPointerGetBlockNumber(&vacrel->dead_items->items[pindex]); + if (previous != toPrefetch) + { + PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, toPrefetch); break; + } } - PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, toPrefetch); } } From 90c4f24585c0e3780b5cd94e361af9166d47e563 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 15 Nov 2022 13:47:57 +0000 Subject: [PATCH 27/56] Fix expected results for regression tests (#238) --- src/test/regress/expected/sysviews.out | 38 +++++++++++++------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index 861d37ed407..dda7da83eaa 100644 
--- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -8,13 +8,13 @@ -- but even a trivial check of count(*) will exercise the normal code path -- through the SRF. select count(*) >= 0 as ok from pg_available_extension_versions; - ok + ok ---- t (1 row) select count(*) >= 0 as ok from pg_available_extensions; - ok + ok ---- t (1 row) @@ -23,27 +23,27 @@ select count(*) >= 0 as ok from pg_available_extensions; -- we test only the existence and basic condition of TopMemoryContext. select name, ident, parent, level, total_bytes >= free_bytes from pg_backend_memory_contexts where level = 0; - name | ident | parent | level | ?column? + name | ident | parent | level | ?column? ------------------+-------+--------+-------+---------- TopMemoryContext | | | 0 | t (1 row) -- At introduction, pg_config had 23 entries; it may grow select count(*) > 20 as ok from pg_config; - ok + ok ---- t (1 row) -- We expect no cursors in this test; see also portals.sql select count(*) = 0 as ok from pg_cursors; - ok + ok ---- t (1 row) select count(*) >= 0 as ok from pg_file_settings; - ok + ok ---- t (1 row) @@ -51,7 +51,7 @@ select count(*) >= 0 as ok from pg_file_settings; -- There will surely be at least one rule, with no errors. select count(*) > 0 as ok, count(*) FILTER (WHERE error IS NOT NULL) = 0 AS no_err from pg_hba_file_rules; - ok | no_err + ok | no_err ----+-------- t | t (1 row) @@ -59,49 +59,49 @@ select count(*) > 0 as ok, count(*) FILTER (WHERE error IS NOT NULL) = 0 AS no_e -- There may be no rules, and there should be no errors. 
select count(*) >= 0 as ok, count(*) FILTER (WHERE error IS NOT NULL) = 0 AS no_err from pg_ident_file_mappings; - ok | no_err + ok | no_err ----+-------- t | t (1 row) -- There will surely be at least one active lock select count(*) > 0 as ok from pg_locks; - ok + ok ---- t (1 row) -- We expect no prepared statements in this test; see also prepare.sql select count(*) = 0 as ok from pg_prepared_statements; - ok + ok ---- t (1 row) -- See also prepared_xacts.sql select count(*) >= 0 as ok from pg_prepared_xacts; - ok + ok ---- t (1 row) -- There will surely be at least one SLRU cache select count(*) > 0 as ok from pg_stat_slru; - ok + ok ---- t (1 row) -- There must be only one record select count(*) = 1 as ok from pg_stat_wal; - ok + ok ---- t (1 row) -- We expect no walreceiver running in this test select count(*) = 0 as ok from pg_stat_wal_receiver; - ok + ok ---- t (1 row) @@ -109,7 +109,7 @@ select count(*) = 0 as ok from pg_stat_wal_receiver; -- This is to record the prevailing planner enable_foo settings during -- a regression test run. select name, setting from pg_settings where name like 'enable%'; - name | setting + name | setting --------------------------------+--------- enable_async_append | on enable_bitmapscan | on @@ -141,13 +141,13 @@ select name, setting from pg_settings where name like 'enable%'; -- (At the time of writing, the actual counts are around 38 because of -- zones using fractional GMT offsets, so this is a pretty loose test.) 
select count(distinct utc_offset) >= 24 as ok from pg_timezone_names; - ok + ok ---- t (1 row) select count(distinct utc_offset) >= 24 as ok from pg_timezone_abbrevs; - ok + ok ---- t (1 row) @@ -155,14 +155,14 @@ select count(distinct utc_offset) >= 24 as ok from pg_timezone_abbrevs; -- Let's check the non-default timezone abbreviation sets, too set timezone_abbreviations = 'Australia'; select count(distinct utc_offset) >= 24 as ok from pg_timezone_abbrevs; - ok + ok ---- t (1 row) set timezone_abbreviations = 'India'; select count(distinct utc_offset) >= 24 as ok from pg_timezone_abbrevs; - ok + ok ---- t (1 row) From 890a699d09f76ade46a6aee6846c95bf43c2cc0e Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 17 Nov 2022 15:31:22 +0200 Subject: [PATCH 28/56] Use prefetch in pg_prewarm extension (#237) * Use prefetch in pg_prewarm extension * Change prefetch order as suggested in review --- contrib/pg_prewarm/pg_prewarm.c | 10 ++++++++-- contrib/pg_prewarm/pg_prewarm.control | 1 + 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/contrib/pg_prewarm/pg_prewarm.c b/contrib/pg_prewarm/pg_prewarm.c index caff5c4a80f..6ac382e165c 100644 --- a/contrib/pg_prewarm/pg_prewarm.c +++ b/contrib/pg_prewarm/pg_prewarm.c @@ -18,6 +18,7 @@ #include "access/relation.h" #include "fmgr.h" #include "miscadmin.h" +#include "optimizer/cost.h" #include "storage/bufmgr.h" #include "storage/smgr.h" #include "utils/acl.h" @@ -183,14 +184,19 @@ pg_prewarm(PG_FUNCTION_ARGS) } else if (ptype == PREWARM_BUFFER) { + BlockNumber prefetch_block = first_block; /* * In buffer mode, we actually pull the data into shared_buffers. 
*/ for (block = first_block; block <= last_block; ++block) { - Buffer buf; - + Buffer buf; + int prefetch_stop = block + Min(last_block - block + 1, seqscan_prefetch_buffers); CHECK_FOR_INTERRUPTS(); + while (prefetch_block < prefetch_stop) + { + PrefetchBuffer(rel, forkNumber, prefetch_block++); + } buf = ReadBufferExtended(rel, forkNumber, block, RBM_NORMAL, NULL); ReleaseBuffer(buf); ++blocks_done; diff --git a/contrib/pg_prewarm/pg_prewarm.control b/contrib/pg_prewarm/pg_prewarm.control index 40e3add4810..d40d1a000b7 100644 --- a/contrib/pg_prewarm/pg_prewarm.control +++ b/contrib/pg_prewarm/pg_prewarm.control @@ -3,3 +3,4 @@ comment = 'prewarm relation data' default_version = '1.2' module_pathname = '$libdir/pg_prewarm' relocatable = true +trusted = true From 6fce20cd50c46498cdfe511776fda79fca47b4d6 Mon Sep 17 00:00:00 2001 From: MMeent Date: Wed, 23 Nov 2022 12:55:14 +0100 Subject: [PATCH 29/56] PG15: Prefetch cleanup (#241) * Update prefetch mechanisms: - **Enable enable_seqscan_prefetch by default** - Store prefetch distance in the relevant scan structs - Slow start sequential scan, to accommodate LIMIT clauses. - Replace seqscan_prefetch_buffer with the relations' tablespaces' *_io_concurrency; and drop seqscan_prefetch_buffer as a result. - Clarify enable_seqscan_prefetch GUC description - Fix prefetch in pg_prewarm - Add prefetching to autoprewarm worker - Fix an issue where we'd incorrectly not prefetch data when hitting a table wraparound. The same issue also resulted in assertion failures in debug builds. - Fix parallel scan prefetching - we didn't take into account that parallel scans have scan synchronization, too. 
--- contrib/pg_prewarm/autoprewarm.c | 35 +++++++++++ contrib/pg_prewarm/pg_prewarm.c | 9 ++- src/backend/access/heap/heapam.c | 86 +++++++++++++++++++------- src/backend/access/heap/vacuumlazy.c | 43 ++++++------- src/backend/optimizer/path/costsize.c | 1 - src/backend/utils/misc/guc.c | 18 +----- src/include/access/heapam.h | 4 ++ src/include/optimizer/cost.h | 1 - src/test/regress/expected/sysviews.out | 2 +- 9 files changed, 135 insertions(+), 64 deletions(-) diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c index c0c4f5d9ca7..b0f9c2e235d 100644 --- a/contrib/pg_prewarm/autoprewarm.c +++ b/contrib/pg_prewarm/autoprewarm.c @@ -54,6 +54,7 @@ #include "utils/rel.h" #include "utils/relfilenodemap.h" #include "utils/resowner.h" +#include "utils/spccache.h" #define AUTOPREWARM_FILE "autoprewarm.blocks" @@ -449,10 +450,12 @@ void autoprewarm_database_main(Datum main_arg) { int pos; + int io_concurrency; BlockInfoRecord *block_info; Relation rel = NULL; BlockNumber nblocks = 0; BlockInfoRecord *old_blk = NULL; + BlockInfoRecord *prefetch_blk = NULL; dsm_segment *seg; /* Establish signal handlers; once that's done, unblock signals. 
*/ @@ -499,6 +502,7 @@ autoprewarm_database_main(Datum main_arg) { relation_close(rel, AccessShareLock); rel = NULL; + io_concurrency = -1; CommitTransactionCommand(); } @@ -518,6 +522,8 @@ autoprewarm_database_main(Datum main_arg) if (!rel) CommitTransactionCommand(); + else + io_concurrency = get_tablespace_maintenance_io_concurrency(rel->rd_rel->reltablespace); } if (!rel) { @@ -550,6 +556,35 @@ autoprewarm_database_main(Datum main_arg) continue; } + /* if prefetching is enabled for this relation */ + if (io_concurrency > 0) + { + /* make prefetch_blk catch up */ + if (blk > prefetch_blk) + { + prefetch_blk = blk; + } + + /* now, prefetch all following blocks */ + while (prefetch_blk <= &block_info[apw_state->prewarm_stop_idx]) + { + /* unless they're of a different relfilenode */ + if (prefetch_blk->filenode != blk->filenode || + prefetch_blk->forknum != blk->forknum || + prefetch_blk->blocknum >= nblocks) + break; + + /* or unless they are more than io_concurrency blocks ahead */ + if (blk + io_concurrency <= prefetch_blk) + break; + + PrefetchBuffer(rel, prefetch_blk->forknum, prefetch_blk->blocknum); + + /* continue with the next block */ + prefetch_blk++; + } + } + /* Prewarm buffer. */ buf = ReadBufferExtended(rel, blk->forknum, blk->blocknum, RBM_NORMAL, NULL); diff --git a/contrib/pg_prewarm/pg_prewarm.c b/contrib/pg_prewarm/pg_prewarm.c index 6ac382e165c..42bff343638 100644 --- a/contrib/pg_prewarm/pg_prewarm.c +++ b/contrib/pg_prewarm/pg_prewarm.c @@ -25,6 +25,7 @@ #include "utils/builtins.h" #include "utils/lsyscache.h" #include "utils/rel.h" +#include "utils/spccache.h" PG_MODULE_MAGIC; @@ -185,13 +186,19 @@ pg_prewarm(PG_FUNCTION_ARGS) else if (ptype == PREWARM_BUFFER) { BlockNumber prefetch_block = first_block; + Oid nspOid; + int io_concurrency; + + nspOid = rel->rd_rel->reltablespace; + io_concurrency = get_tablespace_maintenance_io_concurrency(nspOid); + /* * In buffer mode, we actually pull the data into shared_buffers. 
*/ for (block = first_block; block <= last_block; ++block) { Buffer buf; - int prefetch_stop = block + Min(last_block - block + 1, seqscan_prefetch_buffers); + int prefetch_stop = block + Min(last_block - block + 1, io_concurrency); CHECK_FOR_INTERRUPTS(); while (prefetch_block < prefetch_stop) { diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 06cb1ce49e9..ffb374aa539 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -317,6 +317,27 @@ initscan(HeapScanDesc scan, ScanKey key, bool keep_startblock) scan->rs_startblock = 0; } + if (enable_seqscan_prefetch) + { + /* + * Do not use tablespace setting for catalog scans, as we might have + * the tablespace settings in the catalogs locked already, which + * might result in a deadlock. + */ + if (IsCatalogRelation(scan->rs_base.rs_rd)) + scan->rs_prefetch_maximum = effective_io_concurrency; + else + scan->rs_prefetch_maximum = + get_tablespace_io_concurrency(scan->rs_base.rs_rd->rd_rel->reltablespace); + + scan->rs_prefetch_target = 1; + } + else + { + scan->rs_prefetch_maximum = -1; + scan->rs_prefetch_target = -1; + } + scan->rs_numblocks = InvalidBlockNumber; scan->rs_inited = false; scan->rs_ctup.t_data = NULL; @@ -399,18 +420,18 @@ heapgetpage(TableScanDesc sscan, BlockNumber page) */ CHECK_FOR_INTERRUPTS(); - /* Prefetch up to seqscan_prefetch_buffers blocks ahead */ - if (enable_seqscan_prefetch && seqscan_prefetch_buffers > 0) + /* Prefetch up to io_concurrency blocks ahead */ + if (scan->rs_prefetch_maximum > 0 && scan->rs_nblocks > 1) { int64 nblocks; int64 rel_scan_start; int64 rel_scan_end; /* blockno of end of scan (mod scan->rs_nblocks) */ + int64 scan_pageoff; /* page, but adjusted for scan position as above */ int64 prefetch_start; /* start block of prefetch requests this iteration */ int64 prefetch_end; /* end block of prefetch requests this iteration, if applicable */ ParallelBlockTableScanWorker pbscanwork = 
scan->rs_parallelworkerdata; - - Assert(seqscan_prefetch_buffers > 0); + ParallelBlockTableScanDesc pbscandesc = (ParallelBlockTableScanDesc) sscan->rs_parallel; /* * Parallel scans look like repeated sequential table scans for @@ -418,12 +439,20 @@ heapgetpage(TableScanDesc sscan, BlockNumber page) */ if (pbscanwork != NULL) { - rel_scan_start = (BlockNumber) pbscanwork->phsw_nallocated + 1 - + pbscanwork->phsw_chunk_remaining - - pbscanwork->phsw_chunk_size; - rel_scan_end = Min(pbscanwork->phsw_nallocated + pbscanwork->phsw_chunk_remaining, - scan->rs_nblocks); - nblocks = pbscanwork->phsw_nallocated + pbscanwork->phsw_chunk_remaining; + uint64 start_offset, + end_offset; + + Assert(pbscandesc != NULL); + start_offset = pbscanwork->phsw_nallocated + + pbscanwork->phsw_chunk_remaining + 1 + - pbscanwork->phsw_chunk_size; + end_offset = Min(pbscanwork->phsw_nallocated + + pbscanwork->phsw_chunk_remaining + 1, + pbscandesc->phs_nblocks); + + rel_scan_start = (int64) (pbscandesc->phs_startblock) + start_offset; + rel_scan_end = (int64) (pbscandesc->phs_startblock) + end_offset; + nblocks = pbscandesc->phs_nblocks; } else { @@ -432,7 +461,14 @@ heapgetpage(TableScanDesc sscan, BlockNumber page) nblocks = scan->rs_nblocks; } - Assert(rel_scan_start <= page && page <= rel_scan_end); + prefetch_end = rel_scan_end; + + if ((uint64) page < rel_scan_start) + scan_pageoff = page + nblocks; + else + scan_pageoff = page; + + Assert(rel_scan_start <= scan_pageoff && scan_pageoff <= rel_scan_end); /* * If this is the first page of this seqscan, initiate prefetch of @@ -442,19 +478,12 @@ heapgetpage(TableScanDesc sscan, BlockNumber page) */ if (rel_scan_start != page) { - prefetch_start = (page + seqscan_prefetch_buffers - 1); - + prefetch_start = scan_pageoff + (int64) scan->rs_prefetch_target - 1; prefetch_end = prefetch_start + 1; - - /* If we've wrapped around, add nblocks to get the block number in the [start, end] range */ - if (page < rel_scan_start) - prefetch_start += 
nblocks; } else { - /* first block we're fetching, cannot have wrapped around yet */ - prefetch_start = page; - + prefetch_start = scan_pageoff; prefetch_end = rel_scan_end; } @@ -462,8 +491,11 @@ heapgetpage(TableScanDesc sscan, BlockNumber page) if (prefetch_start > rel_scan_end) prefetch_end = 0; - if (prefetch_end > prefetch_start + seqscan_prefetch_buffers) - prefetch_end = prefetch_start + seqscan_prefetch_buffers; + if (prefetch_end > prefetch_start + scan->rs_prefetch_target) + prefetch_end = prefetch_start + scan->rs_prefetch_target; + + if (prefetch_end > rel_scan_end) + prefetch_end = rel_scan_end; while (prefetch_start < prefetch_end) { @@ -473,6 +505,16 @@ heapgetpage(TableScanDesc sscan, BlockNumber page) PrefetchBuffer(scan->rs_base.rs_rd, MAIN_FORKNUM, blckno); prefetch_start += 1; } + + /* + * Use exponential growth of readahead up to prefetch_maximum, to + * make sure that a low LIMIT does not result in high IO overhead, + * but operations in general are still very fast. + */ + if (scan->rs_prefetch_target < scan->rs_prefetch_maximum / 2) + scan->rs_prefetch_target *= 2; + else if (scan->rs_prefetch_target < scan->rs_prefetch_maximum) + scan->rs_prefetch_target = scan->rs_prefetch_maximum; } /* read page using selected strategy */ diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index b050efba431..0abea50ee16 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -65,6 +65,7 @@ #include "utils/memutils.h" #include "utils/pg_rusage.h" #include "utils/timestamp.h" +#include "utils/spccache.h" /* @@ -145,6 +146,9 @@ typedef struct LVRelState Relation *indrels; int nindexes; + /* prefetch */ + int io_concurrency; + /* Aggressive VACUUM? (must set relfrozenxid >= FreezeLimit) */ bool aggressive; /* Use visibility map to skip? 
(disabled by DISABLE_PAGE_SKIPPING) */ @@ -417,6 +421,8 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, /* Set up high level stuff about rel and its indexes */ vacrel->rel = rel; + vacrel->io_concurrency = + get_tablespace_maintenance_io_concurrency(rel->rd_rel->reltablespace); vac_open_indexes(vacrel->rel, RowExclusiveLock, &vacrel->nindexes, &vacrel->indrels); if (instrument && vacrel->nindexes > 0) @@ -975,12 +981,11 @@ lazy_scan_heap(LVRelState *vacrel) */ visibilitymap_pin(vacrel->rel, blkno, &vmbuffer); - if (enable_seqscan_prefetch && seqscan_prefetch_buffers > 0) - { + if (vacrel->io_concurrency > 0) { /* - * Prefetch seqscan_prefetch_buffers blocks ahead + * Prefetch io_concurrency blocks ahead */ - uint32 prefetch_budget = seqscan_prefetch_buffers; + uint32 prefetch_budget = vacrel->io_concurrency; /* never trail behind the current scan */ if (next_prefetch_block < blkno) @@ -990,9 +995,9 @@ lazy_scan_heap(LVRelState *vacrel) if (prefetch_budget > rel_pages - next_prefetch_block) prefetch_budget = rel_pages - next_prefetch_block; - /* And only up to seqscan_prefetch_buffers ahead of the current vacuum scan */ - if (next_prefetch_block + prefetch_budget > blkno + seqscan_prefetch_buffers) - prefetch_budget = blkno + seqscan_prefetch_buffers - next_prefetch_block; + /* And only up to io_concurrency ahead of the current vacuum scan */ + if (next_prefetch_block + prefetch_budget > blkno + vacrel->io_concurrency) + prefetch_budget = blkno + vacrel->io_concurrency - next_prefetch_block; /* And only up to the next unskippable block */ if (next_prefetch_block + prefetch_budget > next_unskippable_block) @@ -2459,40 +2464,32 @@ lazy_vacuum_heap_rel(LVRelState *vacrel) tblk = ItemPointerGetBlockNumber(&vacrel->dead_items->items[index]); - if (enable_seqscan_prefetch && seqscan_prefetch_buffers > 0) - { + if (vacrel->io_concurrency > 0) { /* * If we're just starting out, prefetch N consecutive blocks. 
* If not, only the next 1 block */ - if (pindex == 0) - { + if (pindex == 0) { int prefetch_budget = Min(vacrel->dead_items->num_items, Min(vacrel->rel_pages, - seqscan_prefetch_buffers)); + vacrel->io_concurrency)); BlockNumber prev_prefetch = ItemPointerGetBlockNumber(&vacrel->dead_items->items[pindex]); PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, prev_prefetch); while (++pindex < vacrel->dead_items->num_items && - prefetch_budget > 0) - { + prefetch_budget > 0) { ItemPointer ptr = &vacrel->dead_items->items[pindex]; - if (ItemPointerGetBlockNumber(ptr) != prev_prefetch) - { + if (ItemPointerGetBlockNumber(ptr) != prev_prefetch) { prev_prefetch = ItemPointerGetBlockNumber(ptr); prefetch_budget -= 1; PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, prev_prefetch); } } - } - else if (pindex < vacrel->dead_items->num_items) - { + } else if (pindex < vacrel->dead_items->num_items) { BlockNumber previous = ItemPointerGetBlockNumber(&vacrel->dead_items->items[pindex]); - while (++pindex < vacrel->dead_items->num_items) - { + while (++pindex < vacrel->dead_items->num_items) { BlockNumber toPrefetch = ItemPointerGetBlockNumber(&vacrel->dead_items->items[pindex]); - if (previous != toPrefetch) - { + if (previous != toPrefetch) { PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, toPrefetch); break; } diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index ae37ac0644b..20cde3d0aff 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -153,7 +153,6 @@ bool enable_parallel_hash = true; bool enable_partition_pruning = true; bool enable_async_append = true; bool enable_seqscan_prefetch = true; -int seqscan_prefetch_buffers = 0; typedef struct { diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index cea86f3e6ad..95bb3d9cd32 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -1015,14 +1015,13 @@ static const unit_conversion time_unit_conversion_table[] = static 
struct config_bool ConfigureNamesBool[] = { { - {"enable_seqscan_prefetch", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Enables the sequence scan next page prefetching."), + {"enable_seqscan_prefetch", PGC_USERSET, RESOURCES_ASYNCHRONOUS, + gettext_noop("Enables prefetching of next pages in sequential scans."), NULL, GUC_EXPLAIN }, &enable_seqscan_prefetch, - false, /* temporary disable to be able to merge in main */ - /* true, */ + true, NULL, NULL, NULL }, { @@ -2204,17 +2203,6 @@ static struct config_bool ConfigureNamesBool[] = static struct config_int ConfigureNamesInt[] = { - { - {"seqscan_prefetch_buffers", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Number of subsequent buffer to be prefetched during sequential scan."), - NULL, - GUC_EXPLAIN - }, - &seqscan_prefetch_buffers, - /* 8, 0, 1000, */ - 0, 0, 1000, /* temporary disable to be able to merge in main */ - NULL, NULL, NULL - }, { {"archive_timeout", PGC_SIGHUP, WAL_ARCHIVING, gettext_noop("Sets the amount of time to wait before forcing a " diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index abf62d9df79..32d2340b795 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -71,6 +71,10 @@ typedef struct HeapScanDescData */ ParallelBlockTableScanWorkerData *rs_parallelworkerdata; + /* prefetch info */ + int rs_prefetch_maximum; /* io_concurrency of tablespace */ + int rs_prefetch_target; /* current readahead target */ + /* these fields only used in page-at-a-time mode and for bitmap scans */ int rs_cindex; /* current tuple's index in vistuples */ int rs_ntuples; /* number of visible tuples on page */ diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index b002c5ff027..d6a15292da6 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -70,7 +70,6 @@ extern PGDLLIMPORT bool enable_parallel_hash; extern PGDLLIMPORT bool enable_partition_pruning; extern PGDLLIMPORT bool enable_async_append; extern 
PGDLLIMPORT bool enable_seqscan_prefetch; -extern PGDLLIMPORT int seqscan_prefetch_buffers; extern PGDLLIMPORT int constraint_exclusion; extern double index_pages_fetched(double tuples_fetched, BlockNumber pages, diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index dda7da83eaa..eef4f7e1d11 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -129,7 +129,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_partitionwise_aggregate | off enable_partitionwise_join | off enable_seqscan | on - enable_seqscan_prefetch | off + enable_seqscan_prefetch | on enable_sort | on enable_tidscan | on (21 rows) From debb9256482793c1b226ddd855b87825792e6fd1 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Mon, 21 Nov 2022 18:27:58 +0200 Subject: [PATCH 30/56] Drop unlogged table in regress test to avoid noise in tests --- src/test/regress/expected/spgist.out | 3 +++ src/test/regress/sql/spgist.sql | 3 +++ 2 files changed, 6 insertions(+) diff --git a/src/test/regress/expected/spgist.out b/src/test/regress/expected/spgist.out index 2e911285600..c371e04a795 100644 --- a/src/test/regress/expected/spgist.out +++ b/src/test/regress/expected/spgist.out @@ -94,3 +94,6 @@ select box(point(i,j)) from generate_series(1,100,5) i, generate_series(1,10,5) j; -- leave this table around, to help in testing dump/restore +-- NEON: In Neon unlogged tables are wiped away on node restart +-- so drop the table to keep Neon tests clean. 
+drop table spgist_unlogged_tbl; diff --git a/src/test/regress/sql/spgist.sql b/src/test/regress/sql/spgist.sql index 4828ede68c3..9d6394516a2 100644 --- a/src/test/regress/sql/spgist.sql +++ b/src/test/regress/sql/spgist.sql @@ -89,3 +89,6 @@ select box(point(i,j)) from generate_series(1,100,5) i, generate_series(1,10,5) j; -- leave this table around, to help in testing dump/restore +-- NEON: In Neon unlogged tables are wiped away on node restart +-- so drop the table to keep Neon tests clean. +drop table spgist_unlogged_tbl; From 85bd2ad32aad2a622969d388d31ec42164209ca3 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 24 Nov 2022 11:40:11 +0200 Subject: [PATCH 31/56] Do not produce open file error for unlogged relations (#240) --- src/backend/storage/smgr/md.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 34dfe7ff97e..80554e54b3c 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -815,7 +815,13 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum) BlockNumber nblocks; BlockNumber segno; - mdopenfork(reln, forknum, EXTENSION_FAIL); + /* NEON: md smgr is used in Neon for unlogged and temp relations. + * After compute node restart their data is deleted but unlogged tables are still present in system catalog. + * This is a difference with Vanilla Postgres where unlogged relations are truncated only after abnormal termination. + * To avoid "could not open file" we have to use EXTENSION_RETURN_NULL here instead of EXTENSION_FAIL + */ + if (!mdopenfork(reln, forknum, RelFileNodeBackendIsTemp(reln->smgr_rnode) ?
EXTENSION_FAIL : EXTENSION_RETURN_NULL)) + return 0; /* mdopen has opened the first segment */ Assert(reln->md_num_open_segs[forknum] > 0); From 6f72c84e2e6855055f3f0de6c29198d1b648d3d0 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 24 Nov 2022 11:45:40 +0200 Subject: [PATCH 32/56] =?UTF-8?q?Maintain=20last=20written=20LSN=20for=20e?= =?UTF-8?q?ach=20page=20to=20enable=20prefetch=20on=20vacuum,=E2=80=A6=20(?= =?UTF-8?q?#245)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Maintain last written LSN for each page to enable prefetch on vacuum, delete and other massive update operations * Move PageSetLSN in heap_xlog_visible before MarkBufferDirty --- src/backend/access/heap/heapam.c | 12 ++++++++++-- src/backend/access/transam/xlog.c | 26 +++++++++----------------- src/backend/utils/misc/guc.c | 4 ++-- 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index ffb374aa539..b46fc6428a6 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -8986,8 +8986,16 @@ heap_xlog_visible(XLogReaderState *record) PageSetAllVisible(page); - if (XLogHintBitIsNeeded()) - PageSetLSN(page, lsn); + /* + * NEON: despite to the comment above we need to update page LSN here. + * See discussion at hackers: https://www.postgresql.org/message-id/flat/039076d4f6cdd871691686361f83cb8a6913a86a.camel%40j-davis.com#101ba42b004f9988e3d54fce26fb3462 + * For Neon this assignment is critical because otherwise last written LSN tracked at compute doesn't + * match with page LSN assignee by WAL-redo and as a result, prefetched page is rejected. + * + * It is fixed in upstream in https://github.com/neondatabase/postgres/commit/7bf713dd2d0739fbcd4103971ed69c17ebe677ea + * but until it is merged we still need to carry a patch here. 
+ */ + PageSetLSN(page, lsn); MarkBufferDirty(buffer); } diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 1f8ba364827..da2e4497e34 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -211,7 +211,7 @@ typedef struct LastWrittenLsnCacheEntry /* - * Cache of last written LSN for each relation chunk (hash bucket). + * Cache of last written LSN for each relation page. * Also to provide request LSN for smgrnblocks, smgrexists there is pseudokey=InvalidBlockId which stores LSN of last * relation metadata update. * Size of the cache is limited by GUC variable lastWrittenLsnCacheSize ("lsn_cache_size"), @@ -606,8 +606,6 @@ static WALInsertLockPadded *WALInsertLocks = NULL; */ static ControlFileData *ControlFile = NULL; -#define LAST_WRITTEN_LSN_CACHE_BUCKET 1024 /* blocks = 8Mb */ - /* * Calculate the amount of space left on the page after 'endptr'. Beware * multiple evaluation! @@ -6104,7 +6102,7 @@ GetInsertRecPtr(void) * It returns an upper bound for the last written LSN of a given page, * either from a cached last written LSN or a global maximum last written LSN. * If rnode is InvalidOid then we calculate maximum among all cached LSN and maxLastWrittenLsn. - * If cache is large enough ,iterting through all hash items may be rather expensive. + * If cache is large enough, iterating through all hash items may be rather expensive. * But GetLastWrittenLSN(InvalidOid) is used only by zenith_dbsize which is not performance critical. 
*/ XLogRecPtr @@ -6123,7 +6121,7 @@ GetLastWrittenLSN(RelFileNode rnode, ForkNumber forknum, BlockNumber blkno) BufferTag key; key.rnode = rnode; key.forkNum = forknum; - key.blockNum = blkno / LAST_WRITTEN_LSN_CACHE_BUCKET; + key.blockNum = blkno; entry = hash_search(lastWrittenLsnCache, &key, HASH_FIND, NULL); if (entry != NULL) lsn = entry->lsn; @@ -6147,9 +6145,9 @@ GetLastWrittenLSN(RelFileNode rnode, ForkNumber forknum, BlockNumber blkno) /* * SetLastWrittenLSNForBlockRange -- Set maximal LSN of written page range. * We maintain cache of last written LSNs with limited size and LRU replacement - * policy. To reduce cache size we store max LSN not for each page, but for - * bucket (1024 blocks). This cache allows to use old LSN when - * requesting pages of unchanged or appended relations. + * policy. Keeping last written LSN for each page allows to use old LSN when + * requesting pages of unchanged or appended relations. Also it is critical for + * efficient work of prefetch in case massive update operations (like vacuum or remove). * * rnode.relNode can be InvalidOid, in this case maxLastWrittenLsn is updated. * SetLastWrittenLsn with dummy rnode is used by createdb and dbase_redo functions. @@ -6171,19 +6169,13 @@ SetLastWrittenLSNForBlockRange(XLogRecPtr lsn, RelFileNode rnode, ForkNumber for LastWrittenLsnCacheEntry* entry; BufferTag key; bool found; - BlockNumber bucket; - BlockNumber start_bucket; /* inclusive */ - BlockNumber end_bucket; /* exclusive */ - - start_bucket = from / LAST_WRITTEN_LSN_CACHE_BUCKET; - end_bucket = from == REL_METADATA_PSEUDO_BLOCKNO - ? 
start_bucket + 1 : (from + n_blocks + LAST_WRITTEN_LSN_CACHE_BUCKET - 1) / LAST_WRITTEN_LSN_CACHE_BUCKET; + BlockNumber i; key.rnode = rnode; key.forkNum = forknum; - for (bucket = start_bucket; bucket < end_bucket; bucket++) + for (i = 0; i < n_blocks; i++) { - key.blockNum = bucket; + key.blockNum = from + i; entry = hash_search(lastWrittenLsnCache, &key, HASH_ENTER, &found); if (found) { diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 95bb3d9cd32..3eced8fa29c 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2440,11 +2440,11 @@ static struct config_int ConfigureNamesInt[] = { {"lsn_cache_size", PGC_POSTMASTER, UNGROUPED, - gettext_noop("Size of las written LSN cache used by Neon."), + gettext_noop("Size of last written LSN cache used by Neon."), NULL }, &lastWrittenLsnCacheSize, - 1024, 10, 1000000, /* 1024 is enough to hold 10GB database with 8Mb bucket */ + 128*1024, 1024, INT_MAX, NULL, NULL, NULL }, From dcdb9ee21d80e02bf9b06cdc70ebbaa7589a8525 Mon Sep 17 00:00:00 2001 From: MMeent Date: Mon, 5 Dec 2022 16:19:18 +0100 Subject: [PATCH 33/56] Prefetch cleanup: (#246) - Prefetch the pages in index vacuum's sequential scans Implemented in NBTREE, GIST and SP-GIST. BRIN does not have a 2nd phase of vacuum, and both GIN and HASH clean up their indexes in a non-seqscan fashion: GIN scans the btree from left to right, and HASH only scans the initial buckets sequentially. 
--- contrib/pg_prewarm/pg_prewarm.c | 3 ++- src/backend/access/gist/gistvacuum.c | 20 ++++++++++++++++++++ src/backend/access/hash/hash.c | 16 ++++++++++++++++ src/backend/access/heap/vacuumlazy.c | 25 +++++++++++++++++-------- src/backend/access/nbtree/nbtree.c | 18 ++++++++++++++++++ src/backend/access/spgist/spgvacuum.c | 22 +++++++++++++++++++++- 6 files changed, 94 insertions(+), 10 deletions(-) diff --git a/contrib/pg_prewarm/pg_prewarm.c b/contrib/pg_prewarm/pg_prewarm.c index 42bff343638..b68f81d34d3 100644 --- a/contrib/pg_prewarm/pg_prewarm.c +++ b/contrib/pg_prewarm/pg_prewarm.c @@ -198,7 +198,8 @@ pg_prewarm(PG_FUNCTION_ARGS) for (block = first_block; block <= last_block; ++block) { Buffer buf; - int prefetch_stop = block + Min(last_block - block + 1, io_concurrency); + BlockNumber prefetch_stop = block + Min(last_block - block + 1, + io_concurrency); CHECK_FOR_INTERRUPTS(); while (prefetch_block < prefetch_stop) { diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c index f190decdff2..3af31e9dcae 100644 --- a/src/backend/access/gist/gistvacuum.c +++ b/src/backend/access/gist/gistvacuum.c @@ -23,6 +23,7 @@ #include "storage/indexfsm.h" #include "storage/lmgr.h" #include "utils/memutils.h" +#include "utils/spccache.h" /* Working state needed by gistbulkdelete */ typedef struct @@ -130,8 +131,14 @@ gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, BlockNumber num_pages; bool needLock; BlockNumber blkno; + BlockNumber prefetch_blkno; + int io_concurrency; MemoryContext oldctx; + io_concurrency = get_tablespace_maintenance_io_concurrency( + rel->rd_rel->reltablespace + ); + /* * Reset fields that track information about the entire index now. 
This * avoids double-counting in the case where a single VACUUM command @@ -209,6 +216,7 @@ gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, needLock = !RELATION_IS_LOCAL(rel); blkno = GIST_ROOT_BLKNO; + prefetch_blkno = blkno; for (;;) { /* Get the current relation length */ @@ -221,9 +229,21 @@ gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, /* Quit if we've scanned the whole relation */ if (blkno >= num_pages) break; + + if (prefetch_blkno < blkno) + prefetch_blkno = blkno; + for (; prefetch_blkno < num_pages && + prefetch_blkno < blkno + io_concurrency; prefetch_blkno++) + PrefetchBuffer(rel, MAIN_FORKNUM, prefetch_blkno); + /* Iterate over pages, then loop back to recheck length */ for (; blkno < num_pages; blkno++) + { + if (io_concurrency > 0 && prefetch_blkno < num_pages) + PrefetchBuffer(rel, MAIN_FORKNUM, prefetch_blkno++); + gistvacuumpage(&vstate, blkno, blkno); + } } /* diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index c361509d68d..07dc943361d 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -32,6 +32,7 @@ #include "utils/builtins.h" #include "utils/index_selfuncs.h" #include "utils/rel.h" +#include "utils/spccache.h" /* Working state for hashbuild and its callback */ typedef struct @@ -466,13 +467,17 @@ hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, Bucket orig_maxbucket; Bucket cur_maxbucket; Bucket cur_bucket; + Bucket prf_bucket; Buffer metabuf = InvalidBuffer; HashMetaPage metap; HashMetaPage cachedmetap; + int io_concurrency; tuples_removed = 0; num_index_tuples = 0; + io_concurrency = get_tablespace_maintenance_io_concurrency(rel->rd_rel->reltablespace); + /* * We need a copy of the metapage so that we can use its hashm_spares[] * values to compute bucket page addresses, but a cached copy should be @@ -487,9 +492,14 @@ hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, /* Scan the buckets that we know 
exist */ cur_bucket = 0; + prf_bucket = cur_bucket; cur_maxbucket = orig_maxbucket; loop_top: + for (; prf_bucket <= cur_maxbucket && + prf_bucket < cur_bucket + io_concurrency; prf_bucket++) + PrefetchBuffer(rel, MAIN_FORKNUM, BUCKET_TO_BLKNO(cachedmetap, prf_bucket)); + while (cur_bucket <= cur_maxbucket) { BlockNumber bucket_blkno; @@ -500,6 +510,12 @@ hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, Page page; bool split_cleanup = false; + if (io_concurrency > 0 && prf_bucket <= cur_maxbucket) + { + PrefetchBuffer(rel, MAIN_FORKNUM, BUCKET_TO_BLKNO(cachedmetap, prf_bucket)); + prf_bucket++; + } + /* Get address of bucket's start page */ bucket_blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket); diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index 0abea50ee16..86a24a630d7 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -981,7 +981,8 @@ lazy_scan_heap(LVRelState *vacrel) */ visibilitymap_pin(vacrel->rel, blkno, &vmbuffer); - if (vacrel->io_concurrency > 0) { + if (vacrel->io_concurrency > 0) + { /* * Prefetch io_concurrency blocks ahead */ @@ -2464,12 +2465,14 @@ lazy_vacuum_heap_rel(LVRelState *vacrel) tblk = ItemPointerGetBlockNumber(&vacrel->dead_items->items[index]); - if (vacrel->io_concurrency > 0) { + if (vacrel->io_concurrency > 0) + { /* * If we're just starting out, prefetch N consecutive blocks. 
* If not, only the next 1 block */ - if (pindex == 0) { + if (pindex == 0) + { int prefetch_budget = Min(vacrel->dead_items->num_items, Min(vacrel->rel_pages, vacrel->io_concurrency)); @@ -2477,19 +2480,25 @@ lazy_vacuum_heap_rel(LVRelState *vacrel) PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, prev_prefetch); while (++pindex < vacrel->dead_items->num_items && - prefetch_budget > 0) { + prefetch_budget > 0) + { ItemPointer ptr = &vacrel->dead_items->items[pindex]; - if (ItemPointerGetBlockNumber(ptr) != prev_prefetch) { + if (ItemPointerGetBlockNumber(ptr) != prev_prefetch) + { prev_prefetch = ItemPointerGetBlockNumber(ptr); prefetch_budget -= 1; PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, prev_prefetch); } } - } else if (pindex < vacrel->dead_items->num_items) { + } + else if (pindex < vacrel->dead_items->num_items) + { BlockNumber previous = ItemPointerGetBlockNumber(&vacrel->dead_items->items[pindex]); - while (++pindex < vacrel->dead_items->num_items) { + while (++pindex < vacrel->dead_items->num_items) + { BlockNumber toPrefetch = ItemPointerGetBlockNumber(&vacrel->dead_items->items[pindex]); - if (previous != toPrefetch) { + if (previous != toPrefetch) + { PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, toPrefetch); break; } diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 1419476d704..f5c11e31a1c 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -37,6 +37,7 @@ #include "utils/builtins.h" #include "utils/index_selfuncs.h" #include "utils/memutils.h" +#include "utils/spccache.h" /* @@ -908,6 +909,8 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, BTVacState vstate; BlockNumber num_pages; BlockNumber scanblkno; + BlockNumber prefetch_blkno; + int io_concurrency; bool needLock; /* @@ -947,6 +950,9 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, vstate.maxbufsize = 0; vstate.pendingpages = NULL; vstate.npendingpages = 0; + + io_concurrency = 
get_tablespace_maintenance_io_concurrency(rel->rd_rel->reltablespace); + /* Consider applying _bt_pendingfsm_finalize optimization */ _bt_pendingfsm_init(rel, &vstate, (callback == NULL)); @@ -975,6 +981,8 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, needLock = !RELATION_IS_LOCAL(rel); scanblkno = BTREE_METAPAGE + 1; + prefetch_blkno = scanblkno; + for (;;) { /* Get the current relation length */ @@ -991,9 +999,19 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, /* Quit if we've scanned the whole relation */ if (scanblkno >= num_pages) break; + + if (prefetch_blkno < scanblkno) + prefetch_blkno = scanblkno; + for (; prefetch_blkno < num_pages && + prefetch_blkno < scanblkno + io_concurrency; prefetch_blkno++) + PrefetchBuffer(rel, MAIN_FORKNUM, prefetch_blkno); + /* Iterate over pages, then loop back to recheck length */ for (; scanblkno < num_pages; scanblkno++) { + if (io_concurrency > 0 && prefetch_blkno < num_pages) + PrefetchBuffer(rel, MAIN_FORKNUM, prefetch_blkno++); + btvacuumpage(&vstate, scanblkno); if (info->report_progress) pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE, diff --git a/src/backend/access/spgist/spgvacuum.c b/src/backend/access/spgist/spgvacuum.c index 00496305320..74b8f988e63 100644 --- a/src/backend/access/spgist/spgvacuum.c +++ b/src/backend/access/spgist/spgvacuum.c @@ -27,6 +27,7 @@ #include "storage/indexfsm.h" #include "storage/lmgr.h" #include "utils/snapmgr.h" +#include "utils/spccache.h" /* Entry in pending-list of TIDs we need to revisit */ @@ -796,7 +797,14 @@ spgvacuumscan(spgBulkDeleteState *bds) Relation index = bds->info->index; bool needLock; BlockNumber num_pages, - blkno; + blkno, + prefetch_blkno; + int io_concurrency; + + /* initiate concurrency */ + io_concurrency = get_tablespace_maintenance_io_concurrency( + index->rd_rel->reltablespace + ); /* Finish setting up spgBulkDeleteState */ initSpGistState(&bds->spgstate, index); @@ -824,6 +832,8 @@ 
spgvacuumscan(spgBulkDeleteState *bds) * in btvacuumscan(). */ blkno = SPGIST_METAPAGE_BLKNO + 1; + prefetch_blkno = blkno; + for (;;) { /* Get the current relation length */ @@ -836,9 +846,19 @@ spgvacuumscan(spgBulkDeleteState *bds) /* Quit if we've scanned the whole relation */ if (blkno >= num_pages) break; + + if (prefetch_blkno < blkno) + prefetch_blkno = blkno; + for (; prefetch_blkno < num_pages && + prefetch_blkno < blkno + io_concurrency; prefetch_blkno++) + PrefetchBuffer(index, MAIN_FORKNUM, prefetch_blkno); + /* Iterate over pages, then loop back to recheck length */ for (; blkno < num_pages; blkno++) { + if (io_concurrency > 0 && prefetch_blkno < num_pages) + PrefetchBuffer(index, MAIN_FORKNUM, prefetch_blkno++); + spgvacuumpage(bds, blkno); /* empty the pending-list after each page */ if (bds->pendingList != NULL) From fc754e5d7514b8624b51cc2ea0d4643808395ad0 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 8 Dec 2022 17:44:29 +0200 Subject: [PATCH 34/56] Set lsn fix v15 (#252) * Show prefetch statistic in EXPLAIN refer #2994 * Update heap pge LSN in case of VM changes even if wal_redo_hints=off refer #2807 * Undo occasional changes * Undo occasional changes --- src/backend/access/heap/visibilitymap.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c index 5332488bbe5..669a65b04fc 100644 --- a/src/backend/access/heap/visibilitymap.c +++ b/src/backend/access/heap/visibilitymap.c @@ -290,7 +290,9 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf, * If data checksums are enabled (or wal_log_hints=on), we * need to protect the heap page from being torn. 
*/ + /* NEON: we have to update page LSN even if wal_log_hints=off if (XLogHintBitIsNeeded()) + */ { Page heapPage = BufferGetPage(heapBuf); From 7cb2db7fc8563fa04daf7b8126076c0a9a9df0bb Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 12 Jan 2023 15:11:47 +0200 Subject: [PATCH 35/56] Show prefetch statistic in EXPLAIN (#249) * Show prefetch statistic in EXPLAIN refer #2994 * Collect per-node prefetch statistics * Show number of prefetch duplicates in explain --- src/backend/commands/explain.c | 38 +++++++++++++++++++++++++++++-- src/backend/executor/instrument.c | 8 +++++++ src/include/commands/explain.h | 1 + src/include/executor/instrument.h | 9 ++++++++ 4 files changed, 54 insertions(+), 2 deletions(-) diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 060c6186ddd..1bd8fe7616a 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -47,7 +47,6 @@ ExplainOneQuery_hook_type ExplainOneQuery_hook = NULL; /* Hook for plugins to get control in explain_get_index_name() */ explain_get_index_name_hook_type explain_get_index_name_hook = NULL; - /* OR-able flags for ExplainXMLTag() */ #define X_OPENING 0 #define X_CLOSING 1 @@ -121,6 +120,7 @@ static void show_eval_params(Bitmapset *bms_params, ExplainState *es); static const char *explain_get_index_name(Oid indexId); static void show_buffer_usage(ExplainState *es, const BufferUsage *usage, bool planning); +static void show_prefetch_info(ExplainState *es, const PrefetchInfo* prefetch_info); static void show_wal_usage(ExplainState *es, const WalUsage *usage); static void ExplainIndexScanDetails(Oid indexid, ScanDirection indexorderdir, ExplainState *es); @@ -186,6 +186,8 @@ ExplainQuery(ParseState *pstate, ExplainStmt *stmt, es->costs = defGetBoolean(opt); else if (strcmp(opt->defname, "buffers") == 0) es->buffers = defGetBoolean(opt); + else if (strcmp(opt->defname, "prefetch") == 0) + es->prefetch = defGetBoolean(opt); else if (strcmp(opt->defname, 
"wal") == 0) es->wal = defGetBoolean(opt); else if (strcmp(opt->defname, "settings") == 0) @@ -534,7 +536,7 @@ ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es, else if (es->analyze) instrument_option |= INSTRUMENT_ROWS; - if (es->buffers) + if (es->buffers || es->prefetch) instrument_option |= INSTRUMENT_BUFFERS; if (es->wal) instrument_option |= INSTRUMENT_WAL; @@ -2066,6 +2068,10 @@ ExplainNode(PlanState *planstate, List *ancestors, if (es->wal && planstate->instrument) show_wal_usage(es, &planstate->instrument->walusage); + /* Show prefetch usage */ + if (es->prefetch && planstate->instrument) + show_prefetch_info(es, &planstate->instrument->bufusage.prefetch); + /* Prepare per-worker buffer/WAL usage */ if (es->workers_state && (es->buffers || es->wal) && es->verbose) { @@ -3501,6 +3507,34 @@ explain_get_index_name(Oid indexId) return result; } +/* + * Show prefetch statistics + */ +static void +show_prefetch_info(ExplainState *es, const PrefetchInfo* prefetch_info) +{ + if (es->format == EXPLAIN_FORMAT_TEXT) + { + ExplainIndentText(es); + appendStringInfo(es->str, "Prefetch: hits=%lld misses=%lld expired=%lld duplicates=%lld\n", + (long long) prefetch_info->hits, + (long long) prefetch_info->misses, + (long long) prefetch_info->expired, + (long long) prefetch_info->duplicates); + } + else + { + ExplainPropertyInteger("Prefetch Hits", NULL, + prefetch_info->hits, es); + ExplainPropertyInteger("Prefetch Misses", NULL, + prefetch_info->misses, es); + ExplainPropertyInteger("Prefetch Expired Requests", NULL, + prefetch_info->expired, es); + ExplainPropertyInteger("Prefetch Duplicated Requests", NULL, + prefetch_info->duplicates, es); + } +} + /* * Show buffer usage details. 
*/ diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index ceff4727d4a..e7ddea8861a 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -235,6 +235,10 @@ BufferUsageAdd(BufferUsage *dst, const BufferUsage *add) dst->local_blks_written += add->local_blks_written; dst->temp_blks_read += add->temp_blks_read; dst->temp_blks_written += add->temp_blks_written; + dst->prefetch.hits += add->prefetch.hits; + dst->prefetch.misses += add->prefetch.misses; + dst->prefetch.expired += add->prefetch.expired; + dst->prefetch.duplicates += add->prefetch.duplicates; INSTR_TIME_ADD(dst->blk_read_time, add->blk_read_time); INSTR_TIME_ADD(dst->blk_write_time, add->blk_write_time); INSTR_TIME_ADD(dst->temp_blk_read_time, add->temp_blk_read_time); @@ -257,6 +261,10 @@ BufferUsageAccumDiff(BufferUsage *dst, dst->local_blks_written += add->local_blks_written - sub->local_blks_written; dst->temp_blks_read += add->temp_blks_read - sub->temp_blks_read; dst->temp_blks_written += add->temp_blks_written - sub->temp_blks_written; + dst->prefetch.hits += add->prefetch.hits - sub->prefetch.hits; + dst->prefetch.misses += add->prefetch.misses - sub->prefetch.misses; + dst->prefetch.expired += add->prefetch.expired - sub->prefetch.expired; + dst->prefetch.duplicates += add->prefetch.duplicates - sub->prefetch.duplicates; INSTR_TIME_ACCUM_DIFF(dst->blk_read_time, add->blk_read_time, sub->blk_read_time); INSTR_TIME_ACCUM_DIFF(dst->blk_write_time, diff --git a/src/include/commands/explain.h b/src/include/commands/explain.h index 666977fb1f8..642e267f500 100644 --- a/src/include/commands/explain.h +++ b/src/include/commands/explain.h @@ -46,6 +46,7 @@ typedef struct ExplainState bool timing; /* print detailed node timing */ bool summary; /* print total planning and execution timing */ bool settings; /* print modified settings */ + bool prefetch; /* print prefetch statistic */ ExplainFormat format; /* output format */ /* state for 
output formatting --- not reset for each new plan tree */ int indent; /* current indentation level */ diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index 2945cce3a97..ba396caf18b 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -15,6 +15,14 @@ #include "portability/instr_time.h" +/* Prefeth statistics */ +typedef struct +{ + int64 hits; + int64 misses; + int64 expired; + int64 duplicates; +} PrefetchInfo; /* * BufferUsage and WalUsage counters keep being incremented infinitely, @@ -37,6 +45,7 @@ typedef struct BufferUsage instr_time blk_write_time; /* time spent writing blocks */ instr_time temp_blk_read_time; /* time spent reading temp blocks */ instr_time temp_blk_write_time; /* time spent writing temp blocks */ + PrefetchInfo prefetch; /* prefetch statistics */ } BufferUsage; /* From a52fb5a4e9390833e91198eab715704e0e395cec Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 14 Feb 2023 19:00:12 +0200 Subject: [PATCH 36/56] Implement efficient prefetch for parallel bitmap heap scan (#258) * Implement efficient prefetch for parallel bitmap heap scan * Change MAX_IO_CONCURRENCY to be power of 2 --- src/backend/executor/nodeBitmapHeapscan.c | 194 +++++++--------------- src/include/nodes/execnodes.h | 16 +- src/include/storage/bufmgr.h | 4 +- 3 files changed, 80 insertions(+), 134 deletions(-) diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c index f6fe07ad703..302930c396a 100644 --- a/src/backend/executor/nodeBitmapHeapscan.c +++ b/src/backend/executor/nodeBitmapHeapscan.c @@ -150,6 +150,8 @@ BitmapHeapNext(BitmapHeapScanState *node) */ pstate->tbmiterator = tbm_prepare_shared_iterate(tbm); #ifdef USE_PREFETCH + node->n_prefetch_requests = 0; + node->prefetch_request_pos = 0; if (node->prefetch_maximum > 0) { pstate->prefetch_iterator = @@ -173,13 +175,6 @@ BitmapHeapNext(BitmapHeapScanState *node) tbm_attach_shared_iterate(dsa, 
pstate->tbmiterator); node->tbmres = tbmres = NULL; -#ifdef USE_PREFETCH - if (node->prefetch_maximum > 0) - { - node->shared_prefetch_iterator = - tbm_attach_shared_iterate(dsa, pstate->prefetch_iterator); - } -#endif /* USE_PREFETCH */ } node->initialized = true; } @@ -198,15 +193,24 @@ BitmapHeapNext(BitmapHeapScanState *node) if (!pstate) node->tbmres = tbmres = tbm_iterate(tbmiterator); else - node->tbmres = tbmres = tbm_shared_iterate(shared_tbmiterator); + { + if (node->n_prefetch_requests != 0) + { + node->tbmres = tbmres = (TBMIterateResult *)&node->prefetch_requests[node->prefetch_request_pos]; + node->n_prefetch_requests -= 1; + node->prefetch_request_pos = (node->prefetch_request_pos + 1) % MAX_IO_CONCURRENCY; + if (node->prefetch_pages != 0) + node->prefetch_pages -= 1; + } + else + node->tbmres = tbmres = tbm_shared_iterate(shared_tbmiterator); + } if (tbmres == NULL) { /* no more entries in the bitmap */ break; } - BitmapAdjustPrefetchIterator(node, tbmres); - /* * We can skip fetching the heap page if we don't need any fields * from the heap, and the bitmap entries don't need rechecking, @@ -361,54 +365,21 @@ BitmapAdjustPrefetchIterator(BitmapHeapScanState *node, TBMIterateResult *tbmres) { #ifdef USE_PREFETCH - ParallelBitmapHeapState *pstate = node->pstate; + TBMIterator *prefetch_iterator = node->prefetch_iterator; + Assert(node->pstate == NULL); - if (pstate == NULL) + if (node->prefetch_pages > 0) { - TBMIterator *prefetch_iterator = node->prefetch_iterator; - - if (node->prefetch_pages > 0) - { - /* The main iterator has closed the distance by one page */ - node->prefetch_pages--; - } - else if (prefetch_iterator) - { - /* Do not let the prefetch iterator get behind the main one */ - TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator); - - if (tbmpre == NULL || tbmpre->blockno != tbmres->blockno) - elog(ERROR, "prefetch and main iterators are out of sync"); - } - return; + /* The main iterator has closed the distance by one page */ + 
node->prefetch_pages--; } - - if (node->prefetch_maximum > 0) + else if (prefetch_iterator) { - TBMSharedIterator *prefetch_iterator = node->shared_prefetch_iterator; + /* Do not let the prefetch iterator get behind the main one */ + TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator); - SpinLockAcquire(&pstate->mutex); - if (pstate->prefetch_pages > 0) - { - pstate->prefetch_pages--; - SpinLockRelease(&pstate->mutex); - } - else - { - /* Release the mutex before iterating */ - SpinLockRelease(&pstate->mutex); - - /* - * In case of shared mode, we can not ensure that the current - * blockno of the main iterator and that of the prefetch iterator - * are same. It's possible that whatever blockno we are - * prefetching will be processed by another process. Therefore, - * we don't validate the blockno here as we do in non-parallel - * case. - */ - if (prefetch_iterator) - tbm_shared_iterate(prefetch_iterator); - } + if (tbmpre == NULL || tbmpre->blockno != tbmres->blockno) + elog(ERROR, "prefetch and main iterators are out of sync"); } #endif /* USE_PREFETCH */ } @@ -425,35 +396,14 @@ static inline void BitmapAdjustPrefetchTarget(BitmapHeapScanState *node) { #ifdef USE_PREFETCH - ParallelBitmapHeapState *pstate = node->pstate; - - if (pstate == NULL) - { - if (node->prefetch_target >= node->prefetch_maximum) - /* don't increase any further */ ; - else if (node->prefetch_target >= node->prefetch_maximum / 2) - node->prefetch_target = node->prefetch_maximum; - else if (node->prefetch_target > 0) - node->prefetch_target *= 2; - else - node->prefetch_target++; - return; - } - - /* Do an unlocked check first to save spinlock acquisitions. 
*/ - if (pstate->prefetch_target < node->prefetch_maximum) - { - SpinLockAcquire(&pstate->mutex); - if (pstate->prefetch_target >= node->prefetch_maximum) - /* don't increase any further */ ; - else if (pstate->prefetch_target >= node->prefetch_maximum / 2) - pstate->prefetch_target = node->prefetch_maximum; - else if (pstate->prefetch_target > 0) - pstate->prefetch_target *= 2; - else - pstate->prefetch_target++; - SpinLockRelease(&pstate->mutex); - } + if (node->prefetch_target >= node->prefetch_maximum) + /* don't increase any further */ ; + else if (node->prefetch_target >= node->prefetch_maximum / 2) + node->prefetch_target = node->prefetch_maximum; + else if (node->prefetch_target > 0) + node->prefetch_target *= 2; + else + node->prefetch_target++; #endif /* USE_PREFETCH */ } @@ -507,56 +457,46 @@ BitmapPrefetch(BitmapHeapScanState *node, TableScanDesc scan) PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno); } } - - return; } - - if (pstate->prefetch_pages < pstate->prefetch_target) + else { - TBMSharedIterator *prefetch_iterator = node->shared_prefetch_iterator; - - if (prefetch_iterator) + while (1) { - while (1) - { - TBMIterateResult *tbmpre; - bool do_prefetch = false; - bool skip_fetch; + TBMIterateResult *tbmpre; + bool do_prefetch = false; + bool skip_fetch; - /* - * Recheck under the mutex. If some other process has already - * done enough prefetching then we need not to do anything. 
- */ - SpinLockAcquire(&pstate->mutex); - if (pstate->prefetch_pages < pstate->prefetch_target) - { - pstate->prefetch_pages++; - do_prefetch = true; - } - SpinLockRelease(&pstate->mutex); + if (node->prefetch_pages < node->prefetch_target) + { + Assert(node->n_prefetch_requests < MAX_IO_CONCURRENCY); + node->prefetch_pages++; + do_prefetch = true; + } - if (!do_prefetch) - return; + if (!do_prefetch) + return; - tbmpre = tbm_shared_iterate(prefetch_iterator); - if (tbmpre == NULL) - { - /* No more pages to prefetch */ - tbm_end_shared_iterate(prefetch_iterator); - node->shared_prefetch_iterator = NULL; - break; - } + tbmpre = tbm_shared_iterate(node->shared_tbmiterator); + if (tbmpre != NULL) + { + memcpy(&node->prefetch_requests[(node->prefetch_request_pos + node->n_prefetch_requests) % MAX_IO_CONCURRENCY], tbmpre, sizeof(TBMIteratePrefetchResult)); + node->n_prefetch_requests += 1; + } + else + { + /* No more pages to prefetch */ + break; + } - /* As above, skip prefetch if we expect not to need page */ - skip_fetch = (node->can_skip_fetch && - (node->tbmres ? !node->tbmres->recheck : false) && - VM_ALL_VISIBLE(node->ss.ss_currentRelation, - tbmpre->blockno, - &node->pvmbuffer)); + /* As above, skip prefetch if we expect not to need page */ + skip_fetch = (node->can_skip_fetch && + (node->tbmres ? 
!node->tbmres->recheck : false) && + VM_ALL_VISIBLE(node->ss.ss_currentRelation, + tbmpre->blockno, + &node->pvmbuffer)); - if (!skip_fetch) - PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno); - } + if (!skip_fetch) + PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno); } } #endif /* USE_PREFETCH */ @@ -613,8 +553,6 @@ ExecReScanBitmapHeapScan(BitmapHeapScanState *node) tbm_end_iterate(node->prefetch_iterator); if (node->shared_tbmiterator) tbm_end_shared_iterate(node->shared_tbmiterator); - if (node->shared_prefetch_iterator) - tbm_end_shared_iterate(node->shared_prefetch_iterator); if (node->tbm) tbm_free(node->tbm); if (node->vmbuffer != InvalidBuffer) @@ -627,7 +565,6 @@ ExecReScanBitmapHeapScan(BitmapHeapScanState *node) node->prefetch_iterator = NULL; node->initialized = false; node->shared_tbmiterator = NULL; - node->shared_prefetch_iterator = NULL; node->vmbuffer = InvalidBuffer; node->pvmbuffer = InvalidBuffer; @@ -683,8 +620,6 @@ ExecEndBitmapHeapScan(BitmapHeapScanState *node) tbm_free(node->tbm); if (node->shared_tbmiterator) tbm_end_shared_iterate(node->shared_tbmiterator); - if (node->shared_prefetch_iterator) - tbm_end_shared_iterate(node->shared_prefetch_iterator); if (node->vmbuffer != InvalidBuffer) ReleaseBuffer(node->vmbuffer); if (node->pvmbuffer != InvalidBuffer) @@ -739,7 +674,6 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags) scanstate->pscan_len = 0; scanstate->initialized = false; scanstate->shared_tbmiterator = NULL; - scanstate->shared_prefetch_iterator = NULL; scanstate->pstate = NULL; /* diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 9fa23e2bb66..ecd313880a2 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -23,6 +23,7 @@ #include "nodes/plannodes.h" #include "nodes/tidbitmap.h" #include "partitioning/partdefs.h" +#include "storage/bufmgr.h" #include "storage/condition_variable.h" #include "utils/hsearch.h" #include 
"utils/queryenvironment.h" @@ -1690,6 +1691,15 @@ typedef struct ParallelBitmapHeapState char phs_snapshot_data[FLEXIBLE_ARRAY_MEMBER]; } ParallelBitmapHeapState; +typedef struct TBMIteratePrefetchResult +{ + BlockNumber blockno; /* page number containing tuples */ + int ntuples; /* -1 indicates lossy result */ + bool recheck; /* should the tuples be rechecked? */ + /* Note: recheck is always true if ntuples < 0 */ + OffsetNumber offsets[MaxHeapTuplesPerPage]; +} TBMIteratePrefetchResult; + /* ---------------- * BitmapHeapScanState information * @@ -1710,7 +1720,6 @@ typedef struct ParallelBitmapHeapState * pscan_len size of the shared memory for parallel bitmap * initialized is node is ready to iterate * shared_tbmiterator shared iterator - * shared_prefetch_iterator shared iterator for prefetching * pstate shared state for parallel bitmap scan * ---------------- */ @@ -1734,7 +1743,10 @@ typedef struct BitmapHeapScanState Size pscan_len; bool initialized; TBMSharedIterator *shared_tbmiterator; - TBMSharedIterator *shared_prefetch_iterator; + /* parallel worker private ring buffer with prefetch requests: it allows to access prefetch result from the same worker */ + TBMIteratePrefetchResult prefetch_requests[MAX_IO_CONCURRENCY]; + int n_prefetch_requests; /* number of used elements in prefetch_requests ring buffer */ + int prefetch_request_pos; /* head position in ring buffer */ ParallelBitmapHeapState *pstate; } BitmapHeapScanState; diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index 708884aa3db..ccda751e9e0 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -86,8 +86,8 @@ extern PGDLLIMPORT int NLocBuffer; extern PGDLLIMPORT Block *LocalBufferBlockPointers; extern PGDLLIMPORT int32 *LocalRefCount; -/* upper limit for effective_io_concurrency */ -#define MAX_IO_CONCURRENCY 1000 +/* upper limit for effective_io_concurrency (better to he power of 2) */ +#define MAX_IO_CONCURRENCY 1024 /* special block number 
for ReadBuffer() */ #define P_NEW InvalidBlockNumber /* grow the file to get a new page */ From 9fc4107f223fc83ccd7b865970bdff7fff5e9ac0 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 22 Feb 2023 18:30:24 +0200 Subject: [PATCH 37/56] Unlogged index fix v15 (#262) * Avoid errors when accessing indexes of unlogge tables after compute restart * Support unlogged sequences * Extract sequence start value from pg_sequence * Initialize unlogged index undex eclusive lock --- src/backend/commands/sequence.c | 53 +++++++++++++++++++--------- src/backend/optimizer/util/plancat.c | 38 +++++++++++++++++++- src/backend/storage/smgr/md.c | 25 +++++++++++-- 3 files changed, 97 insertions(+), 19 deletions(-) diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c index 88e0e3a270a..5999cb8784d 100644 --- a/src/backend/commands/sequence.c +++ b/src/backend/commands/sequence.c @@ -98,7 +98,7 @@ static HTAB *seqhashtab = NULL; /* hash table for SeqTable items */ static SeqTableData *last_used_seq = NULL; static void fill_seq_with_data(Relation rel, HeapTuple tuple); -static void fill_seq_fork_with_data(Relation rel, HeapTuple tuple, ForkNumber forkNum); +static void fill_seq_fork_with_data(Relation rel, HeapTuple tuple, ForkNumber forkNum, Buffer buf); static Relation lock_and_open_sequence(SeqTable seq); static void create_seq_hashtable(void); static void init_sequence(Oid relid, SeqTable *p_elm, Relation *p_rel); @@ -351,7 +351,7 @@ ResetSequence(Oid seq_relid) static void fill_seq_with_data(Relation rel, HeapTuple tuple) { - fill_seq_fork_with_data(rel, tuple, MAIN_FORKNUM); + fill_seq_fork_with_data(rel, tuple, MAIN_FORKNUM, InvalidBuffer); if (rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED) { @@ -360,7 +360,7 @@ fill_seq_with_data(Relation rel, HeapTuple tuple) srel = smgropen(rel->rd_node, InvalidBackendId, rel->rd_rel->relpersistence); smgrcreate(srel, INIT_FORKNUM, false); log_smgrcreate(&rel->rd_node, INIT_FORKNUM); - 
fill_seq_fork_with_data(rel, tuple, INIT_FORKNUM); + fill_seq_fork_with_data(rel, tuple, INIT_FORKNUM, InvalidBuffer); FlushRelationBuffers(rel); smgrclose(srel); } @@ -370,28 +370,28 @@ fill_seq_with_data(Relation rel, HeapTuple tuple) * Initialize a sequence's relation fork with the specified tuple as content */ static void -fill_seq_fork_with_data(Relation rel, HeapTuple tuple, ForkNumber forkNum) +fill_seq_fork_with_data(Relation rel, HeapTuple tuple, ForkNumber forkNum, Buffer buf) { - Buffer buf; Page page; sequence_magic *sm; OffsetNumber offnum; + bool lockBuffer = false; /* Initialize first page of relation with special magic number */ - - buf = ReadBufferExtended(rel, forkNum, P_NEW, RBM_NORMAL, NULL); - Assert(BufferGetBlockNumber(buf) == 0); - + if (buf == InvalidBuffer) + { + buf = ReadBufferExtended(rel, forkNum, P_NEW, RBM_NORMAL, NULL); + Assert(BufferGetBlockNumber(buf) == 0); + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + lockBuffer = true; + } page = BufferGetPage(buf); - PageInit(page, BufferGetPageSize(buf), sizeof(sequence_magic)); sm = (sequence_magic *) PageGetSpecialPointer(page); sm->magic = SEQ_MAGIC; /* Now insert sequence tuple */ - LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); - /* * Since VACUUM does not process sequences, we have to force the tuple to * have xmin = FrozenTransactionId now. 
Otherwise it would become @@ -440,7 +440,8 @@ fill_seq_fork_with_data(Relation rel, HeapTuple tuple, ForkNumber forkNum) END_CRIT_SECTION(); - UnlockReleaseBuffer(buf); + if (lockBuffer) + UnlockReleaseBuffer(buf); } /* @@ -1215,9 +1216,29 @@ read_seq_tuple(Relation rel, Buffer *buf, HeapTuple seqdatatuple) sm = (sequence_magic *) PageGetSpecialPointer(page); if (sm->magic != SEQ_MAGIC) - elog(ERROR, "bad magic number in sequence \"%s\": %08X", - RelationGetRelationName(rel), sm->magic); - + { + /* NEON: reinitialize unlogged sequence */ + if (rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED) + { + Datum value[SEQ_COL_LASTCOL] = {0}; + bool null[SEQ_COL_LASTCOL] = {false}; + HeapTuple tuple; + Form_pg_sequence pgsform; + + tuple = SearchSysCache1(SEQRELID, RelationGetRelid(rel)); + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "cache lookup failed for sequence %u", RelationGetRelid(rel)); + pgsform = (Form_pg_sequence) GETSTRUCT(tuple); + value[SEQ_COL_LASTVAL-1] = Int64GetDatumFast(pgsform->seqstart); + ReleaseSysCache(tuple); + + tuple = heap_form_tuple(RelationGetDescr(rel), value, null); + fill_seq_fork_with_data(rel, tuple, MAIN_FORKNUM, *buf); + } + else + elog(ERROR, "bad magic number in sequence \"%s\": %08X", + RelationGetRelationName(rel), sm->magic); + } lp = PageGetItemId(page, FirstOffsetNumber); Assert(ItemIdIsNormal(lp)); diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index 419f2ac55fa..7d8e4d54678 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -27,6 +27,7 @@ #include "access/xlog.h" #include "catalog/catalog.h" #include "catalog/heap.h" +#include "catalog/index.h" #include "catalog/pg_am.h" #include "catalog/pg_proc.h" #include "catalog/pg_statistic_ext.h" @@ -47,6 +48,8 @@ #include "rewrite/rewriteManip.h" #include "statistics/statistics.h" #include "storage/bufmgr.h" +#include "storage/buf_internals.h" +#include "storage/lmgr.h" #include 
"utils/builtins.h" #include "utils/lsyscache.h" #include "utils/partcache.h" @@ -81,6 +84,39 @@ static void set_baserel_partition_key_exprs(Relation relation, static void set_baserel_partition_constraint(Relation relation, RelOptInfo *rel); +static bool +is_index_valid(Relation index, LOCKMODE lmode) +{ + if (!index->rd_index->indisvalid) + return false; + + if (index->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED) + { + while (true) + { + Buffer metapage = ReadBuffer(index, 0); + bool isNew = PageIsNew(BufferGetPage(metapage)); + ReleaseBuffer(metapage); + if (isNew) + { + Relation heap; + if (lmode != ExclusiveLock) + { + UnlockRelation(index, lmode); + LockRelation(index, ExclusiveLock); + lmode = ExclusiveLock; + continue; + } + DropRelFileNodesAllBuffers(&index->rd_smgr, 1); + heap = RelationIdGetRelation(index->rd_index->indrelid); + index->rd_indam->ambuild(heap, index, BuildIndexInfo(index)); + RelationClose(heap); + } + break; + } + } + return true; +} /* * get_relation_info - @@ -224,7 +260,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, * still needs to insert into "invalid" indexes, if they're marked * indisready. 
*/ - if (!index->indisvalid) + if (!is_index_valid(indexRelation, lmode)) { index_close(indexRelation, NoLock); continue; diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 80554e54b3c..ae4f80d9506 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -528,6 +528,13 @@ mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior) fd = PathNameOpenFile(path, O_RDWR | PG_BINARY); + /* + * NEON: unlogged relation files are lost after compute restart - we need to implicitly recreate them + * to allow data insertion + */ + if (fd < 0 && (behavior & EXTENSION_CREATE)) + fd = PathNameOpenFile(path, O_RDWR | O_CREAT | PG_BINARY); + if (fd < 0) { if ((behavior & EXTENSION_RETURN_NULL) && @@ -692,9 +699,23 @@ mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, reln->smgr_rnode.node.relNode, reln->smgr_rnode.backend); + /* NEON: md smgr is used in Neon for unlogged and temp relations. + * After compute node restart their data is deleted but unlogged tables are still present in system catalog. + * This is a difference with Vanilla Postgres where unlogged relations are truncated only after abnormal termination. + * To avoid "could not open file" we have to use EXTENSION_RETURN_NULL hear instead of EXTENSION_FAIL + */ v = _mdfd_getseg(reln, forknum, blocknum, false, - EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY); - + RelFileNodeBackendIsTemp(reln->smgr_rnode) + ? 
EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY + : EXTENSION_RETURN_NULL); + if (v == NULL) + { + char* path = relpath(reln->smgr_rnode, forknum); + (void)PathNameOpenFile(path, O_RDWR | O_CREAT | PG_BINARY); + pfree(path); + MemSet(buffer, 0, BLCKSZ); + return; + } seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); From 7b55f30d73a54d42e5d75944cc607e8612cf8beb Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 23 Feb 2023 15:21:07 +0200 Subject: [PATCH 38/56] Fix bitmap scan prefetch (#261) --- src/backend/executor/nodeBitmapHeapscan.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c index 302930c396a..1cb11abb701 100644 --- a/src/backend/executor/nodeBitmapHeapscan.c +++ b/src/backend/executor/nodeBitmapHeapscan.c @@ -211,6 +211,8 @@ BitmapHeapNext(BitmapHeapScanState *node) break; } + BitmapAdjustPrefetchIterator(node, tbmres); + /* * We can skip fetching the heap page if we don't need any fields * from the heap, and the bitmap entries don't need rechecking, @@ -366,7 +368,10 @@ BitmapAdjustPrefetchIterator(BitmapHeapScanState *node, { #ifdef USE_PREFETCH TBMIterator *prefetch_iterator = node->prefetch_iterator; - Assert(node->pstate == NULL); + + /* NEON: we are not using prefetch iterator for parallel plan so no need to adjust it */ + if (node->pstate != NULL) + return; if (node->prefetch_pages > 0) { @@ -396,6 +401,10 @@ static inline void BitmapAdjustPrefetchTarget(BitmapHeapScanState *node) { #ifdef USE_PREFETCH + /* NEON: we are not using prefetch iterator for parallel plan so no need to adjust it */ + if (node->pstate != NULL) + return; + if (node->prefetch_target >= node->prefetch_maximum) /* don't increase any further */ ; else if (node->prefetch_target >= node->prefetch_maximum / 2) From 513b58c58ced66b2164823d805a6806e9ed6700a Mon Sep 17 00:00:00 2001 From: Arseny 
Sher Date: Fri, 24 Feb 2023 18:30:01 +0400 Subject: [PATCH 39/56] Revert handling of UNLOGGED tables on compute side v15. They will be handled in pageserver, ref neondatabase/neon#3706 This reverts commit ad5e78926bb6c35161359d00423c813e204e63d2 This reverts commit 46c44e80114a088db3b14fb80987c3194129644f This does *not* revert commit 285cd13. We likely should do that, but check_restored_datadir_content complains in some diff in init fork contents after test_pg_regress, this should be sorted out. --- src/backend/commands/sequence.c | 53 +++++++++------------------- src/backend/optimizer/util/plancat.c | 38 +------------------- src/backend/storage/smgr/md.c | 33 ++--------------- 3 files changed, 20 insertions(+), 104 deletions(-) diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c index 5999cb8784d..88e0e3a270a 100644 --- a/src/backend/commands/sequence.c +++ b/src/backend/commands/sequence.c @@ -98,7 +98,7 @@ static HTAB *seqhashtab = NULL; /* hash table for SeqTable items */ static SeqTableData *last_used_seq = NULL; static void fill_seq_with_data(Relation rel, HeapTuple tuple); -static void fill_seq_fork_with_data(Relation rel, HeapTuple tuple, ForkNumber forkNum, Buffer buf); +static void fill_seq_fork_with_data(Relation rel, HeapTuple tuple, ForkNumber forkNum); static Relation lock_and_open_sequence(SeqTable seq); static void create_seq_hashtable(void); static void init_sequence(Oid relid, SeqTable *p_elm, Relation *p_rel); @@ -351,7 +351,7 @@ ResetSequence(Oid seq_relid) static void fill_seq_with_data(Relation rel, HeapTuple tuple) { - fill_seq_fork_with_data(rel, tuple, MAIN_FORKNUM, InvalidBuffer); + fill_seq_fork_with_data(rel, tuple, MAIN_FORKNUM); if (rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED) { @@ -360,7 +360,7 @@ fill_seq_with_data(Relation rel, HeapTuple tuple) srel = smgropen(rel->rd_node, InvalidBackendId, rel->rd_rel->relpersistence); smgrcreate(srel, INIT_FORKNUM, false); log_smgrcreate(&rel->rd_node, 
INIT_FORKNUM); - fill_seq_fork_with_data(rel, tuple, INIT_FORKNUM, InvalidBuffer); + fill_seq_fork_with_data(rel, tuple, INIT_FORKNUM); FlushRelationBuffers(rel); smgrclose(srel); } @@ -370,28 +370,28 @@ fill_seq_with_data(Relation rel, HeapTuple tuple) * Initialize a sequence's relation fork with the specified tuple as content */ static void -fill_seq_fork_with_data(Relation rel, HeapTuple tuple, ForkNumber forkNum, Buffer buf) +fill_seq_fork_with_data(Relation rel, HeapTuple tuple, ForkNumber forkNum) { + Buffer buf; Page page; sequence_magic *sm; OffsetNumber offnum; - bool lockBuffer = false; /* Initialize first page of relation with special magic number */ - if (buf == InvalidBuffer) - { - buf = ReadBufferExtended(rel, forkNum, P_NEW, RBM_NORMAL, NULL); - Assert(BufferGetBlockNumber(buf) == 0); - LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); - lockBuffer = true; - } + + buf = ReadBufferExtended(rel, forkNum, P_NEW, RBM_NORMAL, NULL); + Assert(BufferGetBlockNumber(buf) == 0); + page = BufferGetPage(buf); + PageInit(page, BufferGetPageSize(buf), sizeof(sequence_magic)); sm = (sequence_magic *) PageGetSpecialPointer(page); sm->magic = SEQ_MAGIC; /* Now insert sequence tuple */ + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + /* * Since VACUUM does not process sequences, we have to force the tuple to * have xmin = FrozenTransactionId now. 
Otherwise it would become @@ -440,8 +440,7 @@ fill_seq_fork_with_data(Relation rel, HeapTuple tuple, ForkNumber forkNum, Buffe END_CRIT_SECTION(); - if (lockBuffer) - UnlockReleaseBuffer(buf); + UnlockReleaseBuffer(buf); } /* @@ -1216,29 +1215,9 @@ read_seq_tuple(Relation rel, Buffer *buf, HeapTuple seqdatatuple) sm = (sequence_magic *) PageGetSpecialPointer(page); if (sm->magic != SEQ_MAGIC) - { - /* NEON: reinitialize unlogged sequence */ - if (rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED) - { - Datum value[SEQ_COL_LASTCOL] = {0}; - bool null[SEQ_COL_LASTCOL] = {false}; - HeapTuple tuple; - Form_pg_sequence pgsform; - - tuple = SearchSysCache1(SEQRELID, RelationGetRelid(rel)); - if (!HeapTupleIsValid(tuple)) - elog(ERROR, "cache lookup failed for sequence %u", RelationGetRelid(rel)); - pgsform = (Form_pg_sequence) GETSTRUCT(tuple); - value[SEQ_COL_LASTVAL-1] = Int64GetDatumFast(pgsform->seqstart); - ReleaseSysCache(tuple); - - tuple = heap_form_tuple(RelationGetDescr(rel), value, null); - fill_seq_fork_with_data(rel, tuple, MAIN_FORKNUM, *buf); - } - else - elog(ERROR, "bad magic number in sequence \"%s\": %08X", - RelationGetRelationName(rel), sm->magic); - } + elog(ERROR, "bad magic number in sequence \"%s\": %08X", + RelationGetRelationName(rel), sm->magic); + lp = PageGetItemId(page, FirstOffsetNumber); Assert(ItemIdIsNormal(lp)); diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index 7d8e4d54678..419f2ac55fa 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -27,7 +27,6 @@ #include "access/xlog.h" #include "catalog/catalog.h" #include "catalog/heap.h" -#include "catalog/index.h" #include "catalog/pg_am.h" #include "catalog/pg_proc.h" #include "catalog/pg_statistic_ext.h" @@ -48,8 +47,6 @@ #include "rewrite/rewriteManip.h" #include "statistics/statistics.h" #include "storage/bufmgr.h" -#include "storage/buf_internals.h" -#include "storage/lmgr.h" #include 
"utils/builtins.h" #include "utils/lsyscache.h" #include "utils/partcache.h" @@ -84,39 +81,6 @@ static void set_baserel_partition_key_exprs(Relation relation, static void set_baserel_partition_constraint(Relation relation, RelOptInfo *rel); -static bool -is_index_valid(Relation index, LOCKMODE lmode) -{ - if (!index->rd_index->indisvalid) - return false; - - if (index->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED) - { - while (true) - { - Buffer metapage = ReadBuffer(index, 0); - bool isNew = PageIsNew(BufferGetPage(metapage)); - ReleaseBuffer(metapage); - if (isNew) - { - Relation heap; - if (lmode != ExclusiveLock) - { - UnlockRelation(index, lmode); - LockRelation(index, ExclusiveLock); - lmode = ExclusiveLock; - continue; - } - DropRelFileNodesAllBuffers(&index->rd_smgr, 1); - heap = RelationIdGetRelation(index->rd_index->indrelid); - index->rd_indam->ambuild(heap, index, BuildIndexInfo(index)); - RelationClose(heap); - } - break; - } - } - return true; -} /* * get_relation_info - @@ -260,7 +224,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, * still needs to insert into "invalid" indexes, if they're marked * indisready. 
*/ - if (!is_index_valid(indexRelation, lmode)) + if (!index->indisvalid) { index_close(indexRelation, NoLock); continue; diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index ae4f80d9506..34dfe7ff97e 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -528,13 +528,6 @@ mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior) fd = PathNameOpenFile(path, O_RDWR | PG_BINARY); - /* - * NEON: unlogged relation files are lost after compute restart - we need to implicitly recreate them - * to allow data insertion - */ - if (fd < 0 && (behavior & EXTENSION_CREATE)) - fd = PathNameOpenFile(path, O_RDWR | O_CREAT | PG_BINARY); - if (fd < 0) { if ((behavior & EXTENSION_RETURN_NULL) && @@ -699,23 +692,9 @@ mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, reln->smgr_rnode.node.relNode, reln->smgr_rnode.backend); - /* NEON: md smgr is used in Neon for unlogged and temp relations. - * After compute node restart their data is deleted but unlogged tables are still present in system catalog. - * This is a difference with Vanilla Postgres where unlogged relations are truncated only after abnormal termination. - * To avoid "could not open file" we have to use EXTENSION_RETURN_NULL hear instead of EXTENSION_FAIL - */ v = _mdfd_getseg(reln, forknum, blocknum, false, - RelFileNodeBackendIsTemp(reln->smgr_rnode) - ? 
EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY - : EXTENSION_RETURN_NULL); - if (v == NULL) - { - char* path = relpath(reln->smgr_rnode, forknum); - (void)PathNameOpenFile(path, O_RDWR | O_CREAT | PG_BINARY); - pfree(path); - MemSet(buffer, 0, BLCKSZ); - return; - } + EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY); + seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); @@ -836,13 +815,7 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum) BlockNumber nblocks; BlockNumber segno; - /* NEON: md smgr is used in Neon for unlogged and temp relations. - * After compute node restart their data is deleted but unlogged tables are still present in system catalog. - * This is a difference with Vanilla Postgres where unlogged relations are truncated only after abnormal termination. - * To avoid "could not open file" we have to use EXTENSION_RETURN_NULL hear instead of EXTENSION_FAIL - */ - if (!mdopenfork(reln, forknum, RelFileNodeBackendIsTemp(reln->smgr_rnode) ? EXTENSION_FAIL : EXTENSION_RETURN_NULL)) - return 0; + mdopenfork(reln, forknum, EXTENSION_FAIL); /* mdopen has opened the first segment */ Assert(reln->md_num_open_segs[forknum] > 0); From 6da01741318172addc0de6da9e27fa613a9b5f00 Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Thu, 23 Feb 2023 23:41:24 +0200 Subject: [PATCH 40/56] Allow external main functions to skip config load and make last written LSN cache optional. 
--- src/backend/access/transam/xlog.c | 6 +++++- src/backend/main/main.c | 8 ++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index da2e4497e34..dc041a5bf56 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -4475,6 +4475,7 @@ XLOGShmemInit(void) XLogCtl = (XLogCtlData *) ShmemInitStruct("XLOG Ctl", XLOGCtlShmemSize(), &foundXLog); + if (lastWrittenLsnCacheSize > 0) { static HASHCTL info; info.keysize = sizeof(BufferTag); @@ -4484,6 +4485,7 @@ XLOGShmemInit(void) &info, HASH_ELEM | HASH_BLOBS); } + localControlFile = ControlFile; ControlFile = (ControlFileData *) ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile); @@ -6111,6 +6113,8 @@ GetLastWrittenLSN(RelFileNode rnode, ForkNumber forknum, BlockNumber blkno) XLogRecPtr lsn; LastWrittenLsnCacheEntry* entry; + Assert(lastWrittenLsnCacheSize != 0); + LWLockAcquire(LastWrittenLsnLock, LW_SHARED); /* Maximal last written LSN among all non-cached pages */ @@ -6155,7 +6159,7 @@ GetLastWrittenLSN(RelFileNode rnode, ForkNumber forknum, BlockNumber blkno) void SetLastWrittenLSNForBlockRange(XLogRecPtr lsn, RelFileNode rnode, ForkNumber forknum, BlockNumber from, BlockNumber n_blocks) { - if (lsn == InvalidXLogRecPtr || n_blocks == 0) + if (lsn == InvalidXLogRecPtr || n_blocks == 0 || lastWrittenLsnCacheSize == 0) return; LWLockAcquire(LastWrittenLsnLock, LW_EXCLUSIVE); diff --git a/src/backend/main/main.c b/src/backend/main/main.c index 2b908cb3cc6..79d79feb0d4 100644 --- a/src/backend/main/main.c +++ b/src/backend/main/main.c @@ -59,7 +59,7 @@ static void check_root(const char *progname); typedef int (*MainFunc) (int argc, char *argv[]); static int -CallExtMain(char *library_name, char *main_func_name, int argc, char *argv[]) +CallExtMain(char *library_name, char *main_func_name, int argc, char *argv[], bool load_config) { MainFunc main_func; @@ -76,7 +76,7 @@ 
CallExtMain(char *library_name, char *main_func_name, int argc, char *argv[]) InitializeGUCOptions(); /* Acquire configuration parameters */ - if (!SelectConfigFiles(NULL, progname)) + if (load_config && !SelectConfigFiles(NULL, progname)) exit(1); /* @@ -235,9 +235,9 @@ main(int argc, char *argv[]) PostgresSingleUserMain(argc, argv, strdup(get_user_name_or_exit(progname))); else if (argc > 1 && strcmp(argv[1], "--wal-redo") == 0) - CallExtMain("neon_walredo", "WalRedoMain", argc, argv); + CallExtMain("neon_walredo", "WalRedoMain", argc, argv, false); else if (argc > 1 && strcmp(argv[1], "--sync-safekeepers") == 0) - CallExtMain("neon", "WalProposerSync", argc, argv); + CallExtMain("neon", "WalProposerSync", argc, argv, true); else PostmasterMain(argc, argv); /* the functions above should not return */ From a029c23e19a9b2deb9da89d2f7272fca5d1474c2 Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Mon, 6 Mar 2023 19:05:41 +0200 Subject: [PATCH 41/56] Remove walredo-related hacks from InternalIpcMemoryCreate() Now similar kind of hack (using malloc() instead of shmem) is done in the wal-redo extension. --- src/backend/port/sysv_shmem.c | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c index 43a13e1d652..ea287c733df 100644 --- a/src/backend/port/sysv_shmem.c +++ b/src/backend/port/sysv_shmem.c @@ -156,21 +156,6 @@ InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size) } #endif - /* - * NEON: do not create shared memory segments for single user wal redo - * postgres. 
Many spawned instances of wal redo may exhaust kernel.shmmni - */ - if (am_wal_redo_postgres) - { - void *ptr = malloc(size); - - if (ptr == NULL) - { - ereport(FATAL, - (errmsg("could not create shared memory segment with size %zu for WAL redo process", size))); - } - return ptr; - } shmid = shmget(memKey, size, IPC_CREAT | IPC_EXCL | IPCProtection); if (shmid < 0) From 78481ee651ec9cf4e2e99db8af40eab574eb0ff5 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 27 Mar 2023 17:52:10 +0300 Subject: [PATCH 42/56] Adjust prefetch target for parallel bitmap scan (#274) * Adjust prefetch target for parallel bitmap scan * More fixes for parallel bitmap scan prefetch --- src/backend/executor/nodeBitmapHeapscan.c | 30 +++-------------------- 1 file changed, 4 insertions(+), 26 deletions(-) diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c index 1cb11abb701..c0cbe9d2afc 100644 --- a/src/backend/executor/nodeBitmapHeapscan.c +++ b/src/backend/executor/nodeBitmapHeapscan.c @@ -154,15 +154,8 @@ BitmapHeapNext(BitmapHeapScanState *node) node->prefetch_request_pos = 0; if (node->prefetch_maximum > 0) { - pstate->prefetch_iterator = - tbm_prepare_shared_iterate(tbm); - - /* - * We don't need the mutex here as we haven't yet woke up - * others. - */ - pstate->prefetch_pages = 0; - pstate->prefetch_target = -1; + node->prefetch_pages = 0; + node->prefetch_target = -1; } #endif @@ -264,19 +257,8 @@ BitmapHeapNext(BitmapHeapScanState *node) * Try to prefetch at least a few pages even before we get to the * second page if we don't stop reading after the first tuple. 
*/ - if (!pstate) - { - if (node->prefetch_target < node->prefetch_maximum) - node->prefetch_target++; - } - else if (pstate->prefetch_target < node->prefetch_maximum) - { - /* take spinlock while updating shared state */ - SpinLockAcquire(&pstate->mutex); - if (pstate->prefetch_target < node->prefetch_maximum) - pstate->prefetch_target++; - SpinLockRelease(&pstate->mutex); - } + if (node->prefetch_target < node->prefetch_maximum) + node->prefetch_target++; #endif /* USE_PREFETCH */ } @@ -401,10 +383,6 @@ static inline void BitmapAdjustPrefetchTarget(BitmapHeapScanState *node) { #ifdef USE_PREFETCH - /* NEON: we are not using prefetch iterator for parallel plan so no need to adjust it */ - if (node->pstate != NULL) - return; - if (node->prefetch_target >= node->prefetch_maximum) /* don't increase any further */ ; else if (node->prefetch_target >= node->prefetch_maximum / 2) From 8a6fd670524b8c3bcd2a2c871ba88651eb26add7 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 7 Apr 2023 08:46:47 +0300 Subject: [PATCH 43/56] Copy iterator result in BitmapHeapNext (#276) --- src/backend/executor/nodeBitmapHeapscan.c | 50 +++++++++++------------ src/include/nodes/execnodes.h | 4 +- 2 files changed, 27 insertions(+), 27 deletions(-) diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c index c0cbe9d2afc..95dfef466f6 100644 --- a/src/backend/executor/nodeBitmapHeapscan.c +++ b/src/backend/executor/nodeBitmapHeapscan.c @@ -149,19 +149,15 @@ BitmapHeapNext(BitmapHeapScanState *node) * multiple processes to iterate jointly. */ pstate->tbmiterator = tbm_prepare_shared_iterate(tbm); -#ifdef USE_PREFETCH - node->n_prefetch_requests = 0; - node->prefetch_request_pos = 0; - if (node->prefetch_maximum > 0) - { - node->prefetch_pages = 0; - node->prefetch_target = -1; - } -#endif /* We have initialized the shared state so wake up others. 
*/ BitmapDoneInitializingSharedState(pstate); } +#ifdef USE_PREFETCH + node->prefetch_head = 0; + node->prefetch_pages = 0; + node->prefetch_target = -1; +#endif /* Allocate a private iterator and attach the shared state to it */ node->shared_tbmiterator = shared_tbmiterator = @@ -184,20 +180,25 @@ BitmapHeapNext(BitmapHeapScanState *node) if (tbmres == NULL) { if (!pstate) - node->tbmres = tbmres = tbm_iterate(tbmiterator); + tbmres = tbm_iterate(tbmiterator); else { - if (node->n_prefetch_requests != 0) + if (node->prefetch_pages != 0) { - node->tbmres = tbmres = (TBMIterateResult *)&node->prefetch_requests[node->prefetch_request_pos]; - node->n_prefetch_requests -= 1; - node->prefetch_request_pos = (node->prefetch_request_pos + 1) % MAX_IO_CONCURRENCY; - if (node->prefetch_pages != 0) - node->prefetch_pages -= 1; + tbmres = (TBMIterateResult *)&node->prefetch_requests[node->prefetch_head]; + node->prefetch_pages -= 1; + node->prefetch_head = (node->prefetch_head + 1) % MAX_IO_CONCURRENCY; } else - node->tbmres = tbmres = tbm_shared_iterate(shared_tbmiterator); + tbmres = tbm_shared_iterate(shared_tbmiterator); + if (tbmres) + { + /* Need to copy result because iterator can be used for prefetch and vocant position in prefetch ring buffer can also be reused */ + memcpy(&node->tbmres_copy, tbmres, offsetof(TBMIterateResult, offsets) + sizeof(OffsetNumber)*Max(tbmres->ntuples, 0)); + tbmres = (TBMIterateResult *)&node->tbmres_copy; + } } + node->tbmres = tbmres; if (tbmres == NULL) { /* no more entries in the bitmap */ @@ -236,7 +237,6 @@ BitmapHeapNext(BitmapHeapScanState *node) /* AM doesn't think this block is valid, skip */ continue; } - if (tbmres->ntuples >= 0) node->exact_pages++; else @@ -455,8 +455,7 @@ BitmapPrefetch(BitmapHeapScanState *node, TableScanDesc scan) if (node->prefetch_pages < node->prefetch_target) { - Assert(node->n_prefetch_requests < MAX_IO_CONCURRENCY); - node->prefetch_pages++; + Assert(node->prefetch_pages < MAX_IO_CONCURRENCY); 
do_prefetch = true; } @@ -466,8 +465,10 @@ BitmapPrefetch(BitmapHeapScanState *node, TableScanDesc scan) tbmpre = tbm_shared_iterate(node->shared_tbmiterator); if (tbmpre != NULL) { - memcpy(&node->prefetch_requests[(node->prefetch_request_pos + node->n_prefetch_requests) % MAX_IO_CONCURRENCY], tbmpre, sizeof(TBMIteratePrefetchResult)); - node->n_prefetch_requests += 1; + memcpy(&node->prefetch_requests[(node->prefetch_head + node->prefetch_pages) % MAX_IO_CONCURRENCY], + tbmpre, + offsetof(TBMIterateResult, offsets) + sizeof(OffsetNumber)*Max(tbmpre->ntuples, 0)); + node->prefetch_pages += 1; } else { @@ -477,7 +478,7 @@ BitmapPrefetch(BitmapHeapScanState *node, TableScanDesc scan) /* As above, skip prefetch if we expect not to need page */ skip_fetch = (node->can_skip_fetch && - (node->tbmres ? !node->tbmres->recheck : false) && + !tbmpre->recheck && VM_ALL_VISIBLE(node->ss.ss_currentRelation, tbmpre->blockno, &node->pvmbuffer)); @@ -715,8 +716,7 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags) * Maximum number of prefetches for the tablespace if configured, * otherwise the current value of the effective_io_concurrency GUC. 
*/ - scanstate->prefetch_maximum = - get_tablespace_io_concurrency(currentRelation->rd_rel->reltablespace); + scanstate->prefetch_maximum = get_tablespace_io_concurrency(currentRelation->rd_rel->reltablespace); scanstate->ss.ss_currentRelation = currentRelation; diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index ecd313880a2..c373c7d2663 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1745,8 +1745,8 @@ typedef struct BitmapHeapScanState TBMSharedIterator *shared_tbmiterator; /* parallel worker private ring buffer with prefetch requests: it allows to access prefetch result from the same worker */ TBMIteratePrefetchResult prefetch_requests[MAX_IO_CONCURRENCY]; - int n_prefetch_requests; /* number of used elements in prefetch_requests ring buffer */ - int prefetch_request_pos; /* head position in ring buffer */ + TBMIteratePrefetchResult tbmres_copy; /* copy of current iterator result */ + int prefetch_head; /* head position in ring buffer */ ParallelBitmapHeapState *pstate; } BitmapHeapScanState; From 0e771f95245f39e350ace3e7396dd20b9239d76c Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 7 Apr 2023 08:49:46 +0300 Subject: [PATCH 44/56] Prefetch for index and index-only scans (#271) * Pefeth for index and inex-only scans * Remove debug logging * Move prefetch_blocks array to the end of BTScanOpaqueData struct --- src/backend/access/nbtree/README | 44 +++++ src/backend/access/nbtree/nbtinsert.c | 2 +- src/backend/access/nbtree/nbtree.c | 1 + src/backend/access/nbtree/nbtsearch.c | 214 ++++++++++++++++++++++++- src/backend/optimizer/path/costsize.c | 2 + src/backend/utils/misc/guc.c | 20 +++ src/include/access/nbtree.h | 17 ++ src/include/optimizer/cost.h | 3 + src/test/regress/expected/sysviews.out | 4 +- 9 files changed, 301 insertions(+), 6 deletions(-) diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index 5529afc1fed..5cd7578e27f 100644 --- 
a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -1081,3 +1081,47 @@ item is irrelevant, and need not be stored at all. This arrangement corresponds to the fact that an L&Y non-leaf page has one more pointer than key. Suffix truncation's negative infinity attributes behave in the same way. + +Notes About Index Scan Prefetch +------------------------------- + +Prefetch can significantly improve the speed of OLAP queries. +To be able to perform prefetch, we need to know which pages will +be accessed during the scan. It is trivial for heap- and bitmap scans, +but requires more effort for index scans: to implement prefetch for +index scans, we need to find out subsequent leaf pages. + +Postgres links all pages at the same level of the B-Tree in a doubly linked list and uses this list for +forward and backward iteration. This list, however, can not trivially be used for prefetching because to locate the next page because we need first to load the current page. To prefetch more than only the next page, we can utilize the parent page's downlinks instead, as it contains references to most of the target page's sibling pages. + +Because Postgres' nbtree pages have no reference to their parent page, we need to remember the parent page when descending the btree and use it to prefetch subsequent pages. We will utilize the parent's linked list to improve the performance of this prefetch system past the key range of the parent page. + +We should prefetch not only leaf pages, but also the next parent page. +The trick is to correctly calculate the moment when it will be needed: +We should not issue the prefetch request when prefetch requests for all children from the current parent page have already been issued, but when there are only effective_io_concurrency line pointers left to prefetch from the page. + +Currently there are two different prefetch implementations for +index-only scan and index scan. 
Index-only scan doesn't need to access heap tuples so it prefetches +only B-Tree leave pages (and their parents). Prefetch of index-only scan is performed only +if parallel plan is not used. Parallel index scan is using critical section for obtaining next +page by parallel worker. Leaf page is loaded in this critical section. +And if most of time is spent in loading the page, then it actually eliminates any concurrency +and makes prefetch useless. For relatively small tables Postgres will not choose parallel plan in +any case. And for large tables it can be enforced by setting max_parallel_workers_per_gather=0. + +Prefetch for normal (not index-only) index tries to prefetch heap tuples +referenced from leaf page. Average number of items per page +is about 100 which is comparable with default value of effective_io_concurrency. +So there is not so much sense trying to prefetch also next leaf page. + +As far as it is difficult to estimate number of entries traversed by index scan, +we prefer not to prefetch large number of pages from the very beginning. +Such useless prefetch can reduce the performance of point lookups. +Instead of it we start with smallest prefetch distance and increase it +by INCREASE_PREFETCH_DISTANCE_STEP after processing each item +until it reaches effective_io_concurrency. In case of index-only +scan we increase prefetch distance after processing each leaf pages +and for index scan - after processing each tuple. +The only exception is case when no key bounds are specified. +In this case we traverse the whole relation and it makes sense +to start with the largest possible prefetch distance from the very beginning. 
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index f6f4af8bfe3..6bb34d2f4f7 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -2157,7 +2157,7 @@ _bt_insert_parent(Relation rel, BlockNumberIsValid(RelationGetTargetBlock(rel)))); /* Find the leftmost page at the next level up */ - pbuf = _bt_get_endpoint(rel, opaque->btpo_level + 1, false, NULL); + pbuf = _bt_get_endpoint(rel, opaque->btpo_level + 1, false, NULL, NULL); /* Set up a phony stack entry pointing there */ stack = &fakestack; stack->bts_blkno = BufferGetBlockNumber(pbuf); diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index f5c11e31a1c..11e346af508 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -368,6 +368,7 @@ btbeginscan(Relation rel, int nkeys, int norderbys) so->killedItems = NULL; /* until needed */ so->numKilled = 0; + so->prefetch_maximum = 0; /* disable prefetch */ /* * We don't know yet whether the scan will be index-only, so we do not diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index baab42a9da4..a35d68e395a 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -18,12 +18,14 @@ #include "access/nbtree.h" #include "access/relscan.h" #include "access/xact.h" +#include "catalog/catalog.h" #include "miscadmin.h" +#include "optimizer/cost.h" #include "pgstat.h" #include "storage/predicate.h" #include "utils/lsyscache.h" #include "utils/rel.h" - +#include "utils/spccache.h" static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp); static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf); @@ -47,6 +49,7 @@ static Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot); static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir); static inline void _bt_initialize_more_data(BTScanOpaque so, 
ScanDirection dir); +#define INCREASE_PREFETCH_DISTANCE_STEP 1 /* * _bt_drop_lock_and_maybe_pin() @@ -837,6 +840,70 @@ _bt_compare(Relation rel, return 0; } + +/* + * _bt_read_parent_for_prefetch - read parent page and extract references to children for prefetch. + * This functions returns offset of first item. + */ +static int +_bt_read_parent_for_prefetch(IndexScanDesc scan, BlockNumber parent, ScanDirection dir) +{ + Relation rel = scan->indexRelation; + BTScanOpaque so = (BTScanOpaque) scan->opaque; + Buffer buf; + Page page; + BTPageOpaque opaque; + OffsetNumber offnum; + OffsetNumber n_child; + int next_parent_prefetch_index; + int i, j; + + buf = _bt_getbuf(rel, parent, BT_READ); + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + offnum = P_FIRSTDATAKEY(opaque); + n_child = PageGetMaxOffsetNumber(page) - offnum + 1; + + /* Position where we should insert prefetch of parent page: we intentionally use prefetch_maximum here instead of current_prefetch_distance, + * assuming that it will reach prefetch_maximum before we reach and of the parent page + */ + next_parent_prefetch_index = (n_child > so->prefetch_maximum) + ? 
n_child - so->prefetch_maximum : 0; + + if (ScanDirectionIsForward(dir)) + { + so->next_parent = opaque->btpo_next; + if (so->next_parent == P_NONE) + next_parent_prefetch_index = -1; + for (i = 0, j = 0; i < n_child; i++) + { + ItemId itemid = PageGetItemId(page, offnum + i); + IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); + if (i == next_parent_prefetch_index) + so->prefetch_blocks[j++] = so->next_parent; /* time to prefetch next parent page */ + so->prefetch_blocks[j++] = BTreeTupleGetDownLink(itup); + } + } + else + { + so->next_parent = opaque->btpo_prev; + if (so->next_parent == P_NONE) + next_parent_prefetch_index = -1; + for (i = 0, j = 0; i < n_child; i++) + { + ItemId itemid = PageGetItemId(page, offnum + n_child - i - 1); + IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); + if (i == next_parent_prefetch_index) + so->prefetch_blocks[j++] = so->next_parent; /* time to prefetch next parent page */ + so->prefetch_blocks[j++] = BTreeTupleGetDownLink(itup); + } + } + so->n_prefetch_blocks = j; + so->last_prefetch_index = 0; + _bt_relbuf(rel, buf); + return offnum; +} + /* * _bt_first() -- Find the first item in a scan. * @@ -1096,6 +1163,37 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) } } + /* Neon: initialize prefetch */ + so->n_prefetch_requests = 0; + so->n_prefetch_blocks = 0; + so->last_prefetch_index = 0; + so->next_parent = P_NONE; + so->prefetch_maximum = IsCatalogRelation(rel) + ? effective_io_concurrency + : get_tablespace_io_concurrency(rel->rd_rel->reltablespace); + + if (scan->xs_want_itup) /* index only scan */ + { + if (enable_indexonlyscan_prefetch) + { + /* We disable prefetch for parallel index-only scan. + * Neon prefetch is efficient only if prefetched blocks are accessed by the same worker + * which issued prefetch request. The logic of splitting pages between parallel workers in + * index scan doesn't allow to satisfy this requirement. 
+ * Also prefetch of leave pages will be useless if expected number of rows fits in one page. + */ + if (scan->parallel_scan) + so->prefetch_maximum = 0; /* disable prefetch */ + } + else + so->prefetch_maximum = 0; /* disable prefetch */ + } + else if (!enable_indexscan_prefetch || !scan->heapRelation) + so->prefetch_maximum = 0; /* disable prefetch */ + + /* If key bounds are not specified, then we will scan the whole relation and it make sense to start with the largest possible prefetch distance */ + so->current_prefetch_distance = (keysCount == 0) ? so->prefetch_maximum : 0; + /* * If we found no usable boundary keys, we have to start from one end of * the tree. Walk down that edge to the first or last key, and scan from @@ -1366,6 +1464,21 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) */ stack = _bt_search(rel, &inskey, &buf, BT_READ, scan->xs_snapshot); + /* Start prefetching for index only scan */ + if (so->prefetch_maximum > 0 && stack != NULL && scan->xs_want_itup) /* index only scan */ + { + int first_offset = _bt_read_parent_for_prefetch(scan, stack->bts_blkno, dir); + int skip = ScanDirectionIsForward(dir) + ? stack->bts_offset - first_offset + : first_offset + so->n_prefetch_blocks - 1 - stack->bts_offset; + Assert(so->n_prefetch_blocks >= skip); + so->current_prefetch_distance = INCREASE_PREFETCH_DISTANCE_STEP; + so->n_prefetch_requests = Min(so->current_prefetch_distance, so->n_prefetch_blocks - skip); + so->last_prefetch_index = skip + so->n_prefetch_requests; + for (int i = skip; i < so->last_prefetch_index; i++) + PrefetchBuffer(rel, MAIN_FORKNUM, so->prefetch_blocks[i]); + } + /* don't need to keep the stack around... 
*/ _bt_freestack(stack); @@ -1505,9 +1618,63 @@ _bt_next(IndexScanDesc scan, ScanDirection dir) /* OK, itemIndex says what to return */ currItem = &so->currPos.items[so->currPos.itemIndex]; scan->xs_heaptid = currItem->heapTid; - if (scan->xs_want_itup) + if (scan->xs_want_itup) /* index-only scan */ + { scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset); + } + else if (so->prefetch_maximum > 0) + { + int prefetchLimit, prefetchDistance; + + /* Neon: prefetch referenced heap pages. + * As far as it is difficult to predict how much items index scan will return + * we do not want to prefetch many heap pages from the very beginning because + * them may not be needed. So we are going to increase prefetch distance by INCREASE_PREFETCH_DISTANCE_STEP + * at each index scan iteration until it reaches prefetch_maximum. + */ + + /* Advance pefetch distance until it reaches prefetch_maximum */ + if (so->current_prefetch_distance + INCREASE_PREFETCH_DISTANCE_STEP <= so->prefetch_maximum) + so->current_prefetch_distance += INCREASE_PREFETCH_DISTANCE_STEP; + else + so->current_prefetch_distance = so->prefetch_maximum; + + /* How much we can prefetch */ + prefetchLimit = Min(so->current_prefetch_distance, so->currPos.lastItem - so->currPos.firstItem + 1); + + /* Active prefeth requests */ + prefetchDistance = so->n_prefetch_requests; + /* + * Consume one prefetch request (if any) + */ + if (prefetchDistance != 0) + prefetchDistance -= 1; + + /* Keep number of active prefetch requests equal to the current prefetch distance. 
+ * When prefetch distance reaches prefetch maximum, this loop performs at most one iteration, + * but at the beginning of index scan it performs up to INCREASE_PREFETCH_DISTANCE_STEP+1 iterations + */ + if (ScanDirectionIsForward(dir)) + { + while (prefetchDistance < prefetchLimit && so->currPos.itemIndex + prefetchDistance <= so->currPos.lastItem) + { + BlockNumber blkno = BlockIdGetBlockNumber(&so->currPos.items[so->currPos.itemIndex + prefetchDistance].heapTid.ip_blkid); + PrefetchBuffer(scan->heapRelation, MAIN_FORKNUM, blkno); + prefetchDistance += 1; + } + } + else + { + while (prefetchDistance < prefetchLimit && so->currPos.itemIndex - prefetchDistance >= so->currPos.firstItem) + { + BlockNumber blkno = BlockIdGetBlockNumber(&so->currPos.items[so->currPos.itemIndex - prefetchDistance].heapTid.ip_blkid); + PrefetchBuffer(scan->heapRelation, MAIN_FORKNUM, blkno); + prefetchDistance += 1; + } + } + so->n_prefetch_requests = prefetchDistance; /* update number of active prefetch requests */ + } return true; } @@ -1914,6 +2081,30 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) so->markItemIndex = -1; } + if (scan->xs_want_itup && so->prefetch_maximum > 0) /* Prefetching of leave pages for index-only scan */ + { + /* Advance pefetch distance until it reaches prefetch_maximum */ + if (so->current_prefetch_distance + INCREASE_PREFETCH_DISTANCE_STEP <= so->prefetch_maximum) + so->current_prefetch_distance += INCREASE_PREFETCH_DISTANCE_STEP; + + so->n_prefetch_requests -= 1; /* we load next leaf page, so decrement number of active prefetch requests */ + + /* Check if the are more children to prefetch at current parent page */ + if (so->last_prefetch_index == so->n_prefetch_blocks && so->next_parent != P_NONE) + { + /* we have prefetched all items from current parent page, let's move to the next parent page */ + _bt_read_parent_for_prefetch(scan, so->next_parent, dir); + so->n_prefetch_requests -= 1; /* loading parent page consumes one more prefetch request */ + 
} + + /* Try to keep number of active prefetch requests equal to current prefetch distance */ + while (so->n_prefetch_requests < so->current_prefetch_distance && so->last_prefetch_index < so->n_prefetch_blocks) + { + so->n_prefetch_requests += 1; + PrefetchBuffer(scan->indexRelation, MAIN_FORKNUM, so->prefetch_blocks[so->last_prefetch_index++]); + } + } + if (ScanDirectionIsForward(dir)) { /* Walk right to the next page with data */ @@ -2318,6 +2509,7 @@ _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot) */ Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost, + BlockNumber* parent, Snapshot snapshot) { Buffer buf; @@ -2326,6 +2518,7 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost, OffsetNumber offnum; BlockNumber blkno; IndexTuple itup; + BlockNumber parent_blocknum = P_NONE; /* * If we are looking for a leaf page, okay to descend from fast root; @@ -2343,6 +2536,7 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost, page = BufferGetPage(buf); TestForOldSnapshot(snapshot, rel, page); opaque = BTPageGetOpaque(page); + blkno = BufferGetBlockNumber(buf); for (;;) { @@ -2381,12 +2575,15 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost, offnum = P_FIRSTDATAKEY(opaque); itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); + parent_blocknum = blkno; blkno = BTreeTupleGetDownLink(itup); buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ); page = BufferGetPage(buf); opaque = BTPageGetOpaque(page); } + if (parent) + *parent = parent_blocknum; return buf; } @@ -2410,13 +2607,13 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) BTPageOpaque opaque; OffsetNumber start; BTScanPosItem *currItem; - + BlockNumber parent; /* * Scan down to the leftmost or rightmost leaf page. This is a simplified * version of _bt_search(). We don't maintain a stack since we know we * won't need it. 
*/ - buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir), scan->xs_snapshot); + buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir), &parent, scan->xs_snapshot); if (!BufferIsValid(buf)) { @@ -2429,6 +2626,15 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) return false; } + /* Start prefetching for index-only scan */ + if (so->prefetch_maximum > 0 && parent != P_NONE && scan->xs_want_itup) /* index only scan */ + { + _bt_read_parent_for_prefetch(scan, parent, dir); + so->n_prefetch_requests = so->last_prefetch_index = Min(so->prefetch_maximum, so->n_prefetch_blocks); + for (int i = 0; i < so->last_prefetch_index; i++) + PrefetchBuffer(rel, MAIN_FORKNUM, so->prefetch_blocks[i]); + } + PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot); page = BufferGetPage(buf); opaque = BTPageGetOpaque(page); diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 20cde3d0aff..bf67672a743 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -153,6 +153,8 @@ bool enable_parallel_hash = true; bool enable_partition_pruning = true; bool enable_async_append = true; bool enable_seqscan_prefetch = true; +bool enable_indexscan_prefetch = true; +bool enable_indexonlyscan_prefetch = true; typedef struct { diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 3eced8fa29c..8d05ffa1705 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -1024,6 +1024,26 @@ static struct config_bool ConfigureNamesBool[] = true, NULL, NULL, NULL }, + { + {"enable_indexscan_prefetch", PGC_USERSET, RESOURCES_ASYNCHRONOUS, + gettext_noop("Enables prefetching of heap pages in index scans."), + NULL, + GUC_EXPLAIN + }, + &enable_indexscan_prefetch, + true, + NULL, NULL, NULL + }, + { + {"enable_indexonlyscan_prefetch", PGC_USERSET, RESOURCES_ASYNCHRONOUS, + gettext_noop("Enables prefetching of leave pages in index-only scans."), + NULL, + 
GUC_EXPLAIN + }, + &enable_indexonlyscan_prefetch, + true, + NULL, NULL, NULL + }, { {"enable_seqscan", PGC_USERSET, QUERY_TUNING_METHOD, gettext_noop("Enables the planner's use of sequential-scan plans."), diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 93f8267b483..f50c43cfadd 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -1069,6 +1069,22 @@ typedef struct BTScanOpaqueData /* keep these last in struct for efficiency */ BTScanPosData currPos; /* current position data */ BTScanPosData markPos; /* marked position, if any */ + + /* Neon: prefetch state */ + int prefetch_maximum; /* maximal number of prefetch requests */ + + /* Prefech of referenced heap pages for index scan */ + /* To minimize waste prefetch requests we start with prefetch distance 0 + * and increase it until it reaches prefetch_maximum + */ + int current_prefetch_distance; + + /* Prefetch of leave pages of B-Tree for index-only scan */ + int n_prefetch_requests; /* number of active prefetch requests */ + int n_prefetch_blocks; /* number of elements in prefetch_blocks */ + int last_prefetch_index; /* current position in prefetch_blocks (prefetch_blocks[0..last_prefetch_index] are already requested */ + BlockNumber next_parent; /* pointer to next parent page */ + BlockNumber prefetch_blocks[MaxTIDsPerBTreePage + 1]; /* leaves + parent page */ } BTScanOpaqueData; typedef BTScanOpaqueData *BTScanOpaque; @@ -1232,6 +1248,7 @@ extern int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber extern bool _bt_first(IndexScanDesc scan, ScanDirection dir); extern bool _bt_next(IndexScanDesc scan, ScanDirection dir); extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost, + BlockNumber* parent, Snapshot snapshot); /* diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index d6a15292da6..f7c33b7e658 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -70,6 +70,9 @@ 
extern PGDLLIMPORT bool enable_parallel_hash; extern PGDLLIMPORT bool enable_partition_pruning; extern PGDLLIMPORT bool enable_async_append; extern PGDLLIMPORT bool enable_seqscan_prefetch; +extern PGDLLIMPORT bool enable_indexscan_prefetch; +extern PGDLLIMPORT bool enable_indexonlyscan_prefetch; + extern PGDLLIMPORT int constraint_exclusion; extern double index_pages_fetched(double tuples_fetched, BlockNumber pages, diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index eef4f7e1d11..231283c5636 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -118,7 +118,9 @@ select name, setting from pg_settings where name like 'enable%'; enable_hashjoin | on enable_incremental_sort | on enable_indexonlyscan | on + enable_indexonlyscan_prefetch | on enable_indexscan | on + enable_indexscan_prefetch | on enable_material | on enable_memoize | on enable_mergejoin | on @@ -132,7 +134,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_seqscan_prefetch | on enable_sort | on enable_tidscan | on -(21 rows) +(23 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. We can't test their contents in any great detail From 52c0264fbef7d00cbe70d30f56d79b63a40445e0 Mon Sep 17 00:00:00 2001 From: MMeent Date: Thu, 13 Apr 2023 22:42:25 +0200 Subject: [PATCH 45/56] [PG15] Feature/replicas (#279) * Recovery requirements: Add condition variable for WAL recovery; allowing backends to wait for recovery up to some record pointer. * Fix issues w.r.t. WAL when LwLsn is initiated and when recovery starts. This fixes some test failures that showed up after updating Neon code to do more precise handling of replica's get_page_at_lsn's request_lsn lsns. 
--------- Co-authored-by: Matthias van de Meent --- src/backend/access/transam/xlog.c | 19 +++++-- src/backend/access/transam/xlogrecovery.c | 62 +++++++++++++++++++++++ src/include/access/xlogrecovery.h | 1 + src/include/access/xlogutils.h | 4 ++ 4 files changed, 81 insertions(+), 5 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index dc041a5bf56..c908b07bf0e 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -5303,6 +5303,14 @@ StartupXLOG(void) RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo; doPageWrites = lastFullPageWrites; + /* + * Set up last written lsn cache, max written LSN. + * Starting from here, we could be modifying pages through REDO, which requires + * the existence of maxLwLsn + LwLsn LRU. + */ + XLogCtl->maxLastWrittenLsn = RedoRecPtr; + dlist_init(&XLogCtl->lastWrittenLsnLRU); + /* REDO */ if (InRecovery) { @@ -5671,8 +5679,6 @@ StartupXLOG(void) XLogCtl->LogwrtRqst.Write = EndOfLog; XLogCtl->LogwrtRqst.Flush = EndOfLog; - XLogCtl->maxLastWrittenLsn = EndOfLog; - dlist_init(&XLogCtl->lastWrittenLsnLRU); /* * Preallocate additional log files, if wanted. @@ -8148,11 +8154,14 @@ xlog_redo(XLogReaderState *record) continue; } result = XLogReadBufferForRedo(record, block_id, &buffer); - if (result == BLK_DONE && !IsUnderPostmaster) + if (result == BLK_DONE && (!IsUnderPostmaster || StandbyMode)) { /* - * In the special WAL process, blocks that are being ignored - * return BLK_DONE. Accept that. + * NEON: In the special WAL redo process, blocks that are being + * ignored return BLK_DONE. Accept that. + * Additionally, in standby mode, blocks that are not present + * in shared buffers are ignored during replay, so we also + * ignore those blocks.
*/ } else if (result != BLK_RESTORED) diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index d986384ff02..a6a96dcc93b 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -336,6 +336,7 @@ typedef struct XLogRecoveryCtlData XLogRecPtr lastReplayedReadRecPtr; /* start position */ XLogRecPtr lastReplayedEndRecPtr; /* end+1 position */ TimeLineID lastReplayedTLI; /* timeline */ + ConditionVariable replayProgressCV; /* CV for waiters */ /* * When we're currently replaying a record, ie. in a redo function, @@ -465,6 +466,7 @@ XLogRecoveryShmemInit(void) SpinLockInit(&XLogRecoveryCtl->info_lck); InitSharedLatch(&XLogRecoveryCtl->recoveryWakeupLatch); + ConditionVariableInit(&XLogRecoveryCtl->replayProgressCV); ConditionVariableInit(&XLogRecoveryCtl->recoveryNotPausedCV); } @@ -486,6 +488,64 @@ EnableStandbyMode(void) disable_startup_progress_timeout(); } +/* + * Wait for recovery to complete replaying all WAL up to and including + * redoEndRecPtr. + * + * This gets woken up for every WAL record replayed, so make sure you're not + * trying to wait an LSN that is too far in the future. + */ +void +XLogWaitForReplayOf(XLogRecPtr redoEndRecPtr) +{ + static XLogRecPtr replayRecPtr = 0; + + if (!RecoveryInProgress()) + return; + + /* + * Check the backend-local variable first, we may be able to skip accessing + * shared memory (which requires locking) + */ + if (redoEndRecPtr <= replayRecPtr) + return; + + replayRecPtr = GetXLogReplayRecPtr(NULL); + + /* + * Check again if we're going to need to wait, now that we've updated + * the local cached variable. + */ + if (redoEndRecPtr <= replayRecPtr) + return; + + /* + * We need to wait for the variable, so prepare for that. + * + * Note: This wakes up every time a WAL record is replayed, so this can + * be expensive. 
+ */ + ConditionVariablePrepareToSleep(&XLogRecoveryCtl->replayProgressCV); + + while (redoEndRecPtr > replayRecPtr) + { + bool timeout; + timeout = ConditionVariableTimedSleep(&XLogRecoveryCtl->replayProgressCV, + 10000000, /* 10 seconds */ + WAIT_EVENT_RECOVERY_WAL_STREAM); + + replayRecPtr = GetXLogReplayRecPtr(NULL); + + if (timeout) + ereport(LOG, + (errmsg("Waiting for recovery to catch up to %X/%X (currently %X/%X)", + LSN_FORMAT_ARGS(redoEndRecPtr), + LSN_FORMAT_ARGS(replayRecPtr)))); + } + + ConditionVariableCancelSleep(); +} + /* * Prepare the system for WAL recovery, if needed. * @@ -2051,6 +2111,8 @@ ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *repl /* Reset the prefetcher. */ XLogPrefetchReconfigure(); } + + ConditionVariableBroadcast(&XLogRecoveryCtl->replayProgressCV); } /* diff --git a/src/include/access/xlogrecovery.h b/src/include/access/xlogrecovery.h index 0aa85d90e89..48eaa8bcbf1 100644 --- a/src/include/access/xlogrecovery.h +++ b/src/include/access/xlogrecovery.h @@ -135,6 +135,7 @@ extern void ShutdownWalRecovery(void); extern void RemovePromoteSignalFiles(void); extern bool HotStandbyActive(void); +extern void XLogWaitForReplayOf(XLogRecPtr redoEndRecPtr); extern XLogRecPtr GetXLogReplayRecPtr(TimeLineID *replayTLI); extern RecoveryPauseState GetRecoveryPauseState(void); extern void SetRecoveryPause(bool recoveryPause); diff --git a/src/include/access/xlogutils.h b/src/include/access/xlogutils.h index 57cb9d84215..15f155238a0 100644 --- a/src/include/access/xlogutils.h +++ b/src/include/access/xlogutils.h @@ -81,6 +81,10 @@ typedef struct ReadLocalXLogPageNoWaitPrivate bool end_of_wal; /* true, when end of WAL is reached */ } ReadLocalXLogPageNoWaitPrivate; +/* + * Returns true if we shouldn't do REDO on that block in record indicated by + * block_id; false otherwise. 
+ */ extern bool (*redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id); extern XLogRedoAction XLogReadBufferForRedo(XLogReaderState *record, From 48ebdaf79be687c82d5f3283707bb1a0e0a76126 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Fri, 5 May 2023 13:49:46 +0100 Subject: [PATCH 46/56] Fix entering hot standby mode for Neon --- src/backend/access/transam/xlogrecovery.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index a6a96dcc93b..25f5da7dea3 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -1135,7 +1135,7 @@ readRecoverySignalFile(void) if (standby_signal_file_found) { StandbyModeRequested = true; - ArchiveRecoveryRequested = XLogRecPtrIsInvalid(zenithLastRec); /* no need to perform WAL recovery in Neon */ + ArchiveRecoveryRequested = true; } else if (recovery_signal_file_found) { @@ -1786,7 +1786,10 @@ PerformWalRecovery(void) else { /* just have to read next record after CheckPoint */ - Assert(xlogreader->ReadRecPtr == CheckPointLoc); + if (ZenithRecoveryRequested) + xlogreader->ReadRecPtr = CheckPointLoc; + else + Assert(xlogreader->ReadRecPtr == CheckPointLoc); replayTLI = CheckPointTLI; record = ReadRecord(xlogprefetcher, LOG, false, replayTLI); } From ac6ca4df7a55cc0e3c6d552bcf3fe02179011ff4 Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Fri, 7 Jul 2023 13:26:04 +0300 Subject: [PATCH 47/56] Do not allow users with CREATEROLE privilege to manage system user groups.
--- src/backend/commands/user.c | 75 ++++++++++++++++++++++++++++++++++++- 1 file changed, 74 insertions(+), 1 deletion(-) diff --git a/src/backend/commands/user.c b/src/backend/commands/user.c index cba8e1979b6..15752674105 100644 --- a/src/backend/commands/user.c +++ b/src/backend/commands/user.c @@ -55,7 +55,8 @@ static void AddRoleMems(const char *rolename, Oid roleid, static void DelRoleMems(const char *rolename, Oid roleid, List *memberSpecs, List *memberIds, bool admin_opt); - +static void check_role_membership_authorization(Oid currentUserId, Oid roleid, + bool is_grant); /* Check if current user has createrole privileges */ static bool @@ -1388,6 +1389,8 @@ AddRoleMems(const char *rolename, Oid roleid, if (!memberIds) return; + check_role_membership_authorization(grantorId, roleid, true); + /* * Check permissions: must have createrole or admin option on the role to * be changed. To mess with a superuser role, you gotta be superuser. @@ -1561,6 +1564,8 @@ DelRoleMems(const char *rolename, Oid roleid, if (!memberIds) return; + check_role_membership_authorization(GetUserId(), roleid, false); + /* * Check permissions: must have createrole or admin option on the role to * be changed. To mess with a superuser role, you gotta be superuser. @@ -1643,3 +1648,71 @@ DelRoleMems(const char *rolename, Oid roleid, */ table_close(pg_authmem_rel, NoLock); } + +/* + * Check that currentUserId has permission to modify the membership list for + * roleid. Throw an error if not. + */ +static void +check_role_membership_authorization(Oid currentUserId, Oid roleid, + bool is_grant) +{ + /* + * The charter of pg_database_owner is to have exactly one, implicit, + * situation-dependent member. There's no technical need for this + * restriction. (One could lift it and take the further step of making + * object_ownercheck(DatabaseRelationId, ...) 
equivalent to + * has_privs_of_role(roleid, ROLE_PG_DATABASE_OWNER), in which case + * explicit, situation-independent members could act as the owner of any + * database.) + */ + if (is_grant && roleid == ROLE_PG_DATABASE_OWNER) + ereport(ERROR, + errmsg("role \"%s\" cannot have explicit members", + GetUserNameFromId(roleid, false))); + + /* To mess with a superuser role, you gotta be superuser. */ + if (superuser_arg(roleid)) + { + if (!superuser_arg(currentUserId)) + { + if (is_grant) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("permission denied to grant role \"%s\"", + GetUserNameFromId(roleid, false)), + errdetail("Only roles with the %s attribute may grant roles with the %s attribute.", + "SUPERUSER", "SUPERUSER"))); + else + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("permission denied to revoke role \"%s\"", + GetUserNameFromId(roleid, false)), + errdetail("Only roles with the %s attribute may revoke roles with the %s attribute.", + "SUPERUSER", "SUPERUSER"))); + } + } + else + { + /* + * Otherwise, must have admin option on the role to be changed. 
+ */ + if (!is_admin_of_role(currentUserId, roleid)) + { + if (is_grant) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("permission denied to grant role \"%s\"", + GetUserNameFromId(roleid, false)), + errdetail("Only roles with the %s option on role \"%s\" may grant this role.", + "ADMIN", GetUserNameFromId(roleid, false)))); + else + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("permission denied to revoke role \"%s\"", + GetUserNameFromId(roleid, false)), + errdetail("Only roles with the %s option on role \"%s\" may revoke this role.", + "ADMIN", GetUserNameFromId(roleid, false)))); + } + } +} From 38e3bd261c57227f2af194159b8d248a346e8e99 Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Sat, 15 Jul 2023 20:42:52 +0300 Subject: [PATCH 48/56] Fix regression tests after the patch with CREATEROLE restrictions --- src/test/regress/expected/create_role.out | 45 +++++++++++++++++++++-- src/test/regress/expected/privileges.out | 15 +++++--- 2 files changed, 52 insertions(+), 8 deletions(-) diff --git a/src/test/regress/expected/create_role.out b/src/test/regress/expected/create_role.out index 4e67d727603..eac155396eb 100644 --- a/src/test/regress/expected/create_role.out +++ b/src/test/regress/expected/create_role.out @@ -24,7 +24,8 @@ CREATE ROLE regress_noiseword SYSID 12345; NOTICE: SYSID can no longer be specified -- fail, cannot grant membership in superuser role CREATE ROLE regress_nosuch_super IN ROLE regress_role_super; -ERROR: must be superuser to alter superusers +ERROR: permission denied to grant role "regress_role_super" +DETAIL: Only roles with the SUPERUSER attribute may grant roles with the SUPERUSER attribute. 
-- fail, database owner cannot have members CREATE ROLE regress_nosuch_dbowner IN ROLE pg_database_owner; ERROR: role "pg_database_owner" cannot have explicit members @@ -32,16 +33,22 @@ ERROR: role "pg_database_owner" cannot have explicit members CREATE ROLE regress_inroles ROLE regress_role_super, regress_createdb, regress_createrole, regress_login, regress_inherit, regress_connection_limit, regress_encrypted_password, regress_password_null; +ERROR: permission denied to grant role "regress_inroles" +DETAIL: Only roles with the ADMIN option on role "regress_inroles" may grant this role. -- fail, cannot grant a role into itself CREATE ROLE regress_nosuch_recursive ROLE regress_nosuch_recursive; -ERROR: role "regress_nosuch_recursive" is a member of role "regress_nosuch_recursive" +ERROR: permission denied to grant role "regress_nosuch_recursive" +DETAIL: Only roles with the ADMIN option on role "regress_nosuch_recursive" may grant this role. -- ok, can grant other users into a role with admin option CREATE ROLE regress_adminroles ADMIN regress_role_super, regress_createdb, regress_createrole, regress_login, regress_inherit, regress_connection_limit, regress_encrypted_password, regress_password_null; +ERROR: permission denied to grant role "regress_adminroles" +DETAIL: Only roles with the ADMIN option on role "regress_adminroles" may grant this role. -- fail, cannot grant a role into itself with admin option CREATE ROLE regress_nosuch_admin_recursive ADMIN regress_nosuch_admin_recursive; -ERROR: role "regress_nosuch_admin_recursive" is a member of role "regress_nosuch_admin_recursive" +ERROR: permission denied to grant role "regress_nosuch_admin_recursive" +DETAIL: Only roles with the ADMIN option on role "regress_nosuch_admin_recursive" may grant this role. 
-- fail, regress_createrole does not have CREATEDB privilege SET SESSION AUTHORIZATION regress_createrole; CREATE DATABASE regress_nosuch_db; @@ -75,15 +82,35 @@ REASSIGN OWNED BY regress_tenant TO regress_createrole; ERROR: permission denied to reassign objects -- ok, having CREATEROLE is enough to create roles in privileged roles CREATE ROLE regress_read_all_data IN ROLE pg_read_all_data; +ERROR: permission denied to grant role "pg_read_all_data" +DETAIL: Only roles with the ADMIN option on role "pg_read_all_data" may grant this role. CREATE ROLE regress_write_all_data IN ROLE pg_write_all_data; +ERROR: permission denied to grant role "pg_write_all_data" +DETAIL: Only roles with the ADMIN option on role "pg_write_all_data" may grant this role. CREATE ROLE regress_monitor IN ROLE pg_monitor; +ERROR: permission denied to grant role "pg_monitor" +DETAIL: Only roles with the ADMIN option on role "pg_monitor" may grant this role. CREATE ROLE regress_read_all_settings IN ROLE pg_read_all_settings; +ERROR: permission denied to grant role "pg_read_all_settings" +DETAIL: Only roles with the ADMIN option on role "pg_read_all_settings" may grant this role. CREATE ROLE regress_read_all_stats IN ROLE pg_read_all_stats; +ERROR: permission denied to grant role "pg_read_all_stats" +DETAIL: Only roles with the ADMIN option on role "pg_read_all_stats" may grant this role. CREATE ROLE regress_stat_scan_tables IN ROLE pg_stat_scan_tables; +ERROR: permission denied to grant role "pg_stat_scan_tables" +DETAIL: Only roles with the ADMIN option on role "pg_stat_scan_tables" may grant this role. CREATE ROLE regress_read_server_files IN ROLE pg_read_server_files; +ERROR: permission denied to grant role "pg_read_server_files" +DETAIL: Only roles with the ADMIN option on role "pg_read_server_files" may grant this role. 
CREATE ROLE regress_write_server_files IN ROLE pg_write_server_files; +ERROR: permission denied to grant role "pg_write_server_files" +DETAIL: Only roles with the ADMIN option on role "pg_write_server_files" may grant this role. CREATE ROLE regress_execute_server_program IN ROLE pg_execute_server_program; +ERROR: permission denied to grant role "pg_execute_server_program" +DETAIL: Only roles with the ADMIN option on role "pg_execute_server_program" may grant this role. CREATE ROLE regress_signal_backend IN ROLE pg_signal_backend; +ERROR: permission denied to grant role "pg_signal_backend" +DETAIL: Only roles with the ADMIN option on role "pg_signal_backend" may grant this role. -- fail, creation of these roles failed above so they do not now exist SET SESSION AUTHORIZATION regress_role_admin; DROP ROLE regress_nosuch_superuser; @@ -113,18 +140,30 @@ DROP ROLE regress_encrypted_password; DROP ROLE regress_password_null; DROP ROLE regress_noiseword; DROP ROLE regress_inroles; +ERROR: role "regress_inroles" does not exist DROP ROLE regress_adminroles; +ERROR: role "regress_adminroles" does not exist DROP ROLE regress_rolecreator; DROP ROLE regress_read_all_data; +ERROR: role "regress_read_all_data" does not exist DROP ROLE regress_write_all_data; +ERROR: role "regress_write_all_data" does not exist DROP ROLE regress_monitor; +ERROR: role "regress_monitor" does not exist DROP ROLE regress_read_all_settings; +ERROR: role "regress_read_all_settings" does not exist DROP ROLE regress_read_all_stats; +ERROR: role "regress_read_all_stats" does not exist DROP ROLE regress_stat_scan_tables; +ERROR: role "regress_stat_scan_tables" does not exist DROP ROLE regress_read_server_files; +ERROR: role "regress_read_server_files" does not exist DROP ROLE regress_write_server_files; +ERROR: role "regress_write_server_files" does not exist DROP ROLE regress_execute_server_program; +ERROR: role "regress_execute_server_program" does not exist DROP ROLE regress_signal_backend; +ERROR: role 
"regress_signal_backend" does not exist -- fail, role still owns database objects DROP ROLE regress_tenant; ERROR: role "regress_tenant" cannot be dropped because some objects depend on it diff --git a/src/test/regress/expected/privileges.out b/src/test/regress/expected/privileges.out index 03df567d50f..554aa5c4986 100644 --- a/src/test/regress/expected/privileges.out +++ b/src/test/regress/expected/privileges.out @@ -1679,7 +1679,8 @@ REFRESH MATERIALIZED VIEW sro_mv; ERROR: cannot fire deferred trigger within security-restricted operation CONTEXT: SQL function "mv_action" statement 1 BEGIN; SET CONSTRAINTS ALL IMMEDIATE; REFRESH MATERIALIZED VIEW sro_mv; COMMIT; -ERROR: must have admin option on role "regress_priv_group2" +ERROR: permission denied to grant role "regress_priv_group2" +DETAIL: Only roles with the ADMIN option on role "regress_priv_group2" may grant this role. CONTEXT: SQL function "unwanted_grant" statement 1 SQL statement "SELECT unwanted_grant()" PL/pgSQL function sro_trojan() line 1 at PERFORM @@ -1709,10 +1710,12 @@ CREATE FUNCTION dogrant_ok() RETURNS void LANGUAGE sql SECURITY DEFINER AS GRANT regress_priv_group2 TO regress_priv_user5; -- ok: had ADMIN OPTION SET ROLE regress_priv_group2; GRANT regress_priv_group2 TO regress_priv_user5; -- fails: SET ROLE suspended privilege -ERROR: must have admin option on role "regress_priv_group2" +ERROR: permission denied to grant role "regress_priv_group2" +DETAIL: Only roles with the ADMIN option on role "regress_priv_group2" may grant this role. SET SESSION AUTHORIZATION regress_priv_user1; GRANT regress_priv_group2 TO regress_priv_user5; -- fails: no ADMIN OPTION -ERROR: must have admin option on role "regress_priv_group2" +ERROR: permission denied to grant role "regress_priv_group2" +DETAIL: Only roles with the ADMIN option on role "regress_priv_group2" may grant this role. 
SELECT dogrant_ok(); -- ok: SECURITY DEFINER conveys ADMIN NOTICE: role "regress_priv_user5" is already a member of role "regress_priv_group2" dogrant_ok @@ -1722,10 +1725,12 @@ NOTICE: role "regress_priv_user5" is already a member of role "regress_priv_gro SET ROLE regress_priv_group2; GRANT regress_priv_group2 TO regress_priv_user5; -- fails: SET ROLE did not help -ERROR: must have admin option on role "regress_priv_group2" +ERROR: permission denied to grant role "regress_priv_group2" +DETAIL: Only roles with the ADMIN option on role "regress_priv_group2" may grant this role. SET SESSION AUTHORIZATION regress_priv_group2; GRANT regress_priv_group2 TO regress_priv_user5; -- fails: no self-admin -ERROR: must have admin option on role "regress_priv_group2" +ERROR: permission denied to grant role "regress_priv_group2" +DETAIL: Only roles with the ADMIN option on role "regress_priv_group2" may grant this role. SET SESSION AUTHORIZATION regress_priv_user4; DROP FUNCTION dogrant_ok(); REVOKE regress_priv_group2 FROM regress_priv_user5; From f93c7259077f4dc78e25cd8faf6f7b6df3e3bc70 Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Mon, 17 Jul 2023 11:09:57 -0400 Subject: [PATCH 49/56] Add startup logs (#293) --- src/backend/postmaster/postmaster.c | 5 +++++ src/backend/utils/init/miscinit.c | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 892d42c63ee..5c72554bf70 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -1023,6 +1023,7 @@ PostmasterMain(int argc, char *argv[]) /* * process any libraries that should be preloaded at postmaster start */ + ereport(LOG, (errmsg("postgres processing shared_preload_libraries"))); process_shared_preload_libraries(); /* @@ -1045,7 +1046,9 @@ PostmasterMain(int argc, char *argv[]) /* * Give preloaded libraries a chance to request additional shared memory. 
*/ + ereport(LOG, (errmsg("postgres processing shmem request"))); process_shmem_requests(); + ereport(LOG, (errmsg("postgres done processing shmem request"))); /* * Now that loadable modules have had their chance to request additional @@ -1083,7 +1086,9 @@ PostmasterMain(int argc, char *argv[]) /* * Set up shared memory and semaphores. */ + ereport(LOG, (errmsg("postgres setting up shared memory"))); reset_shared(); + ereport(LOG, (errmsg("postgres done setting up shared memory"))); /* * Estimate number of openable files. This must happen after setting up diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c index b25bd0e5838..602153b26c4 100644 --- a/src/backend/utils/init/miscinit.c +++ b/src/backend/utils/init/miscinit.c @@ -1666,7 +1666,7 @@ load_libraries(const char *libraries, const char *gucname, bool restricted) filename = expanded; } load_file(filename, restricted); - ereport(DEBUG1, + ereport(LOG, (errmsg_internal("loaded library \"%s\"", filename))); if (expanded) pfree(expanded); From 4834eb7beda9de9d86d4bca330588772bef1765e Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 27 Jul 2023 09:51:41 +0300 Subject: [PATCH 50/56] Make it possible to grant self created roles (#298) Co-authored-by: Konstantin Knizhnik --- src/backend/commands/user.c | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/src/backend/commands/user.c b/src/backend/commands/user.c index 15752674105..d3ed92b7b75 100644 --- a/src/backend/commands/user.c +++ b/src/backend/commands/user.c @@ -460,6 +460,37 @@ CreateRole(ParseState *pstate, CreateRoleStmt *stmt) } } + /* + * If the current user isn't a superuser, make them an admin of the new + * role so that they can administer the new object they just created. + * Superusers will be able to do that anyway. 
+ * + * The grantor of record for this implicit grant is the bootstrap + * superuser, which means that the CREATEROLE user cannot revoke the + * grant. They can however grant the created role back to themselves with + * different options, since they enjoy ADMIN OPTION on it. + */ + if (!superuser()) + { + RoleSpec *current_role = makeNode(RoleSpec); + List *memberSpecs; + List *memberIds = list_make1_oid(GetUserId()); + + current_role->roletype = ROLESPEC_CURRENT_ROLE; + current_role->location = -1; + memberSpecs = list_make1(current_role); + + AddRoleMems(stmt->role, roleid, + memberSpecs, memberIds, + BOOTSTRAP_SUPERUSERID, true); + + /* + * We must make the implicit grant visible to the code below, else the + * additional grants will fail. + */ + CommandCounterIncrement(); + } + /* * Add the specified members to this new role. adminmembers get the admin * option, rolemembers don't. @@ -1429,7 +1460,7 @@ AddRoleMems(const char *rolename, Oid roleid, * present. Nonetheless, inasmuch as users might look to it for a crude * audit trail, let only superusers impute the grant to a third party. 
*/ - if (grantorId != GetUserId() && !superuser()) + if (grantorId != GetUserId() && grantorId != BOOTSTRAP_SUPERUSERID && !superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("must be superuser to set grantor"))); From 362e8418586f033424a5746c0288dc9d8e56b266 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 28 Jul 2023 14:45:44 +0300 Subject: [PATCH 51/56] Update expected file for create_role test (#301) * Make it possible to grant self created roles * Update expected file for create_role test --------- Co-authored-by: Konstantin Knizhnik --- src/test/regress/expected/create_role.out | 26 ++++++----------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/src/test/regress/expected/create_role.out b/src/test/regress/expected/create_role.out index eac155396eb..5c42c333dcc 100644 --- a/src/test/regress/expected/create_role.out +++ b/src/test/regress/expected/create_role.out @@ -33,22 +33,16 @@ ERROR: role "pg_database_owner" cannot have explicit members CREATE ROLE regress_inroles ROLE regress_role_super, regress_createdb, regress_createrole, regress_login, regress_inherit, regress_connection_limit, regress_encrypted_password, regress_password_null; -ERROR: permission denied to grant role "regress_inroles" -DETAIL: Only roles with the ADMIN option on role "regress_inroles" may grant this role. -- fail, cannot grant a role into itself CREATE ROLE regress_nosuch_recursive ROLE regress_nosuch_recursive; -ERROR: permission denied to grant role "regress_nosuch_recursive" -DETAIL: Only roles with the ADMIN option on role "regress_nosuch_recursive" may grant this role. 
+ERROR: role "regress_nosuch_recursive" is a member of role "regress_nosuch_recursive" -- ok, can grant other users into a role with admin option CREATE ROLE regress_adminroles ADMIN regress_role_super, regress_createdb, regress_createrole, regress_login, regress_inherit, regress_connection_limit, regress_encrypted_password, regress_password_null; -ERROR: permission denied to grant role "regress_adminroles" -DETAIL: Only roles with the ADMIN option on role "regress_adminroles" may grant this role. -- fail, cannot grant a role into itself with admin option CREATE ROLE regress_nosuch_admin_recursive ADMIN regress_nosuch_admin_recursive; -ERROR: permission denied to grant role "regress_nosuch_admin_recursive" -DETAIL: Only roles with the ADMIN option on role "regress_nosuch_admin_recursive" may grant this role. +ERROR: role "regress_nosuch_admin_recursive" is a member of role "regress_nosuch_admin_recursive" -- fail, regress_createrole does not have CREATEDB privilege SET SESSION AUTHORIZATION regress_createrole; CREATE DATABASE regress_nosuch_db; @@ -68,18 +62,13 @@ REVOKE ALL PRIVILEGES ON tenant_table FROM PUBLIC; -- fail, these objects belonging to regress_tenant SET SESSION AUTHORIZATION regress_createrole; DROP INDEX tenant_idx; -ERROR: must be owner of index tenant_idx ALTER TABLE tenant_table ADD COLUMN t text; -ERROR: must be owner of table tenant_table DROP TABLE tenant_table; -ERROR: must be owner of table tenant_table ALTER VIEW tenant_view OWNER TO regress_role_admin; -ERROR: must be owner of view tenant_view +ERROR: must be member of role "regress_role_admin" DROP VIEW tenant_view; -ERROR: must be owner of view tenant_view -- fail, cannot take ownership of these objects from regress_tenant REASSIGN OWNED BY regress_tenant TO regress_createrole; -ERROR: permission denied to reassign objects -- ok, having CREATEROLE is enough to create roles in privileged roles CREATE ROLE regress_read_all_data IN ROLE pg_read_all_data; ERROR: permission denied to grant 
role "pg_read_all_data" @@ -140,9 +129,7 @@ DROP ROLE regress_encrypted_password; DROP ROLE regress_password_null; DROP ROLE regress_noiseword; DROP ROLE regress_inroles; -ERROR: role "regress_inroles" does not exist DROP ROLE regress_adminroles; -ERROR: role "regress_adminroles" does not exist DROP ROLE regress_rolecreator; DROP ROLE regress_read_all_data; ERROR: role "regress_read_all_data" does not exist @@ -166,9 +153,6 @@ DROP ROLE regress_signal_backend; ERROR: role "regress_signal_backend" does not exist -- fail, role still owns database objects DROP ROLE regress_tenant; -ERROR: role "regress_tenant" cannot be dropped because some objects depend on it -DETAIL: owner of table tenant_table -owner of view tenant_view -- fail, cannot drop ourself nor superusers DROP ROLE regress_role_super; ERROR: must be superuser to drop superusers @@ -177,8 +161,12 @@ ERROR: current user cannot be dropped -- ok RESET SESSION AUTHORIZATION; DROP INDEX tenant_idx; +ERROR: index "tenant_idx" does not exist DROP TABLE tenant_table; +ERROR: table "tenant_table" does not exist DROP VIEW tenant_view; +ERROR: view "tenant_view" does not exist DROP ROLE regress_tenant; +ERROR: role "regress_tenant" does not exist DROP ROLE regress_role_admin; DROP ROLE regress_role_super; From fcd0bde0c4be815e9a6eaa23102e75a11e7d18da Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sat, 29 Jul 2023 08:10:26 +0300 Subject: [PATCH 52/56] Define NEON_SMGR in smgr.h to make it possible for extensions to use extended Neon SMGR API (#300) Co-authored-by: Konstantin Knizhnik --- src/include/storage/smgr.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index cf1492a5f99..2a29dcd194b 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -20,6 +20,12 @@ struct f_smgr; +/* + * Neon: extended SMGR API. + * This define can be used by extensions to determine that they are built for Neon.
+ */ +#define NEON_SMGR 1 + /* * smgr.c maintains a table of SMgrRelation objects, which are essentially * cached file handles. An SMgrRelation is created (if not already present) From 026d6b093d49e25cec44dd04598152329ceac027 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Tue, 13 Jun 2023 16:50:59 +0200 Subject: [PATCH 53/56] Request extension files and libraries from compute_ctl --- src/backend/commands/extension.c | 18 ++++++++ src/backend/utils/fmgr/dfmgr.c | 78 ++++++++++++++++++++++++++++++-- src/include/fmgr.h | 6 +++ 3 files changed, 97 insertions(+), 5 deletions(-) diff --git a/src/backend/commands/extension.c b/src/backend/commands/extension.c index df6f021c300..74cc7379e77 100644 --- a/src/backend/commands/extension.c +++ b/src/backend/commands/extension.c @@ -399,6 +399,7 @@ get_extension_script_directory(ExtensionControlFile *control) { char sharepath[MAXPGPATH]; char *result; + struct stat fst; /* * The directory parameter can be omitted, absolute, or relative to the @@ -414,6 +415,16 @@ get_extension_script_directory(ExtensionControlFile *control) result = (char *) palloc(MAXPGPATH); snprintf(result, MAXPGPATH, "%s/%s", sharepath, control->directory); + // If directory does not exist, check remote extension storage + if (stat(result, &fst) < 0) + { + // request download of extension files from for control->directory + if (download_extension_file_hook != NULL) + { + download_extension_file_hook(control->directory, false); + } + } + return result; } @@ -1453,6 +1464,13 @@ CreateExtensionInternal(char *extensionName, * will get us there. 
*/ filename = get_extension_script_filename(pcontrol, NULL, versionName); + + // request download of extension files from compute_ctl + if (download_extension_file_hook != NULL) + { + download_extension_file_hook(extensionName, false); + } + if (stat(filename, &fst) == 0) { /* Easy, no extra scripts */ diff --git a/src/backend/utils/fmgr/dfmgr.c b/src/backend/utils/fmgr/dfmgr.c index 7f9ea972804..b289644a852 100644 --- a/src/backend/utils/fmgr/dfmgr.c +++ b/src/backend/utils/fmgr/dfmgr.c @@ -36,6 +36,7 @@ #include "storage/shmem.h" #include "utils/hsearch.h" +download_extension_file_hook_type download_extension_file_hook = NULL; /* signature for PostgreSQL-specific library init function */ typedef void (*PG_init_t) (void); @@ -79,11 +80,13 @@ static void *internal_load_library(const char *libname); static void incompatible_module_error(const char *libname, const Pg_magic_struct *module_magic_data) pg_attribute_noreturn(); static bool file_exists(const char *name); -static char *expand_dynamic_library_name(const char *name); +static char *expand_dynamic_library_name(const char *name, bool *is_found); static void check_restricted_library_name(const char *name); static char *substitute_libpath_macro(const char *name); static char *find_in_dynamic_libpath(const char *basename); +static void neon_try_load(const char *name); + /* Magic structure that module needs to match to be accepted */ static const Pg_magic_struct magic_data = PG_MODULE_MAGIC_DATA; @@ -108,9 +111,20 @@ load_external_function(const char *filename, const char *funcname, char *fullname; void *lib_handle; void *retval; + bool is_found = true; /* Expand the possibly-abbreviated filename to an exact path name */ - fullname = expand_dynamic_library_name(filename); + fullname = expand_dynamic_library_name(filename, &is_found); + + // if file is not found, try to download it from compute_ctl + if (!is_found && download_extension_file_hook != NULL) + { + // try to download the file + elog(DEBUG3, 
"load_external_function: try to download file: %s", fullname); + neon_try_load(fullname); + // try to find file locally once again + fullname = expand_dynamic_library_name(filename, &is_found); + } /* Load the shared library, unless we already did */ lib_handle = internal_load_library(fullname); @@ -132,6 +146,47 @@ load_external_function(const char *filename, const char *funcname, return retval; } +void +neon_try_load(const char *name) +{ + bool have_slash; + char *request_name; + + // add .so suffix if it is not present + if (strstr(name, DLSUFFIX) == NULL) + { + request_name = psprintf("%s%s", name, DLSUFFIX); + elog(DEBUG3, "neon_try_load: add DLSUFFIX: %s", request_name); + } + else + { + request_name = pstrdup(name); + elog(DEBUG3, "neon_try_load: DLSUFFIX already present: %s", request_name); + } + + have_slash = (first_dir_separator(request_name) != NULL); + + if (strncmp(request_name, "$libdir/", strlen("$libdir/")) == 0) + { + char *new_request_name = psprintf("%s", request_name + strlen("$libdir/")); + pfree(request_name); + request_name = new_request_name; + + elog(DEBUG3, "neon_try_load: omit $libdir/: %s", request_name); + } + else if (have_slash) + { + ereport(ERROR, + (errcode(ERRCODE_INVALID_NAME), + errmsg("unexpected path in dynamic library name: %s", + name))); + } + + elog(DEBUG3, "neon_try_load: final request_name: %s", request_name); + + download_extension_file_hook(request_name, true); +} + /* * This function loads a shlib file without looking up any particular * function in it. 
If the same shlib has previously been loaded, @@ -144,13 +199,24 @@ void load_file(const char *filename, bool restricted) { char *fullname; + bool is_found = true; /* Apply security restriction if requested */ if (restricted) check_restricted_library_name(filename); /* Expand the possibly-abbreviated filename to an exact path name */ - fullname = expand_dynamic_library_name(filename); + fullname = expand_dynamic_library_name(filename, &is_found); + + // if file is not found, try to download it from compute_ctl + if (!is_found && download_extension_file_hook != NULL) + { + // try to download the file + elog(DEBUG3, "load_file: try to download file: %s", fullname); + neon_try_load(fullname); + // try to find file locally once again + fullname = expand_dynamic_library_name(filename, &is_found); + } /* Load the shared library */ (void) internal_load_library(fullname); @@ -168,7 +234,6 @@ lookup_external_function(void *filehandle, const char *funcname) return dlsym(filehandle, funcname); } - /* * Load the specified dynamic-link library file, unless it already is * loaded. Return the pg_dl* handle for the file. @@ -209,6 +274,7 @@ internal_load_library(const char *libname) errmsg("could not access file \"%s\": %m", libname))); + for (file_scanner = file_list; file_scanner != NULL && !SAME_INODE(stat_buf, *file_scanner); @@ -428,7 +494,7 @@ file_exists(const char *name) * The result will always be freshly palloc'd. */ static char * -expand_dynamic_library_name(const char *name) +expand_dynamic_library_name(const char *name, bool *is_found) { bool have_slash; char *new; @@ -474,9 +540,11 @@ expand_dynamic_library_name(const char *name) * If we can't find the file, just return the string as-is. The ensuing * load attempt will fail and report a suitable message. */ + *is_found = false; return pstrdup(name); } + /* * Check a restricted library name. 
It must begin with "$libdir/plugins/" * and there must not be any directory separators after that (this is diff --git a/src/include/fmgr.h b/src/include/fmgr.h index 5314b737052..d654047a279 100644 --- a/src/include/fmgr.h +++ b/src/include/fmgr.h @@ -778,4 +778,10 @@ extern PGDLLIMPORT fmgr_hook_type fmgr_hook; #define FmgrHookIsNeeded(fn_oid) \ (!needs_fmgr_hook ? false : (*needs_fmgr_hook)(fn_oid)) + + +// download_extension_file_hook (filename, is_library) +typedef bool (*download_extension_file_hook_type) (const char *, bool); +extern PGDLLIMPORT download_extension_file_hook_type download_extension_file_hook; + #endif /* FMGR_H */ From bd06d52601faac2f8e4a30e0b6ef0258e704061b Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 14 Aug 2023 11:10:08 +0300 Subject: [PATCH 54/56] Make it possible to detemine WAL format at runtime --- src/backend/access/heap/heapam.c | 59 ++++++++++++++++++++++++-------- src/include/access/heapam_xlog.h | 12 +++++++ 2 files changed, 56 insertions(+), 15 deletions(-) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index b46fc6428a6..632dc3a7f40 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -112,6 +112,7 @@ static XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup); static HeapTuple ExtractReplicaIdentity(Relation rel, HeapTuple tup, bool key_required, bool *copy); +bool heap_xlog_store_cid; /* * Each tuple lock mode has a corresponding heavyweight lock, and one or two @@ -2246,7 +2247,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, } xlrec.offnum = ItemPointerGetOffsetNumber(&heaptup->t_self); - xlrec.flags = 0; + xlrec.flags = XLH_INSERT_STORE_CID; if (all_visible_cleared) xlrec.flags |= XLH_INSERT_ALL_VISIBLE_CLEARED; if (options & HEAP_INSERT_SPECULATIVE) @@ -2578,7 +2579,7 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, /* check that the mutually exclusive flags are not both set */ 
Assert(!(all_visible_cleared && all_frozen_set)); - xlrec->flags = 0; + xlrec->flags = XLH_INSERT_STORE_CID; if (all_visible_cleared) xlrec->flags = XLH_INSERT_ALL_VISIBLE_CLEARED; if (all_frozen_set) @@ -3108,7 +3109,7 @@ heap_delete(Relation relation, ItemPointer tid, if (RelationIsAccessibleInLogicalDecoding(relation)) log_heap_new_cid(relation, &tp); - xlrec.flags = 0; + xlrec.flags = XLH_DELETE_STORE_CID; if (all_visible_cleared) xlrec.flags |= XLH_DELETE_ALL_VISIBLE_CLEARED; if (changingPart) @@ -3844,8 +3845,8 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, xlrec.locking_xid = xmax_lock_old_tuple; xlrec.infobits_set = compute_infobits(oldtup.t_data->t_infomask, oldtup.t_data->t_infomask2); - xlrec.flags = - cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0; + xlrec.flags = XLH_LOCK_STORE_CID | + (cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0); xlrec.t_cid = HeapTupleHeaderGetRawCommandId(oldtup.t_data); XLogRegisterData((char *) &xlrec, SizeOfHeapLock); recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK); @@ -5033,7 +5034,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, xlrec.locking_xid = xid; xlrec.infobits_set = compute_infobits(new_infomask, tuple->t_data->t_infomask2); - xlrec.flags = cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0; + xlrec.flags = XLH_LOCK_STORE_CID | (cleared_all_frozen ? 
XLH_LOCK_ALL_FROZEN_CLEARED : 0); xlrec.t_cid = HeapTupleHeaderGetRawCommandId(tuple->t_data); XLogRegisterData((char *) &xlrec, SizeOfHeapLock); @@ -6090,7 +6091,7 @@ heap_abort_speculative(Relation relation, ItemPointer tid) xl_heap_delete xlrec; XLogRecPtr recptr; - xlrec.flags = XLH_DELETE_IS_SUPER; + xlrec.flags = XLH_DELETE_STORE_CID|XLH_DELETE_IS_SUPER; xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask, tp.t_data->t_infomask2); xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self); @@ -8457,7 +8458,7 @@ log_heap_update(Relation reln, Buffer oldbuf, } /* Prepare main WAL data chain */ - xlrec.flags = 0; + xlrec.flags = XLH_UPDATE_STORE_CID; if (all_visible_cleared) xlrec.flags |= XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED; if (new_all_visible_cleared) @@ -9177,6 +9178,12 @@ heap_xlog_delete(XLogReaderState *record) RelFileNode target_node; ItemPointerData target_tid; + if ((heap_xlog_store_cid || (xlrec->flags & XLH_DELETE_STORE_CID)) ? 4 : 0) + { + int cid_offs = offsetof(xl_heap_delete, t_cid); + memcpy((char*)xlrec + offsetof(xl_heap_delete, offnum) + 2, (char*)xlrec + cid_offs + 4, SizeOfHeapDelete - cid_offs - 4); + } + XLogRecGetBlockTag(record, 0, &target_node, NULL, &blkno); ItemPointerSetBlockNumber(&target_tid, blkno); ItemPointerSetOffsetNumber(&target_tid, xlrec->offnum); @@ -9294,6 +9301,7 @@ heap_xlog_insert(XLogReaderState *record) { Size datalen; char *data; + Size hdrsize; page = BufferGetPage(buffer); @@ -9302,10 +9310,13 @@ heap_xlog_insert(XLogReaderState *record) data = XLogRecGetBlockData(record, 0, &datalen); - newlen = datalen - SizeOfHeapHeader; - Assert(datalen > SizeOfHeapHeader && newlen <= MaxHeapTupleSize); - memcpy((char *) &xlhdr, data, SizeOfHeapHeader); - data += SizeOfHeapHeader; + hdrsize = SizeOfHeapHeader - ((heap_xlog_store_cid || (xlrec->flags & XLH_INSERT_STORE_CID)) ? 
4 : 0); + + newlen = datalen - hdrsize; + Assert(datalen > hdrsize && newlen <= MaxHeapTupleSize); + memcpy((char *) &xlhdr, data, hdrsize); + xlhdr.t_hoff = *((uint8*)&xlhdr + hdrsize - 1); + data += hdrsize; htup = &tbuf.hdr; MemSet((char *) htup, 0, SizeofHeapTupleHeader); @@ -9430,6 +9441,7 @@ heap_xlog_multi_insert(XLogReaderState *record) { OffsetNumber offnum; xl_multi_insert_tuple *xlhdr; + Size tupsize; /* * If we're reinitializing the page, the tuples are stored in @@ -9443,8 +9455,10 @@ heap_xlog_multi_insert(XLogReaderState *record) if (PageGetMaxOffsetNumber(page) + 1 < offnum) elog(PANIC, "invalid max offset number"); + tupsize = SizeOfMultiInsertTuple - ((heap_xlog_store_cid || (xlrec->flags & XLH_INSERT_STORE_CID)) ? 6 /* alignment! */ : 0); xlhdr = (xl_multi_insert_tuple *) SHORTALIGN(tupdata); - tupdata = ((char *) xlhdr) + SizeOfMultiInsertTuple; + xlhdr->t_hoff = *((uint8*)xlhdr + tupsize - 1); + tupdata = ((char *) xlhdr) + tupsize; newlen = xlhdr->datalen; Assert(newlen <= MaxHeapTupleSize); @@ -9534,6 +9548,11 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) XLogRedoAction oldaction; XLogRedoAction newaction; + if ((heap_xlog_store_cid || (xlrec->flags & XLH_UPDATE_STORE_CID)) ? 
4 : 0) + { + int cid_offs = offsetof(xl_heap_update, t_cid); + memcpy((char*)xlrec + cid_offs, (char*)xlrec + cid_offs + 4, SizeOfHeapUpdate - cid_offs - 4); + } /* initialize to keep the compiler quiet */ oldtup.t_data = NULL; oldtup.t_len = 0; @@ -9655,6 +9674,7 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) char *recdata_end; Size datalen; Size tuplen; + Size hdrsize; recdata = XLogRecGetBlockData(record, 0, &datalen); recdata_end = recdata + datalen; @@ -9678,8 +9698,11 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) recdata += sizeof(uint16); } - memcpy((char *) &xlhdr, recdata, SizeOfHeapHeader); - recdata += SizeOfHeapHeader; + hdrsize = SizeOfHeapHeader - ((heap_xlog_store_cid || (xlrec->flags & XLH_UPDATE_STORE_CID)) ? 4 : 0); + + memcpy((char *) &xlhdr, recdata, hdrsize); + xlhdr.t_hoff = *((uint8*)&xlhdr + hdrsize - 1); + recdata += hdrsize; tuplen = recdata_end - recdata; Assert(tuplen <= MaxHeapTupleSize); @@ -9823,6 +9846,12 @@ heap_xlog_lock(XLogReaderState *record) ItemId lp = NULL; HeapTupleHeader htup; + if ((heap_xlog_store_cid || (xlrec->flags & XLH_LOCK_STORE_CID)) ? 4 : 0) + { + int cid_offs = offsetof(xl_heap_lock, t_cid); + memcpy((char*)xlrec + offsetof(xl_heap_lock, offnum) + 2, (char*)xlrec + cid_offs + 4, SizeOfHeapLock - cid_offs - 4); + } + /* * The visibility map may need to be fixed even if the heap page is * already up-to-date. diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h index 3cdd4c8a75c..8caabe54a4f 100644 --- a/src/include/access/heapam_xlog.h +++ b/src/include/access/heapam_xlog.h @@ -22,6 +22,12 @@ #include "storage/relfilenode.h" #include "utils/relcache.h" +/* + * NEON: forcing Neon format of insert/update/delete WAL records, stopring information about CID. + * It is also indicating by correspendent bit XLH_*_STORE_CID in flags field, + * but old versions of Neon didn't set it. This is why we need this GUC. 
+ */ +extern bool heap_xlog_store_cid; /* * WAL record definitions for heapam.c's WAL operations @@ -72,6 +78,12 @@ /* all_frozen_set always implies all_visible_set */ #define XLH_INSERT_ALL_FROZEN_SET (1<<5) +/* NEON: use Neon WAL record format extension: store T_CID */ +#define XLH_INSERT_STORE_CID (1<<7) +#define XLH_UPDATE_STORE_CID (1<<7) +#define XLH_DELETE_STORE_CID (1<<7) +#define XLH_LOCK_STORE_CID (1<<7) + /* * xl_heap_update flag values, 8 bits are available. */ From f6ea27beb8ad640ce7c78053803914eab914023a Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 14 Aug 2023 19:06:48 +0300 Subject: [PATCH 55/56] Make handling t_cid more clear and less error prone and move t_cid from xl_multi_insert_tuple to xl_multi_insert --- src/backend/access/heap/heapam.c | 81 +++++++++++++++++++++----------- src/include/access/heapam_xlog.h | 22 +++++++-- 2 files changed, 73 insertions(+), 30 deletions(-) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 632dc3a7f40..0946b629a61 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -112,8 +112,25 @@ static XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup); static HeapTuple ExtractReplicaIdentity(Relation rel, HeapTuple tup, bool key_required, bool *copy); +/* + * NEON: this field is needed to provide backward compatibility with Neon log, + * where t_cid was already added to insert/delete/update/lock heap WAL records, + * but XLH_*_STORE_CID is not yet set. 
+ */ bool heap_xlog_store_cid; +/* + * NEON: this macro is used to "add" a new field (t_cid) to the corresponding vanilla struct + * which doesn't have it + */ +#define StructAddField(ptr, struct_name, field_before, new_field, field_after, unaligned_size, init_value) \ + do { \ + memmove(&(ptr)->field_after, \ + &(ptr)->field_before + 1, \ + unaligned_size - offsetof(struct_name, field_after)); \ + (ptr)->new_field = init_value; \ + } while (0) + /* * Each tuple lock mode has a corresponding heavyweight lock, and one or two * corresponding MultiXactStatuses (one to merely lock tuples, another one to @@ -2586,6 +2603,7 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, xlrec->flags = XLH_INSERT_ALL_FROZEN_SET; xlrec->ntuples = nthispage; + xlrec->t_cid = cid; /* * Write out an xl_multi_insert_tuple and the tuple data itself @@ -2605,7 +2623,6 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, tuphdr->t_infomask2 = heaptup->t_data->t_infomask2; tuphdr->t_infomask = heaptup->t_data->t_infomask; - tuphdr->t_cid = HeapTupleHeaderGetRawCommandId(heaptup->t_data); tuphdr->t_hoff = heaptup->t_data->t_hoff; /* write bitmap [+ padding] [+ oid] + data */ @@ -9178,11 +9195,8 @@ heap_xlog_delete(XLogReaderState *record) RelFileNode target_node; ItemPointerData target_tid; - if ((heap_xlog_store_cid || (xlrec->flags & XLH_DELETE_STORE_CID)) ? 
4 : 0) - { - int cid_offs = offsetof(xl_heap_delete, t_cid); - memcpy((char*)xlrec + offsetof(xl_heap_delete, offnum) + 2, (char*)xlrec + cid_offs + 4, SizeOfHeapDelete - cid_offs - 4); - } + if (!(heap_xlog_store_cid || (xlrec->flags & XLH_DELETE_STORE_CID))) + StructAddField(xlrec, xl_heap_delete, offnum, t_cid, infobits_set, SizeOfHeapDelete, FirstCommandId); XLogRecGetBlockTag(record, 0, &target_node, NULL, &blkno); ItemPointerSetBlockNumber(&target_tid, blkno); @@ -9310,12 +9324,19 @@ heap_xlog_insert(XLogReaderState *record) data = XLogRecGetBlockData(record, 0, &datalen); - hdrsize = SizeOfHeapHeader - ((heap_xlog_store_cid || (xlrec->flags & XLH_INSERT_STORE_CID)) ? 4 : 0); - + if (heap_xlog_store_cid || (xlrec->flags & XLH_INSERT_STORE_CID)) + { + hdrsize = SizeOfHeapHeader; + memcpy((char *) &xlhdr, data, hdrsize); + } + else + { + hdrsize = SizeOfHeapHeaderWithoutCid; + memcpy((char *) &xlhdr, data, hdrsize); + StructAddField(&xlhdr, xl_heap_header, t_infomask, t_cid, t_hoff, SizeOfHeapHeader, FirstCommandId); + } newlen = datalen - hdrsize; Assert(datalen > hdrsize && newlen <= MaxHeapTupleSize); - memcpy((char *) &xlhdr, data, hdrsize); - xlhdr.t_hoff = *((uint8*)&xlhdr + hdrsize - 1); data += hdrsize; htup = &tbuf.hdr; @@ -9395,6 +9416,9 @@ heap_xlog_multi_insert(XLogReaderState *record) */ xlrec = (xl_heap_multi_insert *) XLogRecGetData(record); + if (!(xlrec->flags & XLH_INSERT_STORE_CID)) + StructAddField(xlrec, xl_heap_multi_insert, ntuples, t_cid, offsets, SizeOfHeapMultiInsert, FirstCommandId); + XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno); /* check that the mutually exclusive flags are not both set */ @@ -9455,9 +9479,8 @@ heap_xlog_multi_insert(XLogReaderState *record) if (PageGetMaxOffsetNumber(page) + 1 < offnum) elog(PANIC, "invalid max offset number"); - tupsize = SizeOfMultiInsertTuple - ((heap_xlog_store_cid || (xlrec->flags & XLH_INSERT_STORE_CID)) ? 6 /* alignment! */ : 0); + tupsize = heap_xlog_store_cid ? 
SizeOfMultiInsertTupleWithCid : SizeOfMultiInsertTuple; xlhdr = (xl_multi_insert_tuple *) SHORTALIGN(tupdata); - xlhdr->t_hoff = *((uint8*)xlhdr + tupsize - 1); tupdata = ((char *) xlhdr) + tupsize; newlen = xlhdr->datalen; @@ -9475,7 +9498,9 @@ heap_xlog_multi_insert(XLogReaderState *record) htup->t_infomask = xlhdr->t_infomask; htup->t_hoff = xlhdr->t_hoff; HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); - HeapTupleHeaderSetCmin(htup, xlhdr->t_cid); + HeapTupleHeaderSetCmin(htup, heap_xlog_store_cid + ? ((xl_multi_insert_tuple_with_cid*)xlhdr)->t_cid + : xlrec->t_cid); ItemPointerSetBlockNumber(&htup->t_ctid, blkno); ItemPointerSetOffsetNumber(&htup->t_ctid, offnum); @@ -9548,11 +9573,9 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) XLogRedoAction oldaction; XLogRedoAction newaction; - if ((heap_xlog_store_cid || (xlrec->flags & XLH_UPDATE_STORE_CID)) ? 4 : 0) - { - int cid_offs = offsetof(xl_heap_update, t_cid); - memcpy((char*)xlrec + cid_offs, (char*)xlrec + cid_offs + 4, SizeOfHeapUpdate - cid_offs - 4); - } + if (!(heap_xlog_store_cid || (xlrec->flags & XLH_UPDATE_STORE_CID))) + StructAddField(xlrec, xl_heap_update, flags, t_cid, new_xmax, SizeOfHeapUpdate, FirstCommandId); + /* initialize to keep the compiler quiet */ oldtup.t_data = NULL; oldtup.t_len = 0; @@ -9698,10 +9721,17 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) recdata += sizeof(uint16); } - hdrsize = SizeOfHeapHeader - ((heap_xlog_store_cid || (xlrec->flags & XLH_UPDATE_STORE_CID)) ? 
4 : 0); - - memcpy((char *) &xlhdr, recdata, hdrsize); - xlhdr.t_hoff = *((uint8*)&xlhdr + hdrsize - 1); + if (heap_xlog_store_cid || (xlrec->flags & XLH_UPDATE_STORE_CID)) + { + hdrsize = SizeOfHeapHeader; + memcpy((char *) &xlhdr, recdata, hdrsize); + } + else + { + hdrsize = SizeOfHeapHeaderWithoutCid; + memcpy((char *) &xlhdr, recdata, hdrsize); + StructAddField(&xlhdr, xl_heap_header, t_infomask, t_cid, t_hoff, SizeOfHeapHeader, FirstCommandId); + } recdata += hdrsize; tuplen = recdata_end - recdata; @@ -9846,11 +9876,8 @@ heap_xlog_lock(XLogReaderState *record) ItemId lp = NULL; HeapTupleHeader htup; - if ((heap_xlog_store_cid || (xlrec->flags & XLH_LOCK_STORE_CID)) ? 4 : 0) - { - int cid_offs = offsetof(xl_heap_lock, t_cid); - memcpy((char*)xlrec + offsetof(xl_heap_lock, offnum) + 2, (char*)xlrec + cid_offs + 4, SizeOfHeapLock - cid_offs - 4); - } + if (!(heap_xlog_store_cid || (xlrec->flags & XLH_LOCK_STORE_CID))) + StructAddField(xlrec, xl_heap_lock, offnum, t_cid, infobits_set, SizeOfHeapLock, FirstCommandId); /* * The visibility map may need to be fixed even if the heap page is diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h index 8caabe54a4f..2040b60bd8d 100644 --- a/src/include/access/heapam_xlog.h +++ b/src/include/access/heapam_xlog.h @@ -23,9 +23,9 @@ #include "utils/relcache.h" /* - * NEON: forcing Neon format of insert/update/delete WAL records, stopring information about CID. + * NEON: forcing Neon format of insert/update/delete WAL records, storing information about CID. * It is also indicating by correspendent bit XLH_*_STORE_CID in flags field, - * but old versions of Neon didn't set it. This is why we need this GUC. + * but old versions of Neon didn't set it. This is why we need this flag which is set by neon extension. 
*/ extern bool heap_xlog_store_cid; @@ -163,6 +163,7 @@ typedef struct xl_heap_header } xl_heap_header; #define SizeOfHeapHeader (offsetof(xl_heap_header, t_hoff) + sizeof(uint8)) +#define SizeOfHeapHeaderWithoutCid (offsetof(xl_heap_header, t_hoff) + sizeof(uint8) - sizeof(uint32)) /* This is what we need to know about insert */ typedef struct xl_heap_insert @@ -190,12 +191,16 @@ typedef struct xl_heap_multi_insert { uint8 flags; uint16 ntuples; + uint32 t_cid; OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER]; } xl_heap_multi_insert; #define SizeOfHeapMultiInsert offsetof(xl_heap_multi_insert, offsets) -typedef struct xl_multi_insert_tuple +/* NEON: provide backward compatibility. For some historical reasons in Neon t_cid was added not + * to xl_heap_multi_insert, but to xl_multi_insert_tuple. + */ +typedef struct xl_multi_insert_tuple_with_cid { uint16 datalen; /* size of tuple data that follows */ uint16 t_infomask2; @@ -203,6 +208,17 @@ typedef struct xl_multi_insert_tuple uint32 t_cid; uint8 t_hoff; /* TUPLE DATA FOLLOWS AT END OF STRUCT */ +} xl_multi_insert_tuple_with_cid; + +#define SizeOfMultiInsertTupleWithCid (offsetof(xl_multi_insert_tuple_with_cid, t_hoff) + sizeof(uint8)) + +typedef struct xl_multi_insert_tuple +{ + uint16 datalen; /* size of tuple data that follows */ + uint16 t_infomask2; + uint16 t_infomask; + uint8 t_hoff; + /* TUPLE DATA FOLLOWS AT END OF STRUCT */ } xl_multi_insert_tuple; #define SizeOfMultiInsertTuple (offsetof(xl_multi_insert_tuple, t_hoff) + sizeof(uint8)) From 2c76abf4d54b4d9e7ef5f4a86184f15747fb7138 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 15 Aug 2023 18:28:18 +0300 Subject: [PATCH 56/56] Support work with both multi_insert record formats in compatibility mode --- src/backend/access/heap/heapam.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 0946b629a61..8cb68674156 100644 --- 
a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -9479,8 +9479,14 @@ heap_xlog_multi_insert(XLogReaderState *record) if (PageGetMaxOffsetNumber(page) + 1 < offnum) elog(PANIC, "invalid max offset number"); - tupsize = heap_xlog_store_cid ? SizeOfMultiInsertTupleWithCid : SizeOfMultiInsertTuple; xlhdr = (xl_multi_insert_tuple *) SHORTALIGN(tupdata); + if (heap_xlog_store_cid && !(xlrec->flags & XLH_INSERT_STORE_CID)) { + /* Originally Neon stored CID in xl_multi_insert_tuple but then it was moved to xl_multi_insert */ + tupsize = SizeOfMultiInsertTupleWithCid; + xlrec->t_cid = ((xl_multi_insert_tuple_with_cid*)xlhdr)->t_cid; + } else { + tupsize = SizeOfMultiInsertTuple; + } tupdata = ((char *) xlhdr) + tupsize; newlen = xlhdr->datalen; @@ -9498,9 +9504,7 @@ heap_xlog_multi_insert(XLogReaderState *record) htup->t_infomask = xlhdr->t_infomask; htup->t_hoff = xlhdr->t_hoff; HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); - HeapTupleHeaderSetCmin(htup, heap_xlog_store_cid - ? ((xl_multi_insert_tuple_with_cid*)xlhdr)->t_cid - : xlrec->t_cid); + HeapTupleHeaderSetCmin(htup, xlrec->t_cid); ItemPointerSetBlockNumber(&htup->t_ctid, blkno); ItemPointerSetOffsetNumber(&htup->t_ctid, offnum);