diff --git a/configure b/configure index cbc1ebfc613..e3d534d98af 100755 --- a/configure +++ b/configure @@ -711,6 +711,7 @@ with_libxml with_uuid with_readline with_systemd +with_libseccomp with_selinux with_ldap with_krb_srvnam @@ -861,6 +862,7 @@ with_bsd_auth with_ldap with_bonjour with_selinux +with_libseccomp with_systemd with_readline with_libedit_preferred @@ -1571,6 +1573,7 @@ Optional Packages: --with-ldap build with LDAP support --with-bonjour build with Bonjour support --with-selinux build with SELinux support + --with-libseccomp build with libseccomp support --with-systemd build with systemd support --without-readline do not use GNU Readline nor BSD Libedit for editing --with-libedit-preferred @@ -8868,6 +8871,39 @@ fi { $as_echo "$as_me:${as_lineno-$LINENO}: result: $with_selinux" >&5 $as_echo "$with_selinux" >&6; } +# +# libseccomp +# +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to build with libseccomp support" >&5 +$as_echo_n "checking whether to build with libseccomp support... " >&6; } + + + +# Check whether --with-libseccomp was given. +if test "${with_libseccomp+set}" = set; then : + withval=$with_libseccomp; + case $withval in + yes) + : + ;; + no) + : + ;; + *) + as_fn_error $? "no argument expected for --with-libseccomp option" "$LINENO" 5 + ;; + esac + +else + with_libseccomp=no + +fi + + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $with_libseccomp" >&5 +$as_echo "$with_libseccomp" >&6; } + # # Systemd # @@ -14350,6 +14386,56 @@ else fi +fi + +if test "$with_libseccomp" = yes ; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for seccomp_init in -lseccomp" >&5 +$as_echo_n "checking for seccomp_init in -lseccomp... " >&6; } +if ${ac_cv_lib_seccomp_seccomp_init+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-lseccomp $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char seccomp_init (); +int +main () +{ +return seccomp_init (); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_lib_seccomp_seccomp_init=yes +else + ac_cv_lib_seccomp_seccomp_init=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_seccomp_seccomp_init" >&5 +$as_echo "$ac_cv_lib_seccomp_seccomp_init" >&6; } +if test "x$ac_cv_lib_seccomp_seccomp_init" = xyes; then : + cat >>confdefs.h <<_ACEOF +#define HAVE_LIBSECCOMP 1 +_ACEOF + + LIBS="-lseccomp $LIBS" + +else + as_fn_error $? "library 'libseccomp' is required for Seccomp BPF support" "$LINENO" 5 +fi + fi # for contrib/uuid-ossp diff --git a/configure.ac b/configure.ac index 69b2bbb5763..d3e0042791c 100644 --- a/configure.ac +++ b/configure.ac @@ -972,6 +972,14 @@ PGAC_ARG_BOOL(with, selinux, no, [build with SELinux support]) AC_SUBST(with_selinux) AC_MSG_RESULT([$with_selinux]) +# +# libseccomp +# +AC_MSG_CHECKING([whether to build with libseccomp support]) +PGAC_ARG_BOOL(with, libseccomp, no, [build with libseccomp support]) +AC_SUBST(with_libseccomp) +AC_MSG_RESULT([$with_libseccomp]) + # # Systemd # @@ -1624,6 +1632,11 @@ dnl If you want to use Apple's own Bonjour code on another platform, dnl just add -ldns_sd to LIBS manually. 
fi +if test "$with_libseccomp" = yes ; then + AC_CHECK_LIB(seccomp, seccomp_init, [], + [AC_MSG_ERROR([library 'libseccomp' is required for Seccomp BPF support])]) +fi + # for contrib/uuid-ossp if test "$with_uuid" = bsd ; then AC_CHECK_HEADERS(uuid.h, diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c index 93835449c0e..64cf5531838 100644 --- a/contrib/pg_prewarm/autoprewarm.c +++ b/contrib/pg_prewarm/autoprewarm.c @@ -54,6 +54,8 @@ #include "utils/rel.h" #include "utils/relfilenumbermap.h" #include "utils/resowner.h" +#include "utils/spccache.h" + #define AUTOPREWARM_FILE "autoprewarm.blocks" @@ -448,10 +450,12 @@ void autoprewarm_database_main(Datum main_arg) { int pos; + int io_concurrency; BlockInfoRecord *block_info; Relation rel = NULL; BlockNumber nblocks = 0; BlockInfoRecord *old_blk = NULL; + BlockInfoRecord *prefetch_blk = NULL; dsm_segment *seg; /* Establish signal handlers; once that's done, unblock signals. */ @@ -498,6 +502,7 @@ autoprewarm_database_main(Datum main_arg) { relation_close(rel, AccessShareLock); rel = NULL; + io_concurrency = -1; CommitTransactionCommand(); } @@ -517,6 +522,8 @@ autoprewarm_database_main(Datum main_arg) if (!rel) CommitTransactionCommand(); + else + io_concurrency = get_tablespace_maintenance_io_concurrency(rel->rd_rel->reltablespace); } if (!rel) { @@ -549,6 +556,35 @@ autoprewarm_database_main(Datum main_arg) continue; } + /* if prefetching is enabled for this relation */ + if (io_concurrency > 0) + { + /* make prefetch_blk catch up */ + if (blk > prefetch_blk) + { + prefetch_blk = blk; + } + + /* now, prefetch all following blocks */ + while (prefetch_blk <= &block_info[apw_state->prewarm_stop_idx]) + { + /* unless they're of a different relfilenode */ + if (prefetch_blk->filenumber != blk->filenumber || + prefetch_blk->forknum != blk->forknum || + prefetch_blk->blocknum >= nblocks) + break; + + /* or unless they are more than io_concurrency blocks ahead */ + if (blk + io_concurrency <= prefetch_blk) + break; + + PrefetchBuffer(rel, prefetch_blk->forknum, prefetch_blk->blocknum); + + /* continue with the next block */ + prefetch_blk++; + } + } + /* Prewarm buffer. */ buf = ReadBufferExtended(rel, blk->forknum, blk->blocknum, RBM_NORMAL, NULL); diff --git a/contrib/pg_prewarm/pg_prewarm.c b/contrib/pg_prewarm/pg_prewarm.c index e464d0d4d2b..fd139b2cd73 100644 --- a/contrib/pg_prewarm/pg_prewarm.c +++ b/contrib/pg_prewarm/pg_prewarm.c @@ -18,12 +18,14 @@ #include "access/relation.h" #include "fmgr.h" #include "miscadmin.h" +#include "optimizer/cost.h" #include "storage/bufmgr.h" #include "storage/smgr.h" #include "utils/acl.h" #include "utils/builtins.h" #include "utils/lsyscache.h" #include "utils/rel.h" +#include "utils/spccache.h" PG_MODULE_MAGIC; @@ -183,14 +185,26 @@ pg_prewarm(PG_FUNCTION_ARGS) } else if (ptype == PREWARM_BUFFER) { + BlockNumber prefetch_block = first_block; + Oid nspOid; + int io_concurrency; + + nspOid = rel->rd_rel->reltablespace; + io_concurrency = get_tablespace_maintenance_io_concurrency(nspOid); + /* * In buffer mode, we actually pull the data into shared_buffers. 
*/ for (block = first_block; block <= last_block; ++block) { - Buffer buf; - + Buffer buf; + BlockNumber prefetch_stop = block + Min(last_block - block + 1, + io_concurrency); CHECK_FOR_INTERRUPTS(); + while (prefetch_block < prefetch_stop) + { + PrefetchBuffer(rel, forkNumber, prefetch_block++); + } buf = ReadBufferExtended(rel, forkNumber, block, RBM_NORMAL, NULL); ReleaseBuffer(buf); ++blocks_done; diff --git a/contrib/pg_prewarm/pg_prewarm.control b/contrib/pg_prewarm/pg_prewarm.control index 40e3add4810..d40d1a000b7 100644 --- a/contrib/pg_prewarm/pg_prewarm.control +++ b/contrib/pg_prewarm/pg_prewarm.control @@ -3,3 +3,4 @@ comment = 'prewarm relation data' default_version = '1.2' module_pathname = '$libdir/pg_prewarm' relocatable = true +trusted = true diff --git a/contrib/pg_walinspect/pg_walinspect.c b/contrib/pg_walinspect/pg_walinspect.c index 796a74f322b..141018cfd2f 100644 --- a/contrib/pg_walinspect/pg_walinspect.c +++ b/contrib/pg_walinspect/pg_walinspect.c @@ -271,7 +271,7 @@ GetWALBlockInfo(FunctionCallInfo fcinfo, XLogReaderState *record, { DecodedBkpBlock *blk; BlockNumber blkno; - RelFileLocator rnode; + RelFileLocator rlocator; ForkNumber forknum; Datum values[PG_GET_WAL_BLOCK_INFO_COLS] = {0}; bool nulls[PG_GET_WAL_BLOCK_INFO_COLS] = {0}; @@ -286,7 +286,7 @@ GetWALBlockInfo(FunctionCallInfo fcinfo, XLogReaderState *record, blk = XLogRecGetBlock(record, block_id); (void) XLogRecGetBlockTagExtended(record, block_id, - &rnode, &forknum, &blkno, NULL); + &rlocator, &forknum, &blkno, NULL); /* Save block_data_len */ if (blk->has_data) diff --git a/src/Makefile.global.in b/src/Makefile.global.in index cc4dc6de91e..3db93d127fe 100644 --- a/src/Makefile.global.in +++ b/src/Makefile.global.in @@ -186,6 +186,7 @@ with_tcl = @with_tcl@ with_ssl = @with_ssl@ with_readline = @with_readline@ with_selinux = @with_selinux@ +with_libseccomp = @with_libseccomp@ with_systemd = @with_systemd@ with_gssapi = @with_gssapi@ with_krb_srvnam = @with_krb_srvnam@ diff --git a/src/backend/access/brin/brin_xlog.c b/src/backend/access/brin/brin_xlog.c index 89145b68f68..188045d224a 100644 --- a/src/backend/access/brin/brin_xlog.c +++ b/src/backend/access/brin/brin_xlog.c @@ -69,7 +69,8 @@ brin_xlog_insert_update(XLogReaderState *record, } /* need this page's blkno to store in revmap */ - regpgno = BufferGetBlockNumber(buffer); + //ZENITH XXX Don't use BufferGetBlockNumber because wal-redo doesn't pin buffer. + XLogRecGetBlockTag(record, 0, NULL, NULL, ®pgno); /* insert the index item into the page */ if (action == BLK_NEEDS_REDO) diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index 56968b95acf..4d33e986c40 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -335,6 +335,8 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) elog(ERROR, "index \"%s\" already contains data", RelationGetRelationName(index)); + smgr_start_unlogged_build(index->rd_smgr); + initGinState(&buildstate.ginstate, index); buildstate.indtuples = 0; memset(&buildstate.buildStats, 0, sizeof(GinStatsData)); @@ -408,6 +410,8 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) buildstate.buildStats.nTotalPages = RelationGetNumberOfBlocks(index); ginUpdateStats(index, &buildstate.buildStats, true); + smgr_finish_unlogged_build_phase_1(index->rd_smgr); + /* * We didn't write WAL records as we built the index, so if WAL-logging is * required, write all pages to the WAL now. 
@@ -417,8 +421,12 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) log_newpage_range(index, MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index), true); + SetLastWrittenLSNForBlockRange(XactLastRecEnd, index->rd_smgr->smgr_rlocator.locator, MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index)); + SetLastWrittenLSNForRelation(XactLastRecEnd, index->rd_smgr->smgr_rlocator.locator, MAIN_FORKNUM); } + smgr_end_unlogged_build(index->rd_smgr); + /* * Return statistics */ diff --git a/src/backend/access/gin/ginxlog.c b/src/backend/access/gin/ginxlog.c index f7c84beef8d..3cac0957fef 100644 --- a/src/backend/access/gin/ginxlog.c +++ b/src/backend/access/gin/ginxlog.c @@ -407,6 +407,7 @@ ginRedoSplit(XLogReaderState *record) rootbuf; bool isLeaf = (data->flags & GIN_INSERT_ISLEAF) != 0; bool isRoot = (data->flags & GIN_SPLIT_ROOT) != 0; + XLogRedoAction action; /* * First clear incomplete-split flag on child page if this finishes a @@ -415,21 +416,27 @@ ginRedoSplit(XLogReaderState *record) if (!isLeaf) ginRedoClearIncompleteSplit(record, 3); - if (XLogReadBufferForRedo(record, 0, &lbuffer) != BLK_RESTORED) + action = XLogReadBufferForRedo(record, 0, &lbuffer); + if (action != BLK_RESTORED && action != BLK_DONE) elog(ERROR, "GIN split record did not contain a full-page image of left page"); - if (XLogReadBufferForRedo(record, 1, &rbuffer) != BLK_RESTORED) + action = XLogReadBufferForRedo(record, 1, &rbuffer); + if (action != BLK_RESTORED && action != BLK_DONE) elog(ERROR, "GIN split record did not contain a full-page image of right page"); if (isRoot) { - if (XLogReadBufferForRedo(record, 2, &rootbuf) != BLK_RESTORED) + action = XLogReadBufferForRedo(record, 2, &rootbuf); + if (action != BLK_RESTORED && action != BLK_DONE) elog(ERROR, "GIN split record did not contain a full-page image of root page"); - UnlockReleaseBuffer(rootbuf); + if (rootbuf != InvalidBuffer) + UnlockReleaseBuffer(rootbuf); } - UnlockReleaseBuffer(rbuffer); - UnlockReleaseBuffer(lbuffer); + if (rbuffer != InvalidBuffer) + UnlockReleaseBuffer(rbuffer); + if (lbuffer != InvalidBuffer) + UnlockReleaseBuffer(lbuffer); } /* diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c index 5e0c1447f92..b60cdbb627e 100644 --- a/src/backend/access/gist/gistbuild.c +++ b/src/backend/access/gist/gistbuild.c @@ -40,6 +40,7 @@ #include "access/tableam.h" #include "access/xloginsert.h" #include "catalog/index.h" +#include "catalog/storage.h" #include "miscadmin.h" #include "optimizer/optimizer.h" #include "storage/bufmgr.h" @@ -297,6 +298,8 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) Buffer buffer; Page page; + smgr_start_unlogged_build(index->rd_smgr); + /* initialize the root page */ buffer = gistNewBuffer(index, heap); Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO); @@ -329,6 +332,8 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) gistFreeBuildBuffers(buildstate.gfbb); } + smgr_finish_unlogged_build_phase_1(index->rd_smgr); + /* * We didn't write WAL records as we built the index, so if * WAL-logging is required, write all pages to the WAL now. 
@@ -338,7 +343,13 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) log_newpage_range(index, MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index), true); + SetLastWrittenLSNForBlockRange(XactLastRecEnd, + index->rd_smgr->smgr_rlocator.locator, + MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index)); + SetLastWrittenLSNForRelation(XactLastRecEnd, index->rd_smgr->smgr_rlocator.locator, MAIN_FORKNUM); } + + smgr_end_unlogged_build(index->rd_smgr); } /* okay, all heap tuples are indexed */ @@ -463,8 +474,16 @@ gist_indexsortbuild(GISTBuildState *state) smgrwrite(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, GIST_ROOT_BLKNO, levelstate->pages[0], true); if (RelationNeedsWAL(state->indexrel)) - log_newpage(&state->indexrel->rd_locator, MAIN_FORKNUM, GIST_ROOT_BLKNO, - levelstate->pages[0], true); + { + XLogRecPtr lsn; + + lsn = log_newpage(&state->indexrel->rd_locator, MAIN_FORKNUM, GIST_ROOT_BLKNO, + levelstate->pages[0], true); + + SetLastWrittenLSNForBlock(lsn, state->indexrel->rd_smgr->smgr_rlocator.locator, + MAIN_FORKNUM, GIST_ROOT_BLKNO); + SetLastWrittenLSNForRelation(lsn, state->indexrel->rd_smgr->smgr_rlocator.locator, MAIN_FORKNUM); + } pfree(levelstate->pages[0]); pfree(levelstate); diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c index 3f60d3274d2..5e1bc1ee193 100644 --- a/src/backend/access/gist/gistvacuum.c +++ b/src/backend/access/gist/gistvacuum.c @@ -23,6 +23,7 @@ #include "storage/indexfsm.h" #include "storage/lmgr.h" #include "utils/memutils.h" +#include "utils/spccache.h" /* Working state needed by gistbulkdelete */ typedef struct @@ -130,8 +131,14 @@ gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, BlockNumber num_pages; bool needLock; BlockNumber blkno; + BlockNumber prefetch_blkno; + int io_concurrency; MemoryContext oldctx; + io_concurrency = get_tablespace_maintenance_io_concurrency( + rel->rd_rel->reltablespace + ); + /* * Reset fields that track information about the entire index now. 
This * avoids double-counting in the case where a single VACUUM command @@ -209,6 +216,7 @@ gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, needLock = !RELATION_IS_LOCAL(rel); blkno = GIST_ROOT_BLKNO; + prefetch_blkno = blkno; for (;;) { /* Get the current relation length */ @@ -221,9 +229,21 @@ gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, /* Quit if we've scanned the whole relation */ if (blkno >= num_pages) break; + + if (prefetch_blkno < blkno) + prefetch_blkno = blkno; + for (; prefetch_blkno < num_pages && + prefetch_blkno < blkno + io_concurrency; prefetch_blkno++) + PrefetchBuffer(rel, MAIN_FORKNUM, prefetch_blkno); + /* Iterate over pages, then loop back to recheck length */ for (; blkno < num_pages; blkno++) + { + if (io_concurrency > 0 && prefetch_blkno < num_pages) + PrefetchBuffer(rel, MAIN_FORKNUM, prefetch_blkno++); + gistvacuumpage(&vstate, blkno, blkno); + } } /* diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index fc5d97f606e..35fe72f6dbc 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -32,6 +32,7 @@ #include "utils/builtins.h" #include "utils/index_selfuncs.h" #include "utils/rel.h" +#include "utils/spccache.h" /* Working state for hashbuild and its callback */ typedef struct @@ -467,13 +468,17 @@ hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, Bucket orig_maxbucket; Bucket cur_maxbucket; Bucket cur_bucket; + Bucket prf_bucket; Buffer metabuf = InvalidBuffer; HashMetaPage metap; HashMetaPage cachedmetap; + int io_concurrency; tuples_removed = 0; num_index_tuples = 0; + io_concurrency = get_tablespace_maintenance_io_concurrency(rel->rd_rel->reltablespace); + /* * We need a copy of the metapage so that we can use its hashm_spares[] * values to compute bucket page addresses, but a cached copy should be @@ -488,9 +493,14 @@ hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, /* Scan the buckets that we know exist */ cur_bucket = 0; + prf_bucket = cur_bucket; cur_maxbucket = orig_maxbucket; loop_top: + for (; prf_bucket <= cur_maxbucket && + prf_bucket < cur_bucket + io_concurrency; prf_bucket++) + PrefetchBuffer(rel, MAIN_FORKNUM, BUCKET_TO_BLKNO(cachedmetap, prf_bucket)); + while (cur_bucket <= cur_maxbucket) { BlockNumber bucket_blkno; @@ -501,6 +511,12 @@ hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, Page page; bool split_cleanup = false; + if (io_concurrency > 0 && prf_bucket <= cur_maxbucket) + { + PrefetchBuffer(rel, MAIN_FORKNUM, BUCKET_TO_BLKNO(cachedmetap, prf_bucket)); + prf_bucket++; + } + /* Get address of bucket's start page */ bucket_blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket); diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 64f84a2e4bd..16fcb8f80d1 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -54,6 +54,7 @@ #include "catalog/catalog.h" #include "commands/vacuum.h" #include "miscadmin.h" +#include "optimizer/cost.h" #include "pgstat.h" #include "port/atomics.h" #include "port/pg_bitutils.h" @@ -71,6 +72,7 @@ #include "utils/relcache.h" #include "utils/snapmgr.h" #include "utils/spccache.h" +#include "access/neon_xlog.h" static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup, @@ -320,6 +322,27 @@ initscan(HeapScanDesc scan, ScanKey key, bool keep_startblock) scan->rs_startblock = 0; } + if (enable_seqscan_prefetch) + { + /* + * Do not use tablespace setting for catalog scans, as we might have + * 
the tablespace settings in the catalogs locked already, which + * might result in a deadlock. + */ + if (IsCatalogRelation(scan->rs_base.rs_rd)) + scan->rs_prefetch_maximum = effective_io_concurrency; + else + scan->rs_prefetch_maximum = + get_tablespace_io_concurrency(scan->rs_base.rs_rd->rd_rel->reltablespace); + + scan->rs_prefetch_target = 1; + } + else + { + scan->rs_prefetch_maximum = -1; + scan->rs_prefetch_target = -1; + } + scan->rs_numblocks = InvalidBlockNumber; scan->rs_inited = false; scan->rs_ctup.t_data = NULL; @@ -401,6 +424,103 @@ heapgetpage(TableScanDesc sscan, BlockNumber block) */ CHECK_FOR_INTERRUPTS(); + /* Prefetch up to io_concurrency blocks ahead */ + if (scan->rs_prefetch_maximum > 0 && scan->rs_nblocks > 1) + { + int64 nblocks; + int64 rel_scan_start; + int64 rel_scan_end; /* blockno of end of scan (mod scan->rs_nblocks) */ + int64 scan_pageoff; /* page, but adjusted for scan position as above */ + + int64 prefetch_start; /* start block of prefetch requests this iteration */ + int64 prefetch_end; /* end block of prefetch requests this iteration, if applicable */ + ParallelBlockTableScanWorker pbscanwork = scan->rs_parallelworkerdata; + ParallelBlockTableScanDesc pbscandesc = (ParallelBlockTableScanDesc) sscan->rs_parallel; + + /* + * Parallel scans look like repeated sequential table scans for + * prefetching; with a scan start at nalloc + ch_remaining - ch_size + */ + if (pbscanwork != NULL) + { + uint64 start_offset, + end_offset; + + Assert(pbscandesc != NULL); + start_offset = pbscanwork->phsw_nallocated + + pbscanwork->phsw_chunk_remaining + 1 + - pbscanwork->phsw_chunk_size; + end_offset = Min(pbscanwork->phsw_nallocated + + pbscanwork->phsw_chunk_remaining + 1, + pbscandesc->phs_nblocks); + + rel_scan_start = (int64) (pbscandesc->phs_startblock) + start_offset; + rel_scan_end = (int64) (pbscandesc->phs_startblock) + end_offset; + nblocks = pbscandesc->phs_nblocks; + } + else + { + rel_scan_start = scan->rs_startblock; + rel_scan_end = scan->rs_startblock + scan->rs_nblocks; + nblocks = scan->rs_nblocks; + } + + prefetch_end = rel_scan_end; + + if ((uint64) block < rel_scan_start) + scan_pageoff = block + nblocks; + else + scan_pageoff = block; + + Assert(rel_scan_start <= scan_pageoff && scan_pageoff <= rel_scan_end); + + /* + * If this is the first page of this seqscan, initiate prefetch of + * pages page..page + n. On each subsequent call, prefetch the next + * page that we haven't prefetched yet, at page + n. 
+ * If this is the last page of the prefetch, + */ + if (rel_scan_start != block) + { + prefetch_start = scan_pageoff + (int64) scan->rs_prefetch_target - 1; + prefetch_end = prefetch_start + 1; + } + else + { + prefetch_start = scan_pageoff; + prefetch_end = rel_scan_end; + } + + /* do not prefetch if the only page we're trying to prefetch is past the end of our scan window */ + if (prefetch_start > rel_scan_end) + prefetch_end = 0; + + if (prefetch_end > prefetch_start + scan->rs_prefetch_target) + prefetch_end = prefetch_start + scan->rs_prefetch_target; + + if (prefetch_end > rel_scan_end) + prefetch_end = rel_scan_end; + + while (prefetch_start < prefetch_end) + { + BlockNumber blckno = (prefetch_start % nblocks); + Assert(blckno < nblocks); + Assert(blckno < INT_MAX); + PrefetchBuffer(scan->rs_base.rs_rd, MAIN_FORKNUM, blckno); + prefetch_start += 1; + } + + /* + * Use exponential growth of readahead up to prefetch_maximum, to + * make sure that a low LIMIT does not result in high IO overhead, + * but operations in general are still very fast. + */ + if (scan->rs_prefetch_target < scan->rs_prefetch_maximum / 2) + scan->rs_prefetch_target *= 2; + else if (scan->rs_prefetch_target < scan->rs_prefetch_maximum) + scan->rs_prefetch_target = scan->rs_prefetch_maximum; + } + /* read page using selected strategy */ scan->rs_cbuf = ReadBufferExtended(scan->rs_base.rs_rd, MAIN_FORKNUM, block, RBM_NORMAL, scan->rs_strategy); @@ -1912,11 +2032,11 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, /* XLOG stuff */ if (RelationNeedsWAL(relation)) { - xl_heap_insert xlrec; - xl_heap_header xlhdr; + xl_neon_heap_insert xlrec; + xl_neon_heap_header xlhdr; XLogRecPtr recptr; Page page = BufferGetPage(buffer); - uint8 info = XLOG_HEAP_INSERT; + uint8 info = XLOG_NEON_HEAP_INSERT; int bufflags = 0; /* @@ -1934,7 +2054,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, if (ItemPointerGetOffsetNumber(&(heaptup->t_self)) == FirstOffsetNumber && PageGetMaxOffsetNumber(page) == FirstOffsetNumber) { - info |= XLOG_HEAP_INIT_PAGE; + info |= XLOG_NEON_INIT_PAGE; bufflags |= REGBUF_WILL_INIT; } @@ -1962,11 +2082,12 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, } XLogBeginInsert(); - XLogRegisterData((char *) &xlrec, SizeOfHeapInsert); + XLogRegisterData((char *) &xlrec, SizeOfNeonHeapInsert); xlhdr.t_infomask2 = heaptup->t_data->t_infomask2; xlhdr.t_infomask = heaptup->t_data->t_infomask; xlhdr.t_hoff = heaptup->t_data->t_hoff; + xlhdr.t_cid = HeapTupleHeaderGetRawCommandId(heaptup->t_data); /* * note we mark xlhdr as belonging to buffer; if XLogInsert decides to @@ -1974,7 +2095,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, * xl_heap_header in the xlog. 
*/ XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags); - XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader); + XLogRegisterBufData(0, (char *) &xlhdr, SizeOfNeonHeapHeader); /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */ XLogRegisterBufData(0, (char *) heaptup->t_data + SizeofHeapTupleHeader, @@ -1983,14 +2104,25 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, /* filtering by origin on a row level is much more efficient */ XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); - recptr = XLogInsert(RM_HEAP_ID, info); + recptr = XLogInsert(RM_NEON_ID, info); PageSetLSN(page, recptr); } END_CRIT_SECTION(); - UnlockReleaseBuffer(buffer); + if (options & HEAP_INSERT_SPECULATIVE) + { + /* + * NEON: speculative token is not stored in WAL, so if the page is evicted + * from the buffer cache, the token will be lost. To prevent that, we keep the + * buffer pinned. It will be unpinned in heapam_tuple_finish/abort_speculative. + */ + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + } + else + UnlockReleaseBuffer(buffer); + if (vmbuffer != InvalidBuffer) ReleaseBuffer(vmbuffer); @@ -2274,8 +2406,8 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, if (needwal) { XLogRecPtr recptr; - xl_heap_multi_insert *xlrec; - uint8 info = XLOG_HEAP2_MULTI_INSERT; + xl_neon_heap_multi_insert *xlrec; + uint8 info = XLOG_NEON_HEAP_MULTI_INSERT; char *tupledata; int totaldatalen; char *scratchptr = scratch.data; @@ -2288,9 +2420,9 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, */ init = starting_with_empty_page; - /* allocate xl_heap_multi_insert struct from the scratch area */ - xlrec = (xl_heap_multi_insert *) scratchptr; - scratchptr += SizeOfHeapMultiInsert; + /* allocate xl_neon_heap_multi_insert struct from the scratch area */ + xlrec = (xl_neon_heap_multi_insert *) scratchptr; + scratchptr += SizeOfNeonHeapMultiInsert; /* * Allocate offsets array. Unless we're reinitializing the page, @@ -2322,17 +2454,25 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, for (i = 0; i < nthispage; i++) { HeapTuple heaptup = heaptuples[ndone + i]; - xl_multi_insert_tuple *tuphdr; + xl_neon_multi_insert_tuple *tuphdr; int datalen; if (!init) xlrec->offsets[i] = ItemPointerGetOffsetNumber(&heaptup->t_self); /* xl_multi_insert_tuple needs two-byte alignment. 
*/ - tuphdr = (xl_multi_insert_tuple *) SHORTALIGN(scratchptr); - scratchptr = ((char *) tuphdr) + SizeOfMultiInsertTuple; + tuphdr = (xl_neon_multi_insert_tuple *) SHORTALIGN(scratchptr); + scratchptr = ((char *) tuphdr) + SizeOfNeonMultiInsertTuple; tuphdr->t_infomask2 = heaptup->t_data->t_infomask2; tuphdr->t_infomask = heaptup->t_data->t_infomask; + if (i == 0) + { + xlrec->t_cid = HeapTupleHeaderGetRawCommandId(heaptup->t_data); + } + else + { + Assert(xlrec->t_cid == HeapTupleHeaderGetRawCommandId(heaptup->t_data)); + } tuphdr->t_hoff = heaptup->t_data->t_hoff; /* write bitmap [+ padding] [+ oid] + data */ @@ -2359,7 +2499,7 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, if (init) { - info |= XLOG_HEAP_INIT_PAGE; + info |= XLOG_NEON_INIT_PAGE; bufflags |= REGBUF_WILL_INIT; } @@ -2379,7 +2519,7 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, /* filtering by origin on a row level is much more efficient */ XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); - recptr = XLogInsert(RM_HEAP2_ID, info); + recptr = XLogInsert(RM_NEON_ID, info); PageSetLSN(page, recptr); } @@ -2831,8 +2971,8 @@ heap_delete(Relation relation, ItemPointer tid, */ if (RelationNeedsWAL(relation)) { - xl_heap_delete xlrec; - xl_heap_header xlhdr; + xl_neon_heap_delete xlrec; + xl_neon_heap_header xlhdr; XLogRecPtr recptr; /* @@ -2851,6 +2991,7 @@ heap_delete(Relation relation, ItemPointer tid, tp.t_data->t_infomask2); xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self); xlrec.xmax = new_xmax; + xlrec.t_cid = HeapTupleHeaderGetRawCommandId(tp.t_data); if (old_key_tuple != NULL) { @@ -2861,7 +3002,7 @@ heap_delete(Relation relation, ItemPointer tid, } XLogBeginInsert(); - XLogRegisterData((char *) &xlrec, SizeOfHeapDelete); + XLogRegisterData((char *) &xlrec, SizeOfNeonHeapDelete); XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); @@ -2872,9 +3013,10 @@ heap_delete(Relation relation, ItemPointer tid, { xlhdr.t_infomask2 = old_key_tuple->t_data->t_infomask2; xlhdr.t_infomask = old_key_tuple->t_data->t_infomask; + xlhdr.t_cid = HeapTupleHeaderGetRawCommandId(old_key_tuple->t_data); xlhdr.t_hoff = old_key_tuple->t_data->t_hoff; - XLogRegisterData((char *) &xlhdr, SizeOfHeapHeader); + XLogRegisterData((char *) &xlhdr, SizeOfNeonHeapHeader); XLogRegisterData((char *) old_key_tuple->t_data + SizeofHeapTupleHeader, old_key_tuple->t_len @@ -2884,7 +3026,7 @@ heap_delete(Relation relation, ItemPointer tid, /* filtering by origin on a row level is much more efficient */ XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); - recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE); + recptr = XLogInsert(RM_NEON_ID, XLOG_NEON_HEAP_DELETE); PageSetLSN(page, recptr); } @@ -3579,7 +3721,7 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, if (RelationNeedsWAL(relation)) { - xl_heap_lock xlrec; + xl_neon_heap_lock xlrec; XLogRecPtr recptr; XLogBeginInsert(); @@ -3591,8 +3733,9 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, oldtup.t_data->t_infomask2); xlrec.flags = cleared_all_frozen ? 
XLH_LOCK_ALL_FROZEN_CLEARED : 0; - XLogRegisterData((char *) &xlrec, SizeOfHeapLock); - recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK); + xlrec.t_cid = HeapTupleHeaderGetRawCommandId(oldtup.t_data); + XLogRegisterData((char *) &xlrec, SizeOfNeonHeapLock); + recptr = XLogInsert(RM_NEON_ID, XLOG_NEON_HEAP_LOCK); PageSetLSN(page, recptr); } @@ -4789,7 +4932,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, */ if (RelationNeedsWAL(relation)) { - xl_heap_lock xlrec; + xl_neon_heap_lock xlrec; XLogRecPtr recptr; XLogBeginInsert(); @@ -4800,11 +4943,12 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, xlrec.infobits_set = compute_infobits(new_infomask, tuple->t_data->t_infomask2); xlrec.flags = cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0; - XLogRegisterData((char *) &xlrec, SizeOfHeapLock); + xlrec.t_cid = HeapTupleHeaderGetRawCommandId(tuple->t_data); + XLogRegisterData((char *) &xlrec, SizeOfNeonHeapLock); /* we don't decode row locks atm, so no need to log the origin */ - recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK); + recptr = XLogInsert(RM_NEON_ID, XLOG_NEON_HEAP_LOCK); PageSetLSN(page, recptr); } @@ -5716,6 +5860,7 @@ heap_finish_speculative(Relation relation, ItemPointer tid) END_CRIT_SECTION(); + ReleaseBuffer(buffer); /* NEON: release buffer pinned by heap_insert */ UnlockReleaseBuffer(buffer); } @@ -5788,6 +5933,16 @@ heap_abort_speculative(Relation relation, ItemPointer tid) elog(ERROR, "attempted to kill a non-speculative tuple"); Assert(!HeapTupleHeaderIsHeapOnly(tp.t_data)); + /* + * NEON: release buffer pinned by heap_insert + * + * This function is also used on the toast tuples of an aborted speculative + * insertion. For those, there is no token on the tuple, and we didn' t keep + * the pin. + */ + if (HeapTupleHeaderIsSpeculative(tp.t_data)) + ReleaseBuffer(buffer); + /* * No need to check for serializable conflicts here. There is never a * need for a combo CID, either. 
No need to extract replica identity, or @@ -5837,7 +5992,7 @@ heap_abort_speculative(Relation relation, ItemPointer tid) */ if (RelationNeedsWAL(relation)) { - xl_heap_delete xlrec; + xl_neon_heap_delete xlrec; XLogRecPtr recptr; xlrec.flags = XLH_DELETE_IS_SUPER; @@ -5845,14 +6000,15 @@ heap_abort_speculative(Relation relation, ItemPointer tid) tp.t_data->t_infomask2); xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self); xlrec.xmax = xid; + xlrec.t_cid = HeapTupleHeaderGetRawCommandId(tp.t_data); XLogBeginInsert(); - XLogRegisterData((char *) &xlrec, SizeOfHeapDelete); + XLogRegisterData((char *) &xlrec, SizeOfNeonHeapDelete); XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); /* No replica identity & replication origin logged */ - recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE); + recptr = XLogInsert(RM_NEON_ID, XLOG_NEON_HEAP_DELETE); PageSetLSN(page, recptr); } @@ -8379,9 +8535,9 @@ log_heap_update(Relation reln, Buffer oldbuf, HeapTuple old_key_tuple, bool all_visible_cleared, bool new_all_visible_cleared) { - xl_heap_update xlrec; - xl_heap_header xlhdr; - xl_heap_header xlhdr_idx; + xl_neon_heap_update xlrec; + xl_neon_heap_header xlhdr; + xl_neon_heap_header xlhdr_idx; uint8 info; uint16 prefix_suffix[2]; uint16 prefixlen = 0, @@ -8398,9 +8554,9 @@ log_heap_update(Relation reln, Buffer oldbuf, XLogBeginInsert(); if (HeapTupleIsHeapOnly(newtup)) - info = XLOG_HEAP_HOT_UPDATE; + info = XLOG_NEON_HEAP_HOT_UPDATE; else - info = XLOG_HEAP_UPDATE; + info = XLOG_NEON_HEAP_UPDATE; /* * If the old and new tuple are on the same page, we only need to log the @@ -8480,7 +8636,7 @@ log_heap_update(Relation reln, Buffer oldbuf, if (ItemPointerGetOffsetNumber(&(newtup->t_self)) == FirstOffsetNumber && PageGetMaxOffsetNumber(page) == FirstOffsetNumber) { - info |= XLOG_HEAP_INIT_PAGE; + info |= XLOG_NEON_INIT_PAGE; init = true; } else @@ -8495,6 +8651,7 @@ log_heap_update(Relation reln, Buffer oldbuf, /* Prepare WAL data for the new page */ xlrec.new_offnum = ItemPointerGetOffsetNumber(&newtup->t_self); xlrec.new_xmax = HeapTupleHeaderGetRawXmax(newtup->t_data); + xlrec.t_cid = HeapTupleHeaderGetRawCommandId(newtup->t_data); bufflags = REGBUF_STANDARD; if (init) @@ -8506,7 +8663,7 @@ log_heap_update(Relation reln, Buffer oldbuf, if (oldbuf != newbuf) XLogRegisterBuffer(1, oldbuf, REGBUF_STANDARD); - XLogRegisterData((char *) &xlrec, SizeOfHeapUpdate); + XLogRegisterData((char *) &xlrec, SizeOfNeonHeapUpdate); /* * Prepare WAL data for the new tuple. @@ -8532,6 +8689,7 @@ log_heap_update(Relation reln, Buffer oldbuf, xlhdr.t_infomask2 = newtup->t_data->t_infomask2; xlhdr.t_infomask = newtup->t_data->t_infomask; xlhdr.t_hoff = newtup->t_data->t_hoff; + xlhdr.t_cid = HeapTupleHeaderGetRawCommandId(newtup->t_data); Assert(SizeofHeapTupleHeader + prefixlen + suffixlen <= newtup->t_len); /* @@ -8539,7 +8697,7 @@ log_heap_update(Relation reln, Buffer oldbuf, * * The 'data' doesn't include the common prefix or suffix. 
*/ - XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader); + XLogRegisterBufData(0, (char *) &xlhdr, SizeOfNeonHeapHeader); if (prefixlen == 0) { XLogRegisterBufData(0, @@ -8573,8 +8731,9 @@ log_heap_update(Relation reln, Buffer oldbuf, xlhdr_idx.t_infomask2 = old_key_tuple->t_data->t_infomask2; xlhdr_idx.t_infomask = old_key_tuple->t_data->t_infomask; xlhdr_idx.t_hoff = old_key_tuple->t_data->t_hoff; + xlhdr_idx.t_cid = HeapTupleHeaderGetRawCommandId(old_key_tuple->t_data); - XLogRegisterData((char *) &xlhdr_idx, SizeOfHeapHeader); + XLogRegisterData((char *) &xlhdr_idx, SizeOfNeonHeapHeader); /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */ XLogRegisterData((char *) old_key_tuple->t_data + SizeofHeapTupleHeader, @@ -8584,7 +8743,7 @@ log_heap_update(Relation reln, Buffer oldbuf, /* filtering by origin on a row level is much more efficient */ XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); - recptr = XLogInsert(RM_HEAP_ID, info); + recptr = XLogInsert(RM_NEON_ID, info); return recptr; } @@ -8982,7 +9141,16 @@ heap_xlog_visible(XLogReaderState *record) PageSetAllVisible(page); - if (XLogHintBitIsNeeded()) + /* + * NEON: despite to the comment above we need to update page LSN here. + * See discussion at hackers: https://www.postgresql.org/message-id/flat/039076d4f6cdd871691686361f83cb8a6913a86a.camel%40j-davis.com#101ba42b004f9988e3d54fce26fb3462 + * For Neon this assignment is critical because otherwise last written LSN tracked at compute doesn't + * match with page LSN assignee by WAL-redo and as a result, prefetched page is rejected. + * + * It is fixed in upstream in https://github.com/neondatabase/postgres/commit/7bf713dd2d0739fbcd4103971ed69c17ebe677ea + * but until it is merged we still need to carry a patch here. + */ + if (true || XLogHintBitIsNeeded()) PageSetLSN(page, lsn); MarkBufferDirty(buffer); diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 5a17112c91e..f1fe3b33071 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -637,7 +637,7 @@ heapam_relation_copy_data(Relation rel, const RelFileLocator *newrlocator) { SMgrRelation dstrel; - dstrel = smgropen(*newrlocator, rel->rd_backend); + dstrel = smgropen(*newrlocator, rel->rd_backend, rel->rd_rel->relpersistence); /* * Since we copy the file directly without looking at the shared buffers, diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c index 424958912c7..ff025e07219 100644 --- a/src/backend/access/heap/rewriteheap.c +++ b/src/backend/access/heap/rewriteheap.c @@ -117,6 +117,7 @@ #include "miscadmin.h" #include "pgstat.h" #include "replication/logical.h" +#include "replication/message.h" #include "replication/slot.h" #include "storage/bufmgr.h" #include "storage/fd.h" @@ -919,6 +920,7 @@ logical_heap_rewrite_flush_mappings(RewriteState state) errmsg("could not write to file \"%s\", wrote %d of %d: %m", src->path, written, len))); src->off += len; + wallog_file_descriptor(src->path, FileGetRawDesc(src->vfd)); XLogBeginInsert(); XLogRegisterData((char *) (&xlrec), sizeof(xlrec)); @@ -1004,7 +1006,7 @@ logical_rewrite_log_mapping(RewriteState state, TransactionId xid, src->off = 0; memcpy(src->path, path, sizeof(path)); src->vfd = PathNameOpenFile(path, - O_CREAT | O_EXCL | O_WRONLY | PG_BINARY); + O_CREAT | O_EXCL | O_RDWR | PG_BINARY); if (src->vfd < 0) ereport(ERROR, (errcode_for_file_access(), @@ -1169,6 +1171,8 @@ heap_xlog_logical_rewrite(XLogReaderState *r) errmsg("could 
not fsync file \"%s\": %m", path))); pgstat_report_wait_end(); + wallog_file_descriptor(path, fd); + if (CloseTransientFile(fd) != 0) ereport(ERROR, (errcode_for_file_access(), @@ -1246,6 +1250,7 @@ CheckPointLogicalRewriteHeap(void) ereport(ERROR, (errcode_for_file_access(), errmsg("could not remove file \"%s\": %m", path))); + wallog_file_descriptor(path, -1); } else { @@ -1274,6 +1279,8 @@ CheckPointLogicalRewriteHeap(void) errmsg("could not fsync file \"%s\": %m", path))); pgstat_report_wait_end(); + wallog_file_descriptor(path, fd); + if (CloseTransientFile(fd) != 0) ereport(ERROR, (errcode_for_file_access(), diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index 4eb953f9047..05f0d8b23eb 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -52,6 +52,7 @@ #include "commands/vacuum.h" #include "executor/instrument.h" #include "miscadmin.h" +#include "optimizer/cost.h" #include "optimizer/paths.h" #include "pgstat.h" #include "portability/instr_time.h" @@ -64,6 +65,7 @@ #include "utils/memutils.h" #include "utils/pg_rusage.h" #include "utils/timestamp.h" +#include "utils/spccache.h" /* @@ -145,6 +147,9 @@ typedef struct LVRelState Relation *indrels; int nindexes; + /* prefetch */ + int io_concurrency; + /* Buffer access strategy and parallel vacuum state */ BufferAccessStrategy bstrategy; ParallelVacuumState *pvs; @@ -366,6 +371,8 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, /* Set up high level stuff about rel and its indexes */ vacrel->rel = rel; + vacrel->io_concurrency = + get_tablespace_maintenance_io_concurrency(rel->rd_rel->reltablespace); vac_open_indexes(vacrel->rel, RowExclusiveLock, &vacrel->nindexes, &vacrel->indrels); vacrel->bstrategy = bstrategy; @@ -827,6 +834,7 @@ lazy_scan_heap(LVRelState *vacrel) BlockNumber rel_pages = vacrel->rel_pages, blkno, next_unskippable_block, + next_prefetch_block, next_fsm_block_to_vacuum = 0; VacDeadItems *dead_items = vacrel->dead_items; Buffer vmbuffer = InvalidBuffer; @@ -849,6 +857,7 @@ lazy_scan_heap(LVRelState *vacrel) next_unskippable_block = lazy_scan_skip(vacrel, &vmbuffer, 0, &next_unskippable_allvis, &skipping_current_range); + next_prefetch_block = 0; for (blkno = 0; blkno < rel_pages; blkno++) { Buffer buf; @@ -941,6 +950,33 @@ lazy_scan_heap(LVRelState *vacrel) PROGRESS_VACUUM_PHASE_SCAN_HEAP); } + if (vacrel->io_concurrency > 0) + { + /* + * Prefetch io_concurrency blocks ahead + */ + uint32 prefetch_budget = vacrel->io_concurrency; + + /* never trail behind the current scan */ + if (next_prefetch_block < blkno) + next_prefetch_block = blkno; + + /* but only up to the end of the relation */ + if (prefetch_budget > rel_pages - next_prefetch_block) + prefetch_budget = rel_pages - next_prefetch_block; + + /* And only up to io_concurrency ahead of the current vacuum scan */ + if (next_prefetch_block + prefetch_budget > blkno + vacrel->io_concurrency) + prefetch_budget = blkno + vacrel->io_concurrency - next_prefetch_block; + + /* And only up to the next unskippable block */ + if (next_prefetch_block + prefetch_budget > next_unskippable_block) + prefetch_budget = next_unskippable_block - next_prefetch_block; + + for (; prefetch_budget-- > 0; next_prefetch_block++) + PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, next_prefetch_block); + } + /* * Pin the visibility map page in case we need to mark the page * all-visible. 
In most cases this will be very cheap, because we'll @@ -2413,7 +2449,8 @@ lazy_vacuum_all_indexes(LVRelState *vacrel) static void lazy_vacuum_heap_rel(LVRelState *vacrel) { - int index = 0; + int index = 0, + pindex = 0; BlockNumber vacuumed_pages = 0; Buffer vmbuffer = InvalidBuffer; LVSavedErrInfo saved_err_info; @@ -2443,6 +2480,47 @@ lazy_vacuum_heap_rel(LVRelState *vacrel) blkno = ItemPointerGetBlockNumber(&vacrel->dead_items->items[index]); vacrel->blkno = blkno; + if (vacrel->io_concurrency > 0) + { + /* + * If we're just starting out, prefetch N consecutive blocks. + * If not, only the next 1 block + */ + if (pindex == 0) + { + int prefetch_budget = Min(vacrel->dead_items->num_items, + Min(vacrel->rel_pages, + vacrel->io_concurrency)); + BlockNumber prev_prefetch = ItemPointerGetBlockNumber(&vacrel->dead_items->items[pindex]); + PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, prev_prefetch); + + while (++pindex < vacrel->dead_items->num_items && + prefetch_budget > 0) + { + ItemPointer ptr = &vacrel->dead_items->items[pindex]; + if (ItemPointerGetBlockNumber(ptr) != prev_prefetch) + { + prev_prefetch = ItemPointerGetBlockNumber(ptr); + prefetch_budget -= 1; + PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, prev_prefetch); + } + } + } + else if (pindex < vacrel->dead_items->num_items) + { + BlockNumber previous = ItemPointerGetBlockNumber(&vacrel->dead_items->items[pindex]); + while (++pindex < vacrel->dead_items->num_items) + { + BlockNumber toPrefetch = ItemPointerGetBlockNumber(&vacrel->dead_items->items[pindex]); + if (previous != toPrefetch) + { + PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, toPrefetch); + break; + } + } + } + } + /* * Pin the visibility map page in case we need to mark the page * all-visible. In most cases this will be very cheap, because we'll diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c index 2e18cd88bcf..b3e2dfe4dfa 100644 --- a/src/backend/access/heap/visibilitymap.c +++ b/src/backend/access/heap/visibilitymap.c @@ -299,7 +299,9 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf, * WAL record inserted above, so it would be incorrect to * update the heap page's LSN. */ - if (XLogHintBitIsNeeded()) + /* NEON: we have to update page LSN even if wal_log_hints=off + if (XLogHintBitIsNeeded()) + */ { Page heapPage = BufferGetPage(heapBuf); diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index 52e646c7f75..bf63b519924 100644 --- a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -1081,3 +1081,47 @@ item is irrelevant, and need not be stored at all. This arrangement corresponds to the fact that an L&Y non-leaf page has one more pointer than key. Suffix truncation's negative infinity attributes behave in the same way. + +Notes About Index Scan Prefetch +------------------------------- + +Prefetch can significantly improve the speed of OLAP queries. +To be able to perform prefetch, we need to know which pages will +be accessed during the scan. It is trivial for heap- and bitmap scans, +but requires more effort for index scans: to implement prefetch for +index scans, we need to find out subsequent leaf pages. + +Postgres links all pages at the same level of the B-Tree in a doubly linked list and uses this list for +forward and backward iteration. This list, however, can not trivially be used for prefetching because to locate the next page because we need first to load the current page. 
To prefetch more than only the next page, we can utilize the parent page's downlinks instead, as the parent contains references to most of the target page's sibling pages. + +Because Postgres' nbtree pages have no reference to their parent page, we need to remember the parent page when descending the btree and use it to prefetch subsequent pages. We will utilize the parent's linked list to improve the performance of this prefetch system past the key range of the parent page. + +We should prefetch not only leaf pages, but also the next parent page. +The trick is to correctly calculate the moment when it will be needed: +we should not issue the prefetch request when prefetch requests for all children of the current parent page have already been issued, but when there are only effective_io_concurrency line pointers left to prefetch from the page. + +Currently there are two different prefetch implementations for +index-only scan and index scan. An index-only scan doesn't need to access heap tuples, so it prefetches +only B-Tree leaf pages (and their parents). Prefetch for an index-only scan is performed only +if a parallel plan is not used. A parallel index scan uses a critical section for obtaining the next +page by a parallel worker, and the leaf page is loaded within this critical section. +If most of the time is spent loading the page, that effectively eliminates any concurrency +and makes prefetch useless. For relatively small tables Postgres will not choose a parallel plan in +any case, and for large tables a non-parallel plan can be enforced by setting max_parallel_workers_per_gather=0. + +Prefetch for a normal (not index-only) index scan tries to prefetch the heap tuples +referenced from the leaf page. The average number of items per page +is about 100, which is comparable with the default value of effective_io_concurrency, +so there is little point in also prefetching the next leaf page. + +Since it is difficult to estimate the number of entries traversed by an index scan, +we prefer not to prefetch a large number of pages from the very beginning. +Such useless prefetch can reduce the performance of point lookups. +Instead, we start with the smallest prefetch distance and increase it +by INCREASE_PREFETCH_DISTANCE_STEP after processing each item +until it reaches effective_io_concurrency. In the case of an index-only +scan we increase the prefetch distance after processing each leaf page, +and for an index scan after processing each tuple. +The only exception is the case when no key bounds are specified: +in this case we traverse the whole relation and it makes sense +to start with the largest possible prefetch distance from the very beginning. 
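As a rough standalone sketch of the ramp-up policy described above (not taken from the patch itself): the prefetch distance grows by INCREASE_PREFETCH_DISTANCE_STEP per processed item until it is capped. The helper name advance_prefetch_distance and the concrete prefetch_maximum value standing in for effective_io_concurrency are assumptions of this example only.

#include <stdio.h>

#define INCREASE_PREFETCH_DISTANCE_STEP 1

/* Grow the prefetch distance by one step, capped at prefetch_maximum
 * (which stands in for effective_io_concurrency / the tablespace's
 * io_concurrency setting in the real code). */
static int
advance_prefetch_distance(int current_distance, int prefetch_maximum)
{
	if (current_distance + INCREASE_PREFETCH_DISTANCE_STEP <= prefetch_maximum)
		return current_distance + INCREASE_PREFETCH_DISTANCE_STEP;
	return prefetch_maximum;
}

int
main(void)
{
	int distance = 0;			/* start small so point lookups stay cheap */
	int prefetch_maximum = 8;	/* assumed cap for this demo */

	/* After each processed index entry the scan looks a little further ahead. */
	for (int item = 0; item < 12; item++)
	{
		distance = advance_prefetch_distance(distance, prefetch_maximum);
		printf("item %2d -> prefetch distance %d\n", item, distance);
	}
	return 0;
}

When no key bounds are specified, the patch instead starts at the maximum distance right away, matching the last paragraph above.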
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index d33f814a938..358f8cd59d0 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -2165,7 +2165,7 @@ _bt_insert_parent(Relation rel, BlockNumberIsValid(RelationGetTargetBlock(rel)))); /* Find the leftmost page at the next level up */ - pbuf = _bt_get_endpoint(rel, opaque->btpo_level + 1, false, NULL); + pbuf = _bt_get_endpoint(rel, opaque->btpo_level + 1, false, NULL, NULL); /* Set up a phony stack entry pointing there */ stack = &fakestack; stack->bts_blkno = BufferGetBlockNumber(pbuf); diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 6c5b5c69ce5..b939b9b6947 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -37,6 +37,7 @@ #include "utils/builtins.h" #include "utils/index_selfuncs.h" #include "utils/memutils.h" +#include "utils/spccache.h" /* @@ -371,6 +372,7 @@ btbeginscan(Relation rel, int nkeys, int norderbys) so->killedItems = NULL; /* until needed */ so->numKilled = 0; + so->prefetch_maximum = 0; /* disable prefetch */ /* * We don't know yet whether the scan will be index-only, so we do not @@ -912,6 +914,8 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, BTVacState vstate; BlockNumber num_pages; BlockNumber scanblkno; + BlockNumber prefetch_blkno; + int io_concurrency; bool needLock; /* @@ -951,6 +955,9 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, vstate.maxbufsize = 0; vstate.pendingpages = NULL; vstate.npendingpages = 0; + + io_concurrency = get_tablespace_maintenance_io_concurrency(rel->rd_rel->reltablespace); + /* Consider applying _bt_pendingfsm_finalize optimization */ _bt_pendingfsm_init(rel, &vstate, (callback == NULL)); @@ -982,6 +989,8 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, needLock = !RELATION_IS_LOCAL(rel); scanblkno = BTREE_METAPAGE + 1; + prefetch_blkno = scanblkno; + for (;;) { /* Get the current relation length */ @@ -998,9 +1007,19 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, /* Quit if we've scanned the whole relation */ if (scanblkno >= num_pages) break; + + if (prefetch_blkno < scanblkno) + prefetch_blkno = scanblkno; + for (; prefetch_blkno < num_pages && + prefetch_blkno < scanblkno + io_concurrency; prefetch_blkno++) + PrefetchBuffer(rel, MAIN_FORKNUM, prefetch_blkno); + /* Iterate over pages, then loop back to recheck length */ for (; scanblkno < num_pages; scanblkno++) { + if (io_concurrency > 0 && prefetch_blkno < num_pages) + PrefetchBuffer(rel, MAIN_FORKNUM, prefetch_blkno++); + btvacuumpage(&vstate, scanblkno); if (info->report_progress) pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE, diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 3230b3b8940..cefb363acfe 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -18,11 +18,14 @@ #include "access/nbtree.h" #include "access/relscan.h" #include "access/xact.h" +#include "catalog/catalog.h" #include "miscadmin.h" +#include "optimizer/cost.h" #include "pgstat.h" #include "storage/predicate.h" #include "utils/lsyscache.h" #include "utils/rel.h" +#include "utils/spccache.h" static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp); @@ -47,6 +50,7 @@ static Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot); static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir); 
static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir); +#define INCREASE_PREFETCH_DISTANCE_STEP 1 /* * _bt_drop_lock_and_maybe_pin() @@ -847,6 +851,70 @@ _bt_compare(Relation rel, return 0; } + +/* + * _bt_read_parent_for_prefetch - read parent page and extract references to children for prefetch. + * This functions returns offset of first item. + */ +static int +_bt_read_parent_for_prefetch(IndexScanDesc scan, BlockNumber parent, ScanDirection dir) +{ + Relation rel = scan->indexRelation; + BTScanOpaque so = (BTScanOpaque) scan->opaque; + Buffer buf; + Page page; + BTPageOpaque opaque; + OffsetNumber offnum; + OffsetNumber n_child; + int next_parent_prefetch_index; + int i, j; + + buf = _bt_getbuf(rel, parent, BT_READ); + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + offnum = P_FIRSTDATAKEY(opaque); + n_child = PageGetMaxOffsetNumber(page) - offnum + 1; + + /* Position where we should insert prefetch of parent page: we intentionally use prefetch_maximum here instead of current_prefetch_distance, + * assuming that it will reach prefetch_maximum before we reach and of the parent page + */ + next_parent_prefetch_index = (n_child > so->prefetch_maximum) + ? n_child - so->prefetch_maximum : 0; + + if (ScanDirectionIsForward(dir)) + { + so->next_parent = opaque->btpo_next; + if (so->next_parent == P_NONE) + next_parent_prefetch_index = -1; + for (i = 0, j = 0; i < n_child; i++) + { + ItemId itemid = PageGetItemId(page, offnum + i); + IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); + if (i == next_parent_prefetch_index) + so->prefetch_blocks[j++] = so->next_parent; /* time to prefetch next parent page */ + so->prefetch_blocks[j++] = BTreeTupleGetDownLink(itup); + } + } + else + { + so->next_parent = opaque->btpo_prev; + if (so->next_parent == P_NONE) + next_parent_prefetch_index = -1; + for (i = 0, j = 0; i < n_child; i++) + { + ItemId itemid = PageGetItemId(page, offnum + n_child - i - 1); + IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); + if (i == next_parent_prefetch_index) + so->prefetch_blocks[j++] = so->next_parent; /* time to prefetch next parent page */ + so->prefetch_blocks[j++] = BTreeTupleGetDownLink(itup); + } + } + so->n_prefetch_blocks = j; + so->last_prefetch_index = 0; + _bt_relbuf(rel, buf); + return offnum; +} + /* * _bt_first() -- Find the first item in a scan. * @@ -1106,6 +1174,37 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) } } + /* Neon: initialize prefetch */ + so->n_prefetch_requests = 0; + so->n_prefetch_blocks = 0; + so->last_prefetch_index = 0; + so->next_parent = P_NONE; + so->prefetch_maximum = IsCatalogRelation(rel) + ? effective_io_concurrency + : get_tablespace_io_concurrency(rel->rd_rel->reltablespace); + + if (scan->xs_want_itup) /* index only scan */ + { + if (enable_indexonlyscan_prefetch) + { + /* We disable prefetch for parallel index-only scan. + * Neon prefetch is efficient only if prefetched blocks are accessed by the same worker + * which issued prefetch request. The logic of splitting pages between parallel workers in + * index scan doesn't allow to satisfy this requirement. + * Also prefetch of leave pages will be useless if expected number of rows fits in one page. 
+ */ + if (scan->parallel_scan) + so->prefetch_maximum = 0; /* disable prefetch */ + } + else + so->prefetch_maximum = 0; /* disable prefetch */ + } + else if (!enable_indexscan_prefetch || !scan->heapRelation) + so->prefetch_maximum = 0; /* disable prefetch */ + + /* If key bounds are not specified, then we will scan the whole relation and it make sense to start with the largest possible prefetch distance */ + so->current_prefetch_distance = (keysCount == 0) ? so->prefetch_maximum : 0; + /* * If we found no usable boundary keys, we have to start from one end of * the tree. Walk down that edge to the first or last key, and scan from @@ -1376,6 +1475,21 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) */ stack = _bt_search(rel, NULL, &inskey, &buf, BT_READ, scan->xs_snapshot); + /* Start prefetching for index only scan */ + if (so->prefetch_maximum > 0 && stack != NULL && scan->xs_want_itup) /* index only scan */ + { + int first_offset = _bt_read_parent_for_prefetch(scan, stack->bts_blkno, dir); + int skip = ScanDirectionIsForward(dir) + ? stack->bts_offset - first_offset + : first_offset + so->n_prefetch_blocks - 1 - stack->bts_offset; + Assert(so->n_prefetch_blocks >= skip); + so->current_prefetch_distance = INCREASE_PREFETCH_DISTANCE_STEP; + so->n_prefetch_requests = Min(so->current_prefetch_distance, so->n_prefetch_blocks - skip); + so->last_prefetch_index = skip + so->n_prefetch_requests; + for (int j = skip; j < so->last_prefetch_index; j++) + PrefetchBuffer(rel, MAIN_FORKNUM, so->prefetch_blocks[j]); + } + /* don't need to keep the stack around... */ _bt_freestack(stack); @@ -1515,8 +1629,64 @@ _bt_next(IndexScanDesc scan, ScanDirection dir) /* OK, itemIndex says what to return */ currItem = &so->currPos.items[so->currPos.itemIndex]; scan->xs_heaptid = currItem->heapTid; - if (scan->xs_want_itup) - scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset); + + if (scan->xs_want_itup) /* index-only scan */ + { + scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset); + } + else if (so->prefetch_maximum > 0) + { + int prefetchLimit, prefetchDistance; + + /* Neon: prefetch referenced heap pages. + * As far as it is difficult to predict how much items index scan will return + * we do not want to prefetch many heap pages from the very beginning because + * them may not be needed. So we are going to increase prefetch distance by INCREASE_PREFETCH_DISTANCE_STEP + * at each index scan iteration until it reaches prefetch_maximum. + */ + + /* Advance pefetch distance until it reaches prefetch_maximum */ + if (so->current_prefetch_distance + INCREASE_PREFETCH_DISTANCE_STEP <= so->prefetch_maximum) + so->current_prefetch_distance += INCREASE_PREFETCH_DISTANCE_STEP; + else + so->current_prefetch_distance = so->prefetch_maximum; + + /* How much we can prefetch */ + prefetchLimit = Min(so->current_prefetch_distance, so->currPos.lastItem - so->currPos.firstItem + 1); + + /* Active prefeth requests */ + prefetchDistance = so->n_prefetch_requests; + + /* + * Consume one prefetch request (if any) + */ + if (prefetchDistance != 0) + prefetchDistance -= 1; + + /* Keep number of active prefetch requests equal to the current prefetch distance. 
@@ -1924,6 +2094,30 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
 		so->markItemIndex = -1;
 	}
 
+	if (scan->xs_want_itup && so->prefetch_maximum > 0) /* Prefetching of leaf pages for index-only scan */
+	{
+		/* Advance prefetch distance until it reaches prefetch_maximum */
+		if (so->current_prefetch_distance + INCREASE_PREFETCH_DISTANCE_STEP <= so->prefetch_maximum)
+			so->current_prefetch_distance += INCREASE_PREFETCH_DISTANCE_STEP;
+
+		so->n_prefetch_requests -= 1; /* we load the next leaf page, so decrement the number of active prefetch requests */
+
+		/* Check if there are more children to prefetch at the current parent page */
+		if (so->last_prefetch_index == so->n_prefetch_blocks && so->next_parent != P_NONE)
+		{
+			/* we have prefetched all items from the current parent page, let's move to the next parent page */
+			_bt_read_parent_for_prefetch(scan, so->next_parent, dir);
+			so->n_prefetch_requests -= 1; /* loading the parent page consumes one more prefetch request */
+		}
+
+		/* Try to keep the number of active prefetch requests equal to the current prefetch distance */
+		while (so->n_prefetch_requests < so->current_prefetch_distance && so->last_prefetch_index < so->n_prefetch_blocks)
+		{
+			so->n_prefetch_requests += 1;
+			PrefetchBuffer(scan->indexRelation, MAIN_FORKNUM, so->prefetch_blocks[so->last_prefetch_index++]);
+		}
+	}
+
 	if (ScanDirectionIsForward(dir))
 	{
 		/* Walk right to the next page with data */
@@ -2328,7 +2522,7 @@ _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot)
  */
 Buffer
 _bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
-				 Snapshot snapshot)
+				 BlockNumber* parent, Snapshot snapshot)
 {
 	Buffer		buf;
 	Page		page;
@@ -2336,6 +2530,7 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
 	OffsetNumber offnum;
 	BlockNumber blkno;
 	IndexTuple	itup;
+	BlockNumber parent_blocknum = P_NONE;
 
 	/*
 	 * If we are looking for a leaf page, okay to descend from fast root;
@@ -2353,6 +2548,7 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
 	page = BufferGetPage(buf);
 	TestForOldSnapshot(snapshot, rel, page);
 	opaque = BTPageGetOpaque(page);
+	blkno = BufferGetBlockNumber(buf);
 
 	for (;;)
 	{
@@ -2391,6 +2587,7 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
 		offnum = P_FIRSTDATAKEY(opaque);
 		itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
+		parent_blocknum = blkno;
 		blkno = BTreeTupleGetDownLink(itup);
 
 		buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
@@ -2398,6 +2595,9 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
 		opaque =
BTPageGetOpaque(page); } + if (parent) + *parent = parent_blocknum; + return buf; } @@ -2420,13 +2620,15 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) BTPageOpaque opaque; OffsetNumber start; BTScanPosItem *currItem; + BlockNumber parent; /* * Scan down to the leftmost or rightmost leaf page. This is a simplified * version of _bt_search(). We don't maintain a stack since we know we * won't need it. */ - buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir), scan->xs_snapshot); + buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir), &parent, + scan->xs_snapshot); if (!BufferIsValid(buf)) { @@ -2439,6 +2641,17 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) return false; } + /* Start prefetching for index-only scan */ + if (so->prefetch_maximum > 0 && parent != P_NONE && scan->xs_want_itup) /* index only scan */ + { + _bt_read_parent_for_prefetch(scan, parent, dir); + so->n_prefetch_requests = so->last_prefetch_index = + Min(so->prefetch_maximum, so->n_prefetch_blocks); + + for (int i = 0; i < so->last_prefetch_index; i++) + PrefetchBuffer(rel, MAIN_FORKNUM, so->prefetch_blocks[i]); + } + PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot); page = BufferGetPage(buf); opaque = BTPageGetOpaque(page); diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c index 4443f1918df..7484149cbb7 100644 --- a/src/backend/access/spgist/spginsert.c +++ b/src/backend/access/spgist/spginsert.c @@ -85,6 +85,8 @@ spgbuild(Relation heap, Relation index, IndexInfo *indexInfo) elog(ERROR, "index \"%s\" already contains data", RelationGetRelationName(index)); + smgr_start_unlogged_build(index->rd_smgr); + /* * Initialize the meta page and root pages */ @@ -131,6 +133,8 @@ spgbuild(Relation heap, Relation index, IndexInfo *indexInfo) SpGistUpdateMetaPage(index); + smgr_finish_unlogged_build_phase_1(index->rd_smgr); + /* * We didn't write WAL records as we built the index, so if WAL-logging is * required, write all pages to the WAL now. @@ -140,8 +144,13 @@ spgbuild(Relation heap, Relation index, IndexInfo *indexInfo) log_newpage_range(index, MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index), true); + SetLastWrittenLSNForBlockRange(XactLastRecEnd, index->rd_smgr->smgr_rlocator.locator, + MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index)); + SetLastWrittenLSNForRelation(XactLastRecEnd, index->rd_smgr->smgr_rlocator.locator, MAIN_FORKNUM); } + smgr_end_unlogged_build(index->rd_smgr); + result = (IndexBuildResult *) palloc0(sizeof(IndexBuildResult)); result->heap_tuples = reltuples; result->index_tuples = buildstate.indtuples; diff --git a/src/backend/access/spgist/spgvacuum.c b/src/backend/access/spgist/spgvacuum.c index 8a5b540c809..71d770f0e44 100644 --- a/src/backend/access/spgist/spgvacuum.c +++ b/src/backend/access/spgist/spgvacuum.c @@ -27,6 +27,7 @@ #include "storage/indexfsm.h" #include "storage/lmgr.h" #include "utils/snapmgr.h" +#include "utils/spccache.h" /* Entry in pending-list of TIDs we need to revisit */ @@ -796,7 +797,14 @@ spgvacuumscan(spgBulkDeleteState *bds) Relation index = bds->info->index; bool needLock; BlockNumber num_pages, - blkno; + blkno, + prefetch_blkno; + int io_concurrency; + + /* initiate concurrency */ + io_concurrency = get_tablespace_maintenance_io_concurrency( + index->rd_rel->reltablespace + ); /* Finish setting up spgBulkDeleteState */ initSpGistState(&bds->spgstate, index); @@ -824,6 +832,8 @@ spgvacuumscan(spgBulkDeleteState *bds) * in btvacuumscan(). 
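The SP-GiST vacuum changes that begin in the hunk above (and continue in the next chunk) use the maintenance_io_concurrency of the index's tablespace to keep a prefetch pointer running ahead of the block currently being vacuumed. Reduced to its essentials, the pattern is the sketch below; scan_with_prefetch() and the "process the block" placeholder are illustrative, not code from this patch.

    /* Sketch: a sequential relation scan with a sliding prefetch window. */
    static void
    scan_with_prefetch(Relation rel, BlockNumber num_pages, int io_concurrency)
    {
        BlockNumber blkno;
        BlockNumber prefetch_blkno = 0;

        for (blkno = 0; blkno < num_pages; blkno++)
        {
            /* keep the prefetch pointer at most io_concurrency blocks ahead of blkno */
            while (io_concurrency > 0 &&
                   prefetch_blkno < num_pages &&
                   prefetch_blkno < blkno + io_concurrency)
                PrefetchBuffer(rel, MAIN_FORKNUM, prefetch_blkno++);

            /* ... read and process block 'blkno' here, e.g. spgvacuumpage() ... */
        }
    }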
*/ blkno = SPGIST_METAPAGE_BLKNO + 1; + prefetch_blkno = blkno; + for (;;) { /* Get the current relation length */ @@ -836,9 +846,19 @@ spgvacuumscan(spgBulkDeleteState *bds) /* Quit if we've scanned the whole relation */ if (blkno >= num_pages) break; + + if (prefetch_blkno < blkno) + prefetch_blkno = blkno; + for (; prefetch_blkno < num_pages && + prefetch_blkno < blkno + io_concurrency; prefetch_blkno++) + PrefetchBuffer(index, MAIN_FORKNUM, prefetch_blkno); + /* Iterate over pages, then loop back to recheck length */ for (; blkno < num_pages; blkno++) { + if (io_concurrency > 0 && prefetch_blkno < num_pages) + PrefetchBuffer(index, MAIN_FORKNUM, prefetch_blkno++); + spgvacuumpage(bds, blkno); /* empty the pending-list after each page */ if (bds->pendingList != NULL) diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c index 71ac70fb40c..6e8fd983904 100644 --- a/src/backend/access/transam/slru.c +++ b/src/backend/access/transam/slru.c @@ -59,6 +59,7 @@ #include "pgstat.h" #include "storage/fd.h" #include "storage/shmem.h" +#include "storage/smgr.h" #define SlruFileName(ctl, path, seg) \ snprintf(path, MAXPGPATH, "%s/%04X", (ctl)->Dir, seg) @@ -617,6 +618,66 @@ SimpleLruWritePage(SlruCtl ctl, int slotno) SlruInternalWritePage(ctl, slotno, NULL); } + +/* + * NEON: we do not want to include large pg_xact/multixact files in basebackup and prefer + * to download them on demand to reduce startup time. + * If SLRU segment is not found, we try to download it from page server + */ +static int +SimpleLruDownloadSegment(SlruCtl ctl, int pageno, char const* path) +{ + int segno; + int fd = -1; + int n_blocks; + char* buffer; + + static SMgrRelationData dummy_smgr_rel = {0}; + + /* If page is greater than latest written page, then do not try to download segment from server */ + if (ctl->PagePrecedes(ctl->shared->latest_page_number, pageno)) + return -1; + + if (!dummy_smgr_rel.smgr) + { + RelFileLocator rlocator = {0}; + dummy_smgr_rel.smgr = smgr(InvalidBackendId, rlocator); + } + segno = pageno / SLRU_PAGES_PER_SEGMENT; + + buffer = palloc(BLCKSZ * SLRU_PAGES_PER_SEGMENT); + n_blocks = smgr_read_slru_segment(&dummy_smgr_rel, path, segno, buffer); + if (n_blocks > 0) + { + fd = OpenTransientFile(path, O_RDWR | O_CREAT | PG_BINARY); + if (fd < 0) + { + slru_errcause = SLRU_OPEN_FAILED; + slru_errno = errno; + pfree(buffer); + return -1; + } + errno = 0; + pgstat_report_wait_start(WAIT_EVENT_SLRU_WRITE); + if (pg_pwrite(fd, buffer, n_blocks*BLCKSZ, 0) != n_blocks*BLCKSZ) + { + pgstat_report_wait_end(); + /* if write didn't set errno, assume problem is no disk space */ + if (errno == 0) + errno = ENOSPC; + slru_errcause = SLRU_WRITE_FAILED; + slru_errno = errno; + + CloseTransientFile(fd); + pfree(buffer); + return -1; + } + pgstat_report_wait_end(); + } + pfree(buffer); + return fd; +} + /* * Return whether the given page exists on disk. 
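SimpleLruDownloadSegment() above asks the pageserver for a whole SLRU segment (via smgr_read_slru_segment) and then materializes it as an ordinary segment file so that the normal SLRU read path can continue unchanged. For orientation, the page-to-segment mapping it relies on is the standard SLRU layout; the helper below is only a sketch of that arithmetic (SLRU_PAGES_PER_SEGMENT is 32 in stock PostgreSQL, so each "%04X" file holds 32 BLCKSZ-sized pages).

    /* Sketch: where an SLRU page lives on disk. */
    static off_t
    slru_page_location(int pageno, int *segno)
    {
        *segno = pageno / SLRU_PAGES_PER_SEGMENT;                    /* which "%04X" segment file */
        return (off_t) (pageno % SLRU_PAGES_PER_SEGMENT) * BLCKSZ;   /* byte offset inside that file */
    }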
* @@ -644,12 +705,18 @@ SimpleLruDoesPhysicalPageExist(SlruCtl ctl, int pageno) { /* expected: file doesn't exist */ if (errno == ENOENT) - return false; - - /* report error normally */ - slru_errcause = SLRU_OPEN_FAILED; - slru_errno = errno; - SlruReportIOError(ctl, pageno, 0); + { + fd = SimpleLruDownloadSegment(ctl, pageno, path); + if (fd < 0) + return false; + } + else + { + /* report error normally */ + slru_errcause = SLRU_OPEN_FAILED; + slru_errno = errno; + SlruReportIOError(ctl, pageno, 0); + } } if ((endpos = lseek(fd, 0, SEEK_END)) < 0) @@ -703,18 +770,30 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno) fd = OpenTransientFile(path, O_RDONLY | PG_BINARY); if (fd < 0) { - if (errno != ENOENT || !InRecovery) + if (errno != ENOENT) { slru_errcause = SLRU_OPEN_FAILED; slru_errno = errno; return false; } - - ereport(LOG, - (errmsg("file \"%s\" doesn't exist, reading as zeroes", - path))); - MemSet(shared->page_buffer[slotno], 0, BLCKSZ); - return true; + fd = SimpleLruDownloadSegment(ctl, pageno, path); + if (fd < 0) + { + if (!InRecovery) + { + slru_errcause = SLRU_OPEN_FAILED; + slru_errno = errno; + return false; + } + else + { + ereport(LOG, + (errmsg("file \"%s\" doesn't exist, reading as zeroes", + path))); + MemSet(shared->page_buffer[slotno], 0, BLCKSZ); + return true; + } + } } errno = 0; diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 06235073ce5..715ba2d133e 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -85,6 +85,7 @@ #include "replication/walreceiver.h" #include "replication/walsender.h" #include "storage/bufmgr.h" +#include "storage/buf_internals.h" #include "storage/fd.h" #include "storage/ipc.h" #include "storage/large_object.h" @@ -138,6 +139,8 @@ int wal_retrieve_retry_interval = 5000; int max_slot_wal_keep_size_mb = -1; int wal_decode_buffer_size = 512 * 1024; bool track_wal_io_timing = false; +uint64 predefined_sysidentifier; +int lastWrittenLsnCacheSize; #ifdef WAL_DEBUG bool XLOG_DEBUG = false; @@ -204,6 +207,25 @@ const struct config_enum_entry archive_mode_options[] = { {NULL, 0, false} }; +typedef struct LastWrittenLsnCacheEntry +{ + BufferTag key; + XLogRecPtr lsn; + /* double linked list for LRU replacement algorithm */ + dlist_node lru_node; +} LastWrittenLsnCacheEntry; + + +/* + * Cache of last written LSN for each relation page. + * Also to provide request LSN for smgrnblocks, smgrexists there is pseudokey=InvalidBlockId which stores LSN of last + * relation metadata update. + * Size of the cache is limited by GUC variable lastWrittenLsnCacheSize ("lsn_cache_size"), + * pages are replaced using LRU algorithm, based on L2-list. + * Access to this cache is protected by 'LastWrittenLsnLock'. + */ +static HTAB *lastWrittenLsnCache; + /* * Statistics for current checkpoint are collected in this global struct. * Because only the checkpointer or a stand-alone backend can perform @@ -556,6 +578,26 @@ typedef struct XLogCtlData */ XLogRecPtr lastFpwDisableRecPtr; + /* + * Maximal last written LSN for pages not present in lastWrittenLsnCache + */ + XLogRecPtr maxLastWrittenLsn; + + /* + * Double linked list to implement LRU replacement policy for last written LSN cache. + * Access to this list as well as to last written LSN cache is protected by 'LastWrittenLsnLock'. + */ + dlist_head lastWrittenLsnLRU; + + /* neon: copy of startup's RedoStartLSN for walproposer's use */ + XLogRecPtr RedoStartLSN; + + /* + * size of a timeline in zenith pageserver. 
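The last-written-LSN cache declared above pairs a shared dynahash table keyed by BufferTag with a doubly linked list (dlist) for LRU replacement, all protected by LastWrittenLsnLock. The full insert/evict logic appears later in SetLastWrittenLSNForBlockRange(); the sketch below isolates the touch-or-insert step to show the dynahash + dlist idiom. lru_touch() is an illustrative name and eviction is omitted here.

    /* Sketch: mark one cache entry as most recently used, inserting it if needed. */
    static void
    lru_touch(HTAB *cache, dlist_head *lru, const BufferTag *key, XLogRecPtr lsn)
    {
        bool        found;
        LastWrittenLsnCacheEntry *entry;

        entry = hash_search(cache, key, HASH_ENTER, &found);
        if (found)
            dlist_delete(&entry->lru_node);         /* unlink from its old LRU position */
        if (!found || lsn > entry->lsn)
            entry->lsn = lsn;                       /* never move the LSN backwards */
        dlist_push_tail(lru, &entry->lru_node);     /* tail = most recently used */
    }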
+ * used to enforce timeline size limit. + */ + uint64 zenithCurrentClusterSize; + slock_t info_lck; /* locks shared variables shown above */ } XLogCtlData; @@ -638,6 +680,15 @@ static bool holdingAllLocks = false; static MemoryContext walDebugCxt = NULL; #endif + +/* + * Variables read from 'zenith.signal' file. + */ +bool ZenithRecoveryRequested = false; +XLogRecPtr zenithLastRec = InvalidXLogRecPtr; +bool zenithWriteOk = false; + + static void CleanupAfterArchiveRecovery(TimeLineID EndOfLogTLI, XLogRecPtr EndOfLog, TimeLineID newTLI); @@ -648,6 +699,7 @@ static void CreateEndOfRecoveryRecord(void); static XLogRecPtr CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn, XLogRecPtr pagePtr, TimeLineID newTLI); +static void PreCheckPointGuts(int flags); static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags); static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo); static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void); @@ -3283,11 +3335,15 @@ InstallXLogFileSegment(XLogSegNo *segno, char *tmppath, XLogFilePath(path, tli, *segno, wal_segment_size); - LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); - if (!XLogCtl->InstallXLogFileSegmentActive) - { - LWLockRelease(ControlFileLock); - return false; + if (XLogCtl) + { + /* Neon: in case of sync-safekeepers shared memory is not inialized */ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + if (!XLogCtl->InstallXLogFileSegmentActive) + { + LWLockRelease(ControlFileLock); + return false; + } } if (!find_free) @@ -3303,7 +3359,8 @@ InstallXLogFileSegment(XLogSegNo *segno, char *tmppath, if ((*segno) >= max_segno) { /* Failed to find a free slot within specified range */ - LWLockRelease(ControlFileLock); + if (XLogCtl) + LWLockRelease(ControlFileLock); return false; } (*segno)++; @@ -3314,12 +3371,14 @@ InstallXLogFileSegment(XLogSegNo *segno, char *tmppath, Assert(access(path, F_OK) != 0 && errno == ENOENT); if (durable_rename(tmppath, path, LOG) != 0) { - LWLockRelease(ControlFileLock); + if (XLogCtl) + LWLockRelease(ControlFileLock); /* durable_rename already emitted log message */ return false; } - LWLockRelease(ControlFileLock); + if (XLogCtl) + LWLockRelease(ControlFileLock); return true; } @@ -4482,11 +4541,8 @@ GetActiveWalLevelOnStandby(void) return ControlFile->wal_level; } -/* - * Initialization of shared memory for XLOG - */ -Size -XLOGShmemSize(void) +static Size +XLOGCtlShmemSize(void) { Size size; @@ -4535,6 +4591,16 @@ XLOGShmemSize(void) return size; } +/* + * Initialization of shared memory for XLOG + */ +Size +XLOGShmemSize(void) +{ + return XLOGCtlShmemSize() + + hash_estimate_size(lastWrittenLsnCacheSize, sizeof(LastWrittenLsnCacheEntry)); +} + void XLOGShmemInit(void) { @@ -4562,7 +4628,18 @@ XLOGShmemInit(void) XLogCtl = (XLogCtlData *) - ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog); + ShmemInitStruct("XLOG Ctl", XLOGCtlShmemSize(), &foundXLog); + + if (lastWrittenLsnCacheSize > 0) + { + static HASHCTL info; + info.keysize = sizeof(BufferTag); + info.entrysize = sizeof(LastWrittenLsnCacheEntry); + lastWrittenLsnCache = ShmemInitHash("last_written_lsn_cache", + lastWrittenLsnCacheSize, lastWrittenLsnCacheSize, + &info, + HASH_ELEM | HASH_BLOBS); + } localControlFile = ControlFile; ControlFile = (ControlFileData *) @@ -4672,10 +4749,17 @@ BootStrapXLOG(void) * determine the initialization time of the installation, which could * perhaps be useful sometimes. 
*/ - gettimeofday(&tv, NULL); - sysidentifier = ((uint64) tv.tv_sec) << 32; - sysidentifier |= ((uint64) tv.tv_usec) << 12; - sysidentifier |= getpid() & 0xFFF; + if (predefined_sysidentifier != 0) + { + sysidentifier = predefined_sysidentifier; + } + else + { + gettimeofday(&tv, NULL); + sysidentifier = ((uint64) tv.tv_sec) << 32; + sysidentifier |= ((uint64) tv.tv_usec) << 12; + sysidentifier |= getpid() & 0xFFF; + } /* page buffer must be aligned suitably for O_DIRECT */ buffer = (char *) palloc(XLOG_BLCKSZ + XLOG_BLCKSZ); @@ -5028,6 +5112,81 @@ CheckRequiredParameterValues(void) } } +static void +readZenithSignalFile(void) +{ + int fd; + + fd = BasicOpenFile(ZENITH_SIGNAL_FILE, O_RDONLY | PG_BINARY); + if (fd >= 0) + { + struct stat statbuf; + char *content; + char prev_lsn_str[20]; + + /* Slurp the file into a string */ + if (stat(ZENITH_SIGNAL_FILE, &statbuf) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", + ZENITH_SIGNAL_FILE))); + content = palloc(statbuf.st_size + 1); + if (read(fd, content, statbuf.st_size) != statbuf.st_size) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + ZENITH_SIGNAL_FILE))); + content[statbuf.st_size] = '\0'; + + /* Parse it */ + if (sscanf(content, "PREV LSN: %19s", prev_lsn_str) != 1) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("invalid data in file \"%s\"", ZENITH_SIGNAL_FILE))); + + if (strcmp(prev_lsn_str, "invalid") == 0) + { + /* No prev LSN. Forbid starting up in read-write mode */ + zenithLastRec = InvalidXLogRecPtr; + zenithWriteOk = false; + } + else if (strcmp(prev_lsn_str, "none") == 0) + { + /* + * The page server had no valid prev LSN, but assured that it's ok + * to start without it. This happens when you start the compute + * node for the first time on a new branch. + */ + zenithLastRec = InvalidXLogRecPtr; + zenithWriteOk = true; + } + else + { + uint32 hi, + lo; + + if (sscanf(prev_lsn_str, "%X/%X", &hi, &lo) != 2) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("invalid data in file \"%s\"", ZENITH_SIGNAL_FILE))); + zenithLastRec = ((uint64) hi) << 32 | lo; + + /* If prev LSN is given, it better be valid */ + if (zenithLastRec == InvalidXLogRecPtr) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("invalid prev-LSN in file \"%s\"", ZENITH_SIGNAL_FILE))); + zenithWriteOk = true; + } + ZenithRecoveryRequested = true; + close(fd); + + elog(LOG, + "[ZENITH] found 'zenith.signal' file. setting prev LSN to %X/%X", + LSN_FORMAT_ARGS(zenithLastRec)); + } +} + /* * This must be called ONCE during postmaster or standalone-backend startup */ @@ -5059,10 +5218,15 @@ StartupXLOG(void) CurrentResourceOwner == AuxProcessResourceOwner); CurrentResourceOwner = AuxProcessResourceOwner; + /* + * Read zenith.signal before anything else. + */ + readZenithSignalFile(); + /* * Check that contents look valid. */ - if (!XRecOffIsValid(ControlFile->checkPoint)) + if (!XRecOffIsValid(ControlFile->checkPoint) && !ZenithRecoveryRequested) ereport(FATAL, (errmsg("control file contains invalid checkpoint location"))); @@ -5291,6 +5455,14 @@ StartupXLOG(void) RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo; doPageWrites = lastFullPageWrites; + /* + * Setup last written lsn cache, max written LSN. + * Starting from here, we could be modifying pages through REDO, which requires + * the existance of maxLwLsn + LwLsn LRU. 
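readZenithSignalFile() above accepts exactly three spellings of the PREV LSN value: "invalid" (no previous record, read-write start is forbidden), "none" (no previous record, but a fresh branch may start read-write), or an explicit "%X/%X" LSN. The writer of that file is not part of this diff; the snippet below is a purely hypothetical producer, shown only to document the expected format.

    /* Hypothetical writer of zenith.signal; only the line format matters here. */
    static void
    write_zenith_signal(const char *datadir, XLogRecPtr prev_lsn, bool fresh_branch)
    {
        char    path[MAXPGPATH];
        FILE   *f;

        snprintf(path, sizeof(path), "%s/zenith.signal", datadir);
        f = fopen(path, "w");
        if (f == NULL)
            elog(ERROR, "could not create \"%s\": %m", path);
        if (fresh_branch)
            fprintf(f, "PREV LSN: none\n");         /* new branch: read-write start allowed */
        else if (XLogRecPtrIsInvalid(prev_lsn))
            fprintf(f, "PREV LSN: invalid\n");      /* unknown prev record: read-only start */
        else
            fprintf(f, "PREV LSN: %X/%X\n", LSN_FORMAT_ARGS(prev_lsn));
        fclose(f);
    }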
+ */ + XLogCtl->maxLastWrittenLsn = RedoRecPtr; + dlist_init(&XLogCtl->lastWrittenLsnLRU); + /* REDO */ if (InRecovery) { @@ -6083,6 +6255,186 @@ GetInsertRecPtr(void) return recptr; } +/* + * GetLastWrittenLSN -- Returns maximal LSN of written page. + * It returns an upper bound for the last written LSN of a given page, + * either from a cached last written LSN or a global maximum last written LSN. + * If rnode is InvalidOid then we calculate maximum among all cached LSN and maxLastWrittenLsn. + * If cache is large enough, iterating through all hash items may be rather expensive. + * But GetLastWrittenLSN(InvalidOid) is used only by zenith_dbsize which is not performance critical. + */ +XLogRecPtr +GetLastWrittenLSN(RelFileLocator rlocator, ForkNumber forknum, BlockNumber blkno) +{ + XLogRecPtr lsn; + LastWrittenLsnCacheEntry* entry; + + Assert(lastWrittenLsnCacheSize != 0); + + LWLockAcquire(LastWrittenLsnLock, LW_SHARED); + + /* Maximal last written LSN among all non-cached pages */ + lsn = XLogCtl->maxLastWrittenLsn; + + if (rlocator.relNumber != InvalidOid) + { + BufferTag key; + key.spcOid = rlocator.spcOid; + key.dbOid = rlocator.dbOid; + key.relNumber = rlocator.relNumber; + key.forkNum = forknum; + key.blockNum = blkno; + entry = hash_search(lastWrittenLsnCache, &key, HASH_FIND, NULL); + if (entry != NULL) + lsn = entry->lsn; + } + else + { + HASH_SEQ_STATUS seq; + /* Find maximum of all cached LSNs */ + hash_seq_init(&seq, lastWrittenLsnCache); + while ((entry = (LastWrittenLsnCacheEntry *) hash_seq_search(&seq)) != NULL) + { + if (entry->lsn > lsn) + lsn = entry->lsn; + } + } + LWLockRelease(LastWrittenLsnLock); + + return lsn; +} + +/* + * SetLastWrittenLSNForBlockRange -- Set maximal LSN of written page range. + * We maintain cache of last written LSNs with limited size and LRU replacement + * policy. Keeping last written LSN for each page allows to use old LSN when + * requesting pages of unchanged or appended relations. Also it is critical for + * efficient work of prefetch in case massive update operations (like vacuum or remove). + * + * rlocator.relNumber can be InvalidOid, in this case maxLastWrittenLsn is updated. + * SetLastWrittenLsn with dummy rlocator is used by createdb and dbase_redo functions. 
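GetLastWrittenLSN() and the SetLastWrittenLSN* family described above are two sides of one contract: code that WAL-logs a change to some pages records the resulting LSN, and the Neon storage manager can later use the cached value as the request LSN when it fetches those pages from the pageserver. The fragment below restates the producer-side calls already used elsewhere in this patch (see the spginsert.c hunk earlier); "index", "rlocator" and "blkno" are assumed to come from the surrounding context.

    /* Producer side: WAL-log freshly built pages, then publish their last-written LSN. */
    log_newpage_range(index, MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index), true);
    SetLastWrittenLSNForBlockRange(XactLastRecEnd,
                                   index->rd_smgr->smgr_rlocator.locator,
                                   MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index));
    SetLastWrittenLSNForRelation(XactLastRecEnd,
                                 index->rd_smgr->smgr_rlocator.locator, MAIN_FORKNUM);

    /* Consumer side: the LSN to attach to a pageserver request for one page. */
    XLogRecPtr  request_lsn = GetLastWrittenLSN(rlocator, MAIN_FORKNUM, blkno);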
+ */ +void +SetLastWrittenLSNForBlockRange(XLogRecPtr lsn, RelFileLocator rlocator, ForkNumber forknum, BlockNumber from, BlockNumber n_blocks) +{ + if (lsn == InvalidXLogRecPtr || n_blocks == 0 || lastWrittenLsnCacheSize == 0) + return; + + LWLockAcquire(LastWrittenLsnLock, LW_EXCLUSIVE); + if (rlocator.relNumber == InvalidOid) + { + if (lsn > XLogCtl->maxLastWrittenLsn) + XLogCtl->maxLastWrittenLsn = lsn; + } + else + { + LastWrittenLsnCacheEntry* entry; + BufferTag key; + bool found; + BlockNumber i; + + key.spcOid = rlocator.spcOid; + key.dbOid = rlocator.dbOid; + key.relNumber = rlocator.relNumber; + key.forkNum = forknum; + for (i = 0; i < n_blocks; i++) + { + key.blockNum = from + i; + entry = hash_search(lastWrittenLsnCache, &key, HASH_ENTER, &found); + if (found) + { + if (lsn > entry->lsn) + entry->lsn = lsn; + /* Unlink from LRU list */ + dlist_delete(&entry->lru_node); + } + else + { + entry->lsn = lsn; + if (hash_get_num_entries(lastWrittenLsnCache) > lastWrittenLsnCacheSize) + { + /* Replace least recently used entry */ + LastWrittenLsnCacheEntry* victim = dlist_container(LastWrittenLsnCacheEntry, lru_node, dlist_pop_head_node(&XLogCtl->lastWrittenLsnLRU)); + /* Adjust max LSN for not cached relations/chunks if needed */ + if (victim->lsn > XLogCtl->maxLastWrittenLsn) + XLogCtl->maxLastWrittenLsn = victim->lsn; + + hash_search(lastWrittenLsnCache, victim, HASH_REMOVE, NULL); + } + } + /* Link to the end of LRU list */ + dlist_push_tail(&XLogCtl->lastWrittenLsnLRU, &entry->lru_node); + } + } + LWLockRelease(LastWrittenLsnLock); +} + +/* + * SetLastWrittenLSNForBlock -- Set maximal LSN for block + */ +void +SetLastWrittenLSNForBlock(XLogRecPtr lsn, RelFileLocator rlocator, ForkNumber forknum, BlockNumber blkno) +{ + SetLastWrittenLSNForBlockRange(lsn, rlocator, forknum, blkno, 1); +} + +/* + * SetLastWrittenLSNForRelation -- Set maximal LSN for relation metadata + */ +void +SetLastWrittenLSNForRelation(XLogRecPtr lsn, RelFileLocator rlocator, ForkNumber forknum) +{ + SetLastWrittenLSNForBlock(lsn, rlocator, forknum, REL_METADATA_PSEUDO_BLOCKNO); +} + +/* + * SetLastWrittenLSNForDatabase -- Set maximal LSN for the whole database + */ +void +SetLastWrittenLSNForDatabase(XLogRecPtr lsn) +{ + RelFileLocator dummyNode = {InvalidOid, InvalidOid, InvalidOid}; + SetLastWrittenLSNForBlock(lsn, dummyNode, MAIN_FORKNUM, 0); +} + +void +SetRedoStartLsn(XLogRecPtr RedoStartLSN) +{ + XLogCtl->RedoStartLSN = RedoStartLSN; +} + +/* + * RedoStartLsn is set only once by startup process, locking is not required + * after its exit. + */ +XLogRecPtr +GetRedoStartLsn(void) +{ + return XLogCtl->RedoStartLSN; +} + + +uint64 +GetZenithCurrentClusterSize(void) +{ + uint64 size; + SpinLockAcquire(&XLogCtl->info_lck); + size = XLogCtl->zenithCurrentClusterSize; + SpinLockRelease(&XLogCtl->info_lck); + + return size; +} + + +void +SetZenithCurrentClusterSize(uint64 size) +{ + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->zenithCurrentClusterSize = size; + SpinLockRelease(&XLogCtl->info_lck); +} + + /* * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL * position known to be fsync'd to disk. 
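GetZenithCurrentClusterSize()/SetZenithCurrentClusterSize() above are plain spinlock-protected accessors for the logical size reported by the pageserver; the limit itself is enforced outside this diff (in the Neon extension). A hypothetical enforcement site would look roughly like the following, where max_cluster_size is an assumed GUC in megabytes, not something defined by this patch.

    /* Hypothetical quota check before extending a relation. */
    uint64      size = GetZenithCurrentClusterSize();

    if (max_cluster_size >= 0 &&
        size >= (uint64) max_cluster_size * 1024 * 1024)
        ereport(ERROR,
                (errcode(ERRCODE_DISK_FULL),
                 errmsg("could not extend relation: cluster size limit exceeded"),
                 errdetail("Current logical size is " UINT64_FORMAT " bytes.", size)));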
This should only be used on a @@ -6091,8 +6443,6 @@ GetInsertRecPtr(void) XLogRecPtr GetFlushRecPtr(TimeLineID *insertTLI) { - Assert(XLogCtl->SharedRecoveryState == RECOVERY_STATE_DONE); - SpinLockAcquire(&XLogCtl->info_lck); LogwrtResult = XLogCtl->LogwrtResult; SpinLockRelease(&XLogCtl->info_lck); @@ -6498,6 +6848,11 @@ CreateCheckPoint(int flags) */ SyncPreCheckpoint(); + /* + * NEON: perform checkpiont action requiring write to the WAL before we determine the REDO pointer. + */ + PreCheckPointGuts(flags); + /* * Use a critical section to force system panic if we have trouble. */ @@ -7007,6 +7362,28 @@ CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn, XLogRecPtr pagePtr, return recptr; } +static void +CheckPointReplicationState(void) +{ + CheckPointRelationMap(); + CheckPointReplicationSlots(); + CheckPointSnapBuild(); + CheckPointLogicalRewriteHeap(); + CheckPointReplicationOrigin(); +} + +/* + * NEON: we use logical records to persist information of about slots, origins, relation map... + * If it is done inside shutdown checkpoint, then Postgres panics: "concurrent write-ahead log activity while database system is shutting down" + * So it before checkpoint REDO position is determined. + */ +static void +PreCheckPointGuts(int flags) +{ + if (flags & CHECKPOINT_IS_SHUTDOWN) + CheckPointReplicationState(); +} + /* * Flush all data in shared memory to disk, and fsync * @@ -7016,11 +7393,8 @@ CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn, XLogRecPtr pagePtr, static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags) { - CheckPointRelationMap(); - CheckPointReplicationSlots(); - CheckPointSnapBuild(); - CheckPointLogicalRewriteHeap(); - CheckPointReplicationOrigin(); + if (!(flags & CHECKPOINT_IS_SHUTDOWN)) + CheckPointReplicationState(); /* Write out all dirty data in SLRUs and the main buffer pool */ TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags); @@ -7963,6 +8337,7 @@ xlog_redo(XLogReaderState *record) for (uint8 block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++) { Buffer buffer; + XLogRedoAction result; if (!XLogRecHasBlockImage(record, block_id)) { @@ -7971,9 +8346,23 @@ xlog_redo(XLogReaderState *record) continue; } - if (XLogReadBufferForRedo(record, block_id, &buffer) != BLK_RESTORED) + result = XLogReadBufferForRedo(record, block_id, &buffer); + + if (result == BLK_DONE && (!IsUnderPostmaster || StandbyMode)) + { + /* + * NEON: In the special WAL redo process, blocks that are being + * ignored return BLK_DONE. Accept that. + * Additionally, in standby mode, blocks that are not present + * in shared buffers are ignored during replay, so we also + * ignore those blocks. 
+ */ + } + else if (result != BLK_RESTORED) elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block"); - UnlockReleaseBuffer(buffer); + + if (buffer != InvalidBuffer) + UnlockReleaseBuffer(buffer); } } else if (info == XLOG_BACKUP_END) @@ -8979,3 +9368,29 @@ SetWalWriterSleeping(bool sleeping) XLogCtl->WalWriterSleeping = sleeping; SpinLockRelease(&XLogCtl->info_lck); } + +void +XLogUpdateWalBuffers(char* data, XLogRecPtr start, size_t len) +{ + XLogRecPtr end; + int idx; + XLogRecPtr pagebegptr; + + LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE); + + end = start + len; + idx = XLogRecPtrToBufIdx(end); + pagebegptr = XLogCtl->xlblocks[idx] - XLOG_BLCKSZ; + + if (pagebegptr + XLOG_BLCKSZ >= end && pagebegptr < end) + { + /* Last page of the segment is present in WAL buffers */ + char* page = &XLogCtl->pages[idx * XLOG_BLCKSZ]; + size_t overlap = end - pagebegptr; + if (overlap <= len) + memcpy(page, data + len - overlap, overlap); + else + memcpy(page + overlap - len, data, len); + } + LWLockRelease(WALBufMappingLock); +} diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index 258cbd70355..1fc2159b193 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -37,9 +37,11 @@ #include "miscadmin.h" #include "pg_trace.h" #include "replication/origin.h" +#include "replication/walsender.h" #include "storage/bufmgr.h" #include "storage/proc.h" #include "utils/memutils.h" +#include "utils/wait_event.h" /* * Guess the maximum buffer size required to store a compressed version of @@ -87,6 +89,11 @@ typedef struct char compressed_page[COMPRESS_BUFSIZE]; } registered_buffer; +/* GUCs */ +int max_replication_apply_lag; +int max_replication_flush_lag; +int max_replication_write_lag; + static registered_buffer *registered_buffers; static int max_registered_buffers; /* allocated size */ static int max_registered_block_id = 0; /* highest block_id + 1 currently @@ -488,6 +495,11 @@ XLogInsert(RmgrId rmid, uint8 info) return EndPos; } + if (delay_backend_us != NULL && delay_backend_us() > 0) + { + InterruptPending = true; + } + do { XLogRecPtr RedoRecPtr; @@ -644,22 +656,30 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, */ if (regbuf->flags & REGBUF_STANDARD) { - /* Assume we can omit data between pd_lower and pd_upper */ - uint16 lower = ((PageHeader) page)->pd_lower; - uint16 upper = ((PageHeader) page)->pd_upper; - - if (lower >= SizeOfPageHeaderData && - upper > lower && - upper <= BLCKSZ) + if (PageIsNew(page)) { - bimg.hole_offset = lower; - cbimg.hole_length = upper - lower; + bimg.hole_offset = 0; + cbimg.hole_length = BLCKSZ; } else { - /* No "hole" to remove */ - bimg.hole_offset = 0; - cbimg.hole_length = 0; + /* Assume we can omit data between pd_lower and pd_upper */ + uint16 lower = ((PageHeader) page)->pd_lower; + uint16 upper = ((PageHeader) page)->pd_upper; + + if (lower >= SizeOfPageHeaderData && + upper > lower && + upper <= BLCKSZ) + { + bimg.hole_offset = lower; + cbimg.hole_length = upper - lower; + } + else + { + /* No "hole" to remove */ + bimg.hole_offset = 0; + cbimg.hole_length = 0; + } } } else @@ -753,6 +773,11 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, rdt_datas_last->data = page; rdt_datas_last->len = BLCKSZ; } + else if (bimg.length == 0) + { + rdt_datas_last->data = page; + rdt_datas_last->len = 0; + } else { /* must skip the hole */ diff --git a/src/backend/access/transam/xlogprefetcher.c b/src/backend/access/transam/xlogprefetcher.c index 
539928cb854..1712dc6c7c1 100644 --- a/src/backend/access/transam/xlogprefetcher.c +++ b/src/backend/access/transam/xlogprefetcher.c @@ -721,8 +721,10 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn) * We could try to have a fast path for repeated references to the * same relation (with some scheme to handle invalidations * safely), but for now we'll call smgropen() every time. + * + * Only permanent relations are WAL-logged, so RELPERSISTENCE_PERMANENT. */ - reln = smgropen(block->rlocator, InvalidBackendId); + reln = smgropen(block->rlocator, InvalidBackendId, RELPERSISTENCE_PERMANENT); /* * If the relation file doesn't exist on disk, for example because diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c index 85745fa3bf5..582708a1206 100644 --- a/src/backend/access/transam/xlogreader.c +++ b/src/backend/access/transam/xlogreader.c @@ -232,7 +232,7 @@ WALOpenSegmentInit(WALOpenSegment *seg, WALSegmentContext *segcxt, void XLogBeginRead(XLogReaderState *state, XLogRecPtr RecPtr) { - Assert(!XLogRecPtrIsInvalid(RecPtr)); + Assert(!XLogRecPtrIsInvalid(RecPtr) || state->skip_lsn_checks); ResetDecoder(state); @@ -256,6 +256,14 @@ XLogReleasePreviousRecord(XLogReaderState *state) if (!state->record) return InvalidXLogRecPtr; +#define SKIP_INVALID_RECORD(rec_ptr) do { \ + rec_ptr = MAXALIGN(rec_ptr + 1); \ + if (rec_ptr % XLOG_BLCKSZ <= MAXALIGN(1)) \ + goto restart; \ + else \ + goto skip_invalid; \ + } while (0); + /* * Remove it from the decoded record queue. It must be the oldest item * decoded, decode_queue_head. @@ -436,7 +444,7 @@ XLogReadRecord(XLogReaderState *state, char **errormsg) * Return NULL if there is no space in the decode buffer and allow_oversized * is false, or if memory allocation fails for an oversized buffer. */ -static DecodedXLogRecord * +DecodedXLogRecord * XLogReadRecordAlloc(XLogReaderState *state, size_t xl_tot_len, bool allow_oversized) { size_t required_space = DecodeXLogRecordRequiredSpace(xl_tot_len); @@ -579,7 +587,7 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) * valid record starting position or alternatively to the beginning of * a page. See the header comments for XLogBeginRead. */ - Assert(RecPtr % XLOG_BLCKSZ == 0 || XRecOffIsValid(RecPtr)); + Assert(RecPtr % XLOG_BLCKSZ == 0 || XRecOffIsValid(RecPtr) || state->skip_lsn_checks); randAccess = true; } @@ -618,18 +626,24 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) } else if (targetRecOff < pageHeaderSize) { - report_invalid_record(state, "invalid record offset at %X/%X: expected at least %u, got %u", - LSN_FORMAT_ARGS(RecPtr), - pageHeaderSize, targetRecOff); - goto err; + if(!state->skip_page_validation) + { + report_invalid_record(state, "invalid record offset at %X/%X: expected at least %u, got %u", + LSN_FORMAT_ARGS(RecPtr), + pageHeaderSize, targetRecOff); + goto err; + } } if ((((XLogPageHeader) state->readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD) && targetRecOff == pageHeaderSize) { - report_invalid_record(state, "contrecord is requested by %X/%X", - LSN_FORMAT_ARGS(RecPtr)); - goto err; + if(!state->skip_page_validation) + { + report_invalid_record(state, "contrecord is requested by %X/%X", + LSN_FORMAT_ARGS(RecPtr)); + goto err; + } } /* ReadPageInternal has verified the page header */ @@ -644,6 +658,7 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) * cannot access any other fields until we've verified that we got the * whole header. 
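The xlogreader.c changes in this and the following chunks gate most validation behind new per-reader flags (skip_page_validation, skip_lsn_checks, skip_invalid_records) and add a SKIP_INVALID_RECORD() escape hatch, so that WAL that did not originate in the local pg_wal can still be scanned. A consumer opting in would set the flags right after allocating the reader, as in the sketch below; my_page_read/my_segment_open/my_segment_close and start_lsn are placeholders for the caller's own callbacks and starting position.

    /* Sketch: a deliberately lenient WAL reader for externally supplied segments. */
    XLogReaderState *reader;

    reader = XLogReaderAllocate(wal_segment_size, NULL,
                                XL_ROUTINE(.page_read = &my_page_read,
                                           .segment_open = &my_segment_open,
                                           .segment_close = &my_segment_close),
                                NULL);
    if (reader == NULL)
        elog(ERROR, "out of memory while allocating WAL reader");

    reader->skip_page_validation = true;    /* accept foreign page headers */
    reader->skip_lsn_checks = true;         /* don't insist on matching pageaddr / prev-link */
    reader->skip_invalid_records = true;    /* step over garbage instead of bailing out */

    XLogBeginRead(reader, start_lsn);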
*/ +skip_invalid: record = (XLogRecord *) (state->readBuf + RecPtr % XLOG_BLCKSZ); total_len = record->xl_tot_len; @@ -659,7 +674,13 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) { if (!ValidXLogRecordHeader(state, RecPtr, state->DecodeRecPtr, record, randAccess)) - goto err; + { + if(!state->skip_invalid_records) + goto err; + + SKIP_INVALID_RECORD(RecPtr); + } + gotheader = true; } else @@ -667,11 +688,16 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) /* There may be no next page if it's too small. */ if (total_len < SizeOfXLogRecord) { - report_invalid_record(state, - "invalid record length at %X/%X: expected at least %u, got %u", - LSN_FORMAT_ARGS(RecPtr), - (uint32) SizeOfXLogRecord, total_len); - goto err; + if(!state->skip_invalid_records) + { + report_invalid_record(state, + "invalid record length at %X/%X: expected at least %u, got %u", + LSN_FORMAT_ARGS(RecPtr), + (uint32) SizeOfXLogRecord, total_len); + goto err; + } + + SKIP_INVALID_RECORD(RecPtr); } /* We'll validate the header once we have the next page. */ gotheader = false; @@ -756,10 +782,15 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) /* Check that the continuation on next page looks valid */ if (!(pageHeader->xlp_info & XLP_FIRST_IS_CONTRECORD)) { - report_invalid_record(state, - "there is no contrecord flag at %X/%X", - LSN_FORMAT_ARGS(RecPtr)); - goto err; + if(!state->skip_invalid_records) + { + report_invalid_record(state, + "there is no contrecord flag at %X/%X", + LSN_FORMAT_ARGS(RecPtr)); + goto err; + } + + SKIP_INVALID_RECORD(RecPtr); } /* @@ -769,12 +800,17 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) if (pageHeader->xlp_rem_len == 0 || total_len != (pageHeader->xlp_rem_len + gotlen)) { - report_invalid_record(state, - "invalid contrecord length %u (expected %lld) at %X/%X", - pageHeader->xlp_rem_len, - ((long long) total_len) - gotlen, - LSN_FORMAT_ARGS(RecPtr)); - goto err; + if(!state->skip_invalid_records) + { + report_invalid_record(state, + "invalid contrecord length %u (expected %lld) at %X/%X", + pageHeader->xlp_rem_len, + ((long long) total_len) - gotlen, + LSN_FORMAT_ARGS(RecPtr)); + goto err; + } + + SKIP_INVALID_RECORD(RecPtr); } /* Append the continuation from this page to the buffer */ @@ -805,7 +841,12 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) record = (XLogRecord *) state->readRecordBuf; if (!ValidXLogRecordHeader(state, RecPtr, state->DecodeRecPtr, record, randAccess)) - goto err; + { + if(!state->skip_invalid_records) + goto err; + + SKIP_INVALID_RECORD(RecPtr); + } gotheader = true; } @@ -835,7 +876,12 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) record = (XLogRecord *) state->readRecordBuf; if (!ValidXLogRecord(state, record, RecPtr)) - goto err; + { + if(!state->skip_invalid_records) + goto err; + + SKIP_INVALID_RECORD(RecPtr); + } pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) state->readBuf); state->DecodeRecPtr = RecPtr; @@ -854,7 +900,12 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) /* Record does not cross a page boundary */ if (!ValidXLogRecord(state, record, RecPtr)) - goto err; + { + if(!state->skip_invalid_records) + goto err; + + SKIP_INVALID_RECORD(RecPtr); + } state->NextRecPtr = RecPtr + MAXALIGN(total_len); @@ -1051,7 +1102,8 @@ ReadPageInternal(XLogReaderState *state, XLogRecPtr pageptr, int reqLen) Assert(readLen == XLOG_BLCKSZ); if (!XLogReaderValidatePageHeader(state, targetSegmentPtr, - state->readBuf)) + state->readBuf) 
&& + !state->skip_page_validation) goto err; } @@ -1092,7 +1144,8 @@ ReadPageInternal(XLogReaderState *state, XLogRecPtr pageptr, int reqLen) /* * Now that we know we have the full header, validate it. */ - if (!XLogReaderValidatePageHeader(state, pageptr, (char *) hdr)) + if (!XLogReaderValidatePageHeader(state, pageptr, (char *) hdr) && + !state->skip_page_validation) goto err; /* update read state information */ @@ -1151,7 +1204,7 @@ ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr, * We can't exactly verify the prev-link, but surely it should be less * than the record's own address. */ - if (!(record->xl_prev < RecPtr)) + if (!(record->xl_prev < RecPtr) && !state->skip_lsn_checks) { report_invalid_record(state, "record with incorrect prev-link %X/%X at %X/%X", @@ -1167,7 +1220,7 @@ ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr, * check guards against torn WAL pages where a stale but valid-looking * WAL record starts on a sector boundary. */ - if (record->xl_prev != PrevRecPtr) + if (record->xl_prev != PrevRecPtr && !state->skip_lsn_checks) { report_invalid_record(state, "record with incorrect prev-link %X/%X at %X/%X", @@ -1312,7 +1365,7 @@ XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr, * check typically fails when an old WAL segment is recycled, and hasn't * yet been overwritten with new data yet. */ - if (hdr->xlp_pageaddr != recptr) + if (hdr->xlp_pageaddr != recptr && !state->skip_lsn_checks) { char fname[MAXFNAMELEN]; @@ -1815,7 +1868,9 @@ DecodeXLogRecord(XLogReaderState *state, if ((blk->bimg_info & BKPIMAGE_HAS_HOLE) && (blk->hole_offset == 0 || blk->hole_length == 0 || - blk->bimg_len == BLCKSZ)) + blk->bimg_len == BLCKSZ) && + !(blk->hole_offset == 0 && + blk->hole_length == BLCKSZ)) /* null page */ { report_invalid_record(state, "BKPIMAGE_HAS_HOLE set, but hole offset %u length %u block image length %u at %X/%X", diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index 3c7fb913e7e..3f16d5779fd 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -338,6 +338,7 @@ typedef struct XLogRecoveryCtlData XLogRecPtr lastReplayedReadRecPtr; /* start position */ XLogRecPtr lastReplayedEndRecPtr; /* end+1 position */ TimeLineID lastReplayedTLI; /* timeline */ + ConditionVariable replayProgressCV; /* CV for waiters */ /* * When we're currently replaying a record, ie. in a redo function, @@ -467,6 +468,7 @@ XLogRecoveryShmemInit(void) SpinLockInit(&XLogRecoveryCtl->info_lck); InitSharedLatch(&XLogRecoveryCtl->recoveryWakeupLatch); + ConditionVariableInit(&XLogRecoveryCtl->replayProgressCV); ConditionVariableInit(&XLogRecoveryCtl->recoveryNotPausedCV); } @@ -488,6 +490,64 @@ EnableStandbyMode(void) disable_startup_progress_timeout(); } +/* + * Wait for recovery to complete replaying all WAL up to and including + * redoEndRecPtr. + * + * This gets woken up for every WAL record replayed, so make sure you're not + * trying to wait an LSN that is too far in the future. + */ +void +XLogWaitForReplayOf(XLogRecPtr redoEndRecPtr) +{ + static XLogRecPtr replayRecPtr = 0; + + if (!RecoveryInProgress()) + return; + + /* + * Check the backend-local variable first, we may be able to skip accessing + * shared memory (which requires locking) + */ + if (redoEndRecPtr <= replayRecPtr) + return; + + replayRecPtr = GetXLogReplayRecPtr(NULL); + + /* + * Check again if we're going to need to wait, now that we've updated + * the local cached variable. 
+ */ + if (redoEndRecPtr <= replayRecPtr) + return; + + /* + * We need to wait for the variable, so prepare for that. + * + * Note: This wakes up every time a WAL record is replayed, so this can + * be expensive. + */ + ConditionVariablePrepareToSleep(&XLogRecoveryCtl->replayProgressCV); + + while (redoEndRecPtr > replayRecPtr) + { + bool timeout; + timeout = ConditionVariableTimedSleep(&XLogRecoveryCtl->replayProgressCV, + 10000000, /* 10 seconds */ + WAIT_EVENT_RECOVERY_WAL_STREAM); + + replayRecPtr = GetXLogReplayRecPtr(NULL); + + if (timeout) + ereport(LOG, + (errmsg("Waiting for recovery to catch up to %X/%X (currently %X/%X)", + LSN_FORMAT_ARGS(redoEndRecPtr), + LSN_FORMAT_ARGS(replayRecPtr)))); + } + + ConditionVariableCancelSleep(); +} + /* * Prepare the system for WAL recovery, if needed. * @@ -564,6 +624,9 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE) ereport(LOG, (errmsg("starting point-in-time recovery to earliest consistent point"))); + else if (ZenithRecoveryRequested) + ereport(LOG, + (errmsg("starting zenith recovery"))); else ereport(LOG, (errmsg("starting archive recovery"))); @@ -720,6 +783,33 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, /* tell the caller to delete it later */ haveBackupLabel = true; } + else if (ZenithRecoveryRequested) + { + /* + * Zenith hacks to spawn compute node without WAL. Pretend that we + * just finished reading the record that started at 'zenithLastRec' + * and ended at checkpoint.redo + */ + elog(LOG, "starting with zenith basebackup at LSN %X/%X, prev %X/%X", + LSN_FORMAT_ARGS(ControlFile->checkPointCopy.redo), + LSN_FORMAT_ARGS(zenithLastRec)); + + CheckPointLoc = zenithLastRec; + CheckPointTLI = ControlFile->checkPointCopy.ThisTimeLineID; + RedoStartLSN = ControlFile->checkPointCopy.redo; + // FIXME needs review. rebase of ff41b709abea6a9c42100a4fcb0ff434b2c846c9 + // Is it still relevant? 
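XLogWaitForReplayOf(), added in the xlogrecovery.c hunk above, lets any backend on a standby block until the startup process has replayed WAL up to a given LSN; the matching ConditionVariableBroadcast() is added to ApplyWalRecord() further down. A typical caller is code that has just learned a flush position from the primary and wants subsequent reads to reflect it; received_flush_lsn below is a placeholder for that value.

    /* Sketch: wait until local replay has caught up with a known LSN. */
    XLogRecPtr  target_lsn = received_flush_lsn;    /* e.g. reported by the primary */

    /* No-op on a primary; on a standby it may log a progress message every 10 seconds. */
    XLogWaitForReplayOf(target_lsn);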
+ /* make basebackup LSN available for walproposer */ + SetRedoStartLsn(RedoStartLSN); + //EndRecPtr = ControlFile->checkPointCopy.redo; + + memcpy(&checkPoint, &ControlFile->checkPointCopy, sizeof(CheckPoint)); + wasShutdown = true; + + /* Initialize expectedTLEs, like ReadRecord() does */ + expectedTLEs = readTimeLineHistory(checkPoint.ThisTimeLineID); + XLogPrefetcherBeginRead(xlogprefetcher, ControlFile->checkPointCopy.redo); + } else { /* @@ -791,6 +881,7 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, CheckPointLoc = ControlFile->checkPoint; CheckPointTLI = ControlFile->checkPointCopy.ThisTimeLineID; RedoStartLSN = ControlFile->checkPointCopy.redo; + SetRedoStartLsn(RedoStartLSN); RedoStartTLI = ControlFile->checkPointCopy.ThisTimeLineID; record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc, CheckPointTLI); @@ -880,7 +971,7 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, (errmsg("invalid next transaction ID"))); /* sanity check */ - if (checkPoint.redo > CheckPointLoc) + if (checkPoint.redo > CheckPointLoc && !ZenithRecoveryRequested) ereport(PANIC, (errmsg("invalid redo in checkpoint record"))); @@ -1478,8 +1569,13 @@ FinishWalRecovery(void) lastRec = XLogRecoveryCtl->lastReplayedReadRecPtr; lastRecTLI = XLogRecoveryCtl->lastReplayedTLI; } - XLogPrefetcherBeginRead(xlogprefetcher, lastRec); - (void) ReadRecord(xlogprefetcher, PANIC, false, lastRecTLI); + + if (!ZenithRecoveryRequested) + { + XLogPrefetcherBeginRead(xlogprefetcher, lastRec); + (void) ReadRecord(xlogprefetcher, PANIC, false, lastRecTLI); + } + endOfLog = xlogreader->EndRecPtr; /* @@ -1513,11 +1609,55 @@ FinishWalRecovery(void) } } + /* + * When starting from a zenith base backup, we don't have WAL. Initialize + * the WAL page where we will start writing new records from scratch, + * instead. + */ + if (ZenithRecoveryRequested) + { + if (!zenithWriteOk) + { + /* + * We cannot start generating new WAL if we don't have a valid prev-LSN + * to use for the first new WAL record. (Shouldn't happen.) + */ + ereport(ERROR, + (errmsg("cannot start in read-write mode from this base backup"))); + } + else + { + int offs = endOfLog % XLOG_BLCKSZ; + char *page = palloc0(offs); + XLogRecPtr pageBeginPtr = endOfLog - offs; + int lastPageSize = ((pageBeginPtr % wal_segment_size) == 0) ? SizeOfXLogLongPHD : SizeOfXLogShortPHD; + + XLogPageHeader xlogPageHdr = (XLogPageHeader) (page); + + xlogPageHdr->xlp_pageaddr = pageBeginPtr; + xlogPageHdr->xlp_magic = XLOG_PAGE_MAGIC; + xlogPageHdr->xlp_tli = recoveryTargetTLI; + /* + * If we start writing with offset from page beginning, pretend in + * page header there is a record ending where actual data will + * start. + */ + xlogPageHdr->xlp_rem_len = offs - lastPageSize; + xlogPageHdr->xlp_info = (xlogPageHdr->xlp_rem_len > 0) ? XLP_FIRST_IS_CONTRECORD : 0; + readOff = XLogSegmentOffset(pageBeginPtr, wal_segment_size); + + result->lastPageBeginPtr = pageBeginPtr; + result->lastPage = page; + elog(LOG, "Continue writing WAL at %X/%X", LSN_FORMAT_ARGS(xlogreader->EndRecPtr)); + + // FIXME: should we unlink zenith.signal? + } + } /* * Copy the last partial block to the caller, for initializing the WAL * buffer for appending new WAL. */ - if (endOfLog % XLOG_BLCKSZ != 0) + else if (endOfLog % XLOG_BLCKSZ != 0) { char *page; int len; @@ -1569,7 +1709,10 @@ ShutdownWalRecovery(void) char recoveryPath[MAXPGPATH]; /* Final update of pg_stat_recovery_prefetch. 
*/ - XLogPrefetcherComputeStats(xlogprefetcher); + if (!ZenithRecoveryRequested) + { + XLogPrefetcherComputeStats(xlogprefetcher); + } /* Shut down xlogreader */ if (readFile >= 0) @@ -1578,7 +1721,11 @@ ShutdownWalRecovery(void) readFile = -1; } XLogReaderFree(xlogreader); - XLogPrefetcherFree(xlogprefetcher); + + if (!ZenithRecoveryRequested) + { + XLogPrefetcherFree(xlogprefetcher); + } if (ArchiveRecoveryRequested) { @@ -1668,7 +1815,10 @@ PerformWalRecovery(void) else { /* just have to read next record after CheckPoint */ - Assert(xlogreader->ReadRecPtr == CheckPointLoc); + if (ZenithRecoveryRequested) + xlogreader->ReadRecPtr = CheckPointLoc; + else + Assert(xlogreader->ReadRecPtr == CheckPointLoc); replayTLI = CheckPointTLI; record = ReadRecord(xlogprefetcher, LOG, false, replayTLI); } @@ -2012,6 +2162,8 @@ ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *repl /* Reset the prefetcher. */ XLogPrefetchReconfigure(); } + + ConditionVariableBroadcast(&XLogRecoveryCtl->replayProgressCV); } /* diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index 43f7b31205d..68df6a3b56c 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -33,6 +33,8 @@ #include "utils/rel.h" +bool (*redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id); + /* GUC variable */ bool ignore_invalid_pages = false; @@ -373,6 +375,21 @@ XLogReadBufferForRedoExtended(XLogReaderState *record, block_id); } + if (redo_read_buffer_filter && redo_read_buffer_filter(record, block_id)) + { + if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) + { + *buf = ReadBufferWithoutRelcache(rlocator, forknum, + blkno, mode, NULL, true); + return BLK_DONE; + } + else + { + *buf = InvalidBuffer; + return BLK_DONE; + } + } + /* * Make sure that if the block is marked with WILL_INIT, the caller is * going to initialize it. And vice versa. @@ -491,7 +508,7 @@ XLogReadBufferExtended(RelFileLocator rlocator, ForkNumber forknum, } /* Open the relation at smgr level */ - smgr = smgropen(rlocator, InvalidBackendId); + smgr = smgropen(rlocator, InvalidBackendId, RELPERSISTENCE_PERMANENT); /* * Create the target file if it doesn't already exist. 
This lets us cope diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c index 49e956b2c57..2ef26efc816 100644 --- a/src/backend/bootstrap/bootstrap.c +++ b/src/backend/bootstrap/bootstrap.c @@ -47,6 +47,7 @@ uint32 bootstrap_data_checksum_version = 0; /* No checksum */ +extern uint64 predefined_sysidentifier; static void CheckerModeMain(void); static void bootstrap_signals(void); @@ -221,13 +222,23 @@ BootstrapModeMain(int argc, char *argv[], bool check_only) argv++; argc--; - while ((flag = getopt(argc, argv, "B:c:d:D:Fkr:X:-:")) != -1) + while ((flag = getopt(argc, argv, "B:c:d:D:Fkr:s:X:-:")) != -1) { switch (flag) { case 'B': SetConfigOption("shared_buffers", optarg, PGC_POSTMASTER, PGC_S_ARGV); break; + case 's': + { + char* endptr; +#ifdef HAVE_STRTOULL + predefined_sysidentifier = strtoull(optarg, &endptr, 10); +#else + predefined_sysidentifier = strtoul(optarg, &endptr, 10); +#endif + break; + } case 'c': case '-': { diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c index 2add0534891..c6afda4e64d 100644 --- a/src/backend/catalog/storage.c +++ b/src/backend/catalog/storage.c @@ -145,7 +145,7 @@ RelationCreateStorage(RelFileLocator rlocator, char relpersistence, return NULL; /* placate compiler */ } - srel = smgropen(rlocator, backend); + srel = smgropen(rlocator, backend, relpersistence); smgrcreate(srel, MAIN_FORKNUM, false); if (needs_wal) @@ -185,6 +185,7 @@ void log_smgrcreate(const RelFileLocator *rlocator, ForkNumber forkNum) { xl_smgr_create xlrec; + XLogRecPtr lsn; /* * Make an XLOG entry reporting the file creation. @@ -194,7 +195,8 @@ log_smgrcreate(const RelFileLocator *rlocator, ForkNumber forkNum) XLogBeginInsert(); XLogRegisterData((char *) &xlrec, sizeof(xlrec)); - XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE | XLR_SPECIAL_REL_UPDATE); + lsn = XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE | XLR_SPECIAL_REL_UPDATE); + SetLastWrittenLSNForRelation(lsn, *rlocator, forkNum); } /* @@ -678,7 +680,7 @@ smgrDoPendingDeletes(bool isCommit) { SMgrRelation srel; - srel = smgropen(pending->rlocator, pending->backend); + srel = smgropen(pending->rlocator, pending->backend, 0); /* allocate the initial array, or extend it, if needed */ if (maxrels == 0) @@ -759,7 +761,7 @@ smgrDoPendingSyncs(bool isCommit, bool isParallelWorker) BlockNumber total_blocks = 0; SMgrRelation srel; - srel = smgropen(pendingsync->rlocator, InvalidBackendId); + srel = smgropen(pendingsync->rlocator, InvalidBackendId, 0); /* * We emit newpage WAL records for smaller relations. 
@@ -968,7 +970,7 @@ smgr_redo(XLogReaderState *record) xl_smgr_create *xlrec = (xl_smgr_create *) XLogRecGetData(record); SMgrRelation reln; - reln = smgropen(xlrec->rlocator, InvalidBackendId); + reln = smgropen(xlrec->rlocator, InvalidBackendId, RELPERSISTENCE_PERMANENT); smgrcreate(reln, xlrec->forkNum, true); } else if (info == XLOG_SMGR_TRUNCATE) @@ -981,7 +983,7 @@ smgr_redo(XLogReaderState *record) int nforks = 0; bool need_fsm_vacuum = false; - reln = smgropen(xlrec->rlocator, InvalidBackendId); + reln = smgropen(xlrec->rlocator, InvalidBackendId, RELPERSISTENCE_PERMANENT); /* * Forcibly create relation if it doesn't exist (which suggests that diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index aa48fdf8d21..2bf4a34dd54 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -276,7 +276,7 @@ ScanSourceDatabasePgClass(Oid tbid, Oid dbid, char *srcpath) rlocator.dbOid = dbid; rlocator.relNumber = relfilenumber; - smgr = smgropen(rlocator, InvalidBackendId); + smgr = smgropen(rlocator, InvalidBackendId, RELPERSISTENCE_PERMANENT); nblocks = smgrnblocks(smgr, MAIN_FORKNUM); smgrclose(smgr); @@ -1461,6 +1461,11 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) CreateDatabaseUsingFileCopy(src_dboid, dboid, src_deftablespace, dst_deftablespace); + /* + * Update global last written LSN after wal-logging create database command + */ + SetLastWrittenLSNForDatabase(XactLastRecEnd); + /* * Close pg_database, but keep lock till commit. */ @@ -2102,6 +2107,7 @@ movedb(const char *dbname, const char *tblspcname) */ { xl_dbase_create_file_copy_rec xlrec; + XLogRecPtr lsn; xlrec.db_id = db_id; xlrec.tablespace_id = dst_tblspcoid; @@ -2112,8 +2118,10 @@ movedb(const char *dbname, const char *tblspcname) XLogRegisterData((char *) &xlrec, sizeof(xl_dbase_create_file_copy_rec)); - (void) XLogInsert(RM_DBASE_ID, - XLOG_DBASE_CREATE_FILE_COPY | XLR_SPECIAL_REL_UPDATE); + lsn = XLogInsert(RM_DBASE_ID, + XLOG_DBASE_CREATE_FILE_COPY | XLR_SPECIAL_REL_UPDATE); + // TODO: Do we really need to set the LSN here? + SetLastWrittenLSNForDatabase(lsn); } /* @@ -3253,6 +3261,15 @@ dbase_redo(XLogReaderState *record) */ copydir(src_path, dst_path, false); + /* + * Make sure any future requests to the page server see the new + * database. + */ + { + XLogRecPtr lsn = record->EndRecPtr; + SetLastWrittenLSNForDatabase(lsn); + } + pfree(src_path); pfree(dst_path); } @@ -3273,6 +3290,16 @@ dbase_redo(XLogReaderState *record) /* Create the database directory with the version file. */ CreateDirAndVersionFile(dbpath, xlrec->db_id, xlrec->tablespace_id, true); + + /* + * Make sure any future requests to the page server see the new + * database. 
+ */ + { + XLogRecPtr lsn = record->EndRecPtr; + SetLastWrittenLSNForDatabase(lsn); + } + pfree(dbpath); } else if (info == XLOG_DBASE_DROP) diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 6c2e5c8a4f9..eca23de3ae3 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -47,7 +47,6 @@ ExplainOneQuery_hook_type ExplainOneQuery_hook = NULL; /* Hook for plugins to get control in explain_get_index_name() */ explain_get_index_name_hook_type explain_get_index_name_hook = NULL; - /* OR-able flags for ExplainXMLTag() */ #define X_OPENING 0 #define X_CLOSING 1 @@ -121,6 +120,7 @@ static void show_eval_params(Bitmapset *bms_params, ExplainState *es); static const char *explain_get_index_name(Oid indexId); static void show_buffer_usage(ExplainState *es, const BufferUsage *usage, bool planning); +static void show_prefetch_info(ExplainState *es, const PrefetchInfo* prefetch_info); static void show_wal_usage(ExplainState *es, const WalUsage *usage); static void ExplainIndexScanDetails(Oid indexid, ScanDirection indexorderdir, ExplainState *es); @@ -186,6 +186,8 @@ ExplainQuery(ParseState *pstate, ExplainStmt *stmt, es->costs = defGetBoolean(opt); else if (strcmp(opt->defname, "buffers") == 0) es->buffers = defGetBoolean(opt); + else if (strcmp(opt->defname, "prefetch") == 0) + es->prefetch = defGetBoolean(opt); else if (strcmp(opt->defname, "wal") == 0) es->wal = defGetBoolean(opt); else if (strcmp(opt->defname, "settings") == 0) @@ -543,7 +545,7 @@ ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es, else if (es->analyze) instrument_option |= INSTRUMENT_ROWS; - if (es->buffers) + if (es->buffers || es->prefetch) instrument_option |= INSTRUMENT_BUFFERS; if (es->wal) instrument_option |= INSTRUMENT_WAL; @@ -2102,6 +2104,10 @@ ExplainNode(PlanState *planstate, List *ancestors, if (es->wal && planstate->instrument) show_wal_usage(es, &planstate->instrument->walusage); + /* Show prefetch usage */ + if (es->prefetch && planstate->instrument) + show_prefetch_info(es, &planstate->instrument->bufusage.prefetch); + /* Prepare per-worker buffer/WAL usage */ if (es->workers_state && (es->buffers || es->wal) && es->verbose) { @@ -3536,6 +3542,34 @@ explain_get_index_name(Oid indexId) return result; } +/* + * Show prefetch statistics + */ +static void +show_prefetch_info(ExplainState *es, const PrefetchInfo* prefetch_info) +{ + if (es->format == EXPLAIN_FORMAT_TEXT) + { + ExplainIndentText(es); + appendStringInfo(es->str, "Prefetch: hits=%lld misses=%lld expired=%lld duplicates=%lld\n", + (long long) prefetch_info->hits, + (long long) prefetch_info->misses, + (long long) prefetch_info->expired, + (long long) prefetch_info->duplicates); + } + else + { + ExplainPropertyInteger("Prefetch Hits", NULL, + prefetch_info->hits, es); + ExplainPropertyInteger("Prefetch Misses", NULL, + prefetch_info->misses, es); + ExplainPropertyInteger("Prefetch Expired Requests", NULL, + prefetch_info->expired, es); + ExplainPropertyInteger("Prefetch Duplicated Requests", NULL, + prefetch_info->duplicates, es); + } +} + /* * Show buffer usage details. 
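show_prefetch_info() above only formats counters; the counters themselves are the new prefetch member of BufferUsage (see the instrument.c hunk below) and are expected to be incremented by the Neon storage manager as prefetch requests are issued and consumed — that code is outside this diff. With instrumentation enabled, the new option is requested with EXPLAIN (ANALYZE, PREFETCH). A hypothetical counting site is sketched below; slot_is_expired and found_in_prefetch_ring are assumed conditions, not variables defined in this patch.

    /* Hypothetical accounting in the prefetch-receive path (not part of this diff). */
    if (slot_is_expired)
        pgBufferUsage.prefetch.expired++;       /* prefetched copy could no longer be used */
    else if (found_in_prefetch_ring)
        pgBufferUsage.prefetch.hits++;          /* page had already been prefetched */
    else
        pgBufferUsage.prefetch.misses++;        /* page had not been prefetched */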
 */
diff --git a/src/backend/commands/extension.c b/src/backend/commands/extension.c
index 2ff0d691d86..dec294de630 100644
--- a/src/backend/commands/extension.c
+++ b/src/backend/commands/extension.c
@@ -403,6 +403,7 @@ get_extension_script_directory(ExtensionControlFile *control)
 {
 	char		sharepath[MAXPGPATH];
 	char	   *result;
+	struct stat fst;
 
 	/*
 	 * The directory parameter can be omitted, absolute, or relative to the
@@ -418,6 +419,16 @@ get_extension_script_directory(ExtensionControlFile *control)
 	result = (char *) palloc(MAXPGPATH);
 	snprintf(result, MAXPGPATH, "%s/%s", sharepath, control->directory);
 
+	// If directory does not exist, check remote extension storage
+	if (stat(result, &fst) < 0)
+	{
+		// request download of extension files for control->directory
+		if (download_extension_file_hook != NULL)
+		{
+			download_extension_file_hook(control->directory, false);
+		}
+	}
+
 	return result;
 }
 
@@ -1508,6 +1519,13 @@ CreateExtensionInternal(char *extensionName,
 	 * will get us there.
 	 */
 	filename = get_extension_script_filename(pcontrol, NULL, versionName);
+
+	// request download of extension files from compute_ctl
+	if (download_extension_file_hook != NULL)
+	{
+		download_extension_file_hook(extensionName, false);
+	}
+
 	if (stat(filename, &fst) == 0)
 	{
 		/* Easy, no extra scripts */
diff --git a/src/backend/commands/publicationcmds.c b/src/backend/commands/publicationcmds.c
index f4ba572697a..a618ac5c7b2 100644
--- a/src/backend/commands/publicationcmds.c
+++ b/src/backend/commands/publicationcmds.c
@@ -728,6 +728,13 @@ CheckPubRelationColumnList(char *pubname, List *tables,
 		}
 	}
 
+static bool
+is_neon_superuser(void)
+{
+	Oid neon_superuser_oid = get_role_oid("neon_superuser", true /*missing_ok*/);
+	return neon_superuser_oid != InvalidOid && has_privs_of_role(GetUserId(), neon_superuser_oid);
+}
+
 /*
  * Create new publication.
  */
@@ -755,7 +762,7 @@ CreatePublication(ParseState *pstate, CreatePublicationStmt *stmt)
 				get_database_name(MyDatabaseId));
 
 	/* FOR ALL TABLES requires superuser */
-	if (stmt->for_all_tables && !superuser())
+	if (stmt->for_all_tables && !superuser() && !is_neon_superuser())
 		ereport(ERROR,
 				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
 				 errmsg("must be superuser to create FOR ALL TABLES publication")));
@@ -826,7 +833,7 @@ CreatePublication(ParseState *pstate, CreatePublicationStmt *stmt)
 										  &schemaidlist);
 
 		/* FOR TABLES IN SCHEMA requires superuser */
-		if (schemaidlist != NIL && !superuser())
+		if (schemaidlist != NIL && !superuser() && !is_neon_superuser())
 			ereport(ERROR,
 					errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
 					errmsg("must be superuser to create FOR TABLES IN SCHEMA publication"));
diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c
index c7e262c0fcc..f14811e63bd 100644
--- a/src/backend/commands/sequence.c
+++ b/src/backend/commands/sequence.c
@@ -54,7 +54,9 @@
  * so we pre-log a few fetches in advance. In the event of
  * crash we can lose (skip over) as many values as we pre-logged.
  */
-#define SEQ_LOG_VALS	32
+/* NEON XXX: to preserve the ordering of sequence values in Zenith we need to WAL-log each sequence update. */
+/* #define SEQ_LOG_VALS	32 */
+#define SEQ_LOG_VALS	0
 
 /*
  * The "special area" of a sequence's buffer page looks like this.
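
The extension.c changes above (and the dfmgr.c changes at the end of this patch) fall back to download_extension_file_hook whenever an extension script directory or shared library is missing on local disk. The hook's typedef is not visible in this diff; assuming a signature that matches the call sites, a name plus an is-library flag, a compute-side agent could install it roughly like this. All names below are illustrative.

    /* Assumed to match the call sites in extension.c and dfmgr.c. */
    typedef void (*download_extension_file_hook_type) (const char *name, bool is_library);
    extern download_extension_file_hook_type download_extension_file_hook;

    static void
    download_from_remote_storage(const char *name, bool is_library)
    {
        /*
         * Fetch "name" (an extension script directory or a shared library)
         * from remote extension storage and unpack it under the server's
         * sharedir or libdir, so that the retried stat()/dlopen() succeeds.
         */
    }

    void
    _PG_init(void)
    {
        download_extension_file_hook = download_from_remote_storage;
    }
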
@@ -355,7 +357,7 @@ fill_seq_with_data(Relation rel, HeapTuple tuple) { SMgrRelation srel; - srel = smgropen(rel->rd_locator, InvalidBackendId); + srel = smgropen(rel->rd_locator, InvalidBackendId, rel->rd_rel->relpersistence); smgrcreate(srel, INIT_FORKNUM, false); log_smgrcreate(&rel->rd_locator, INIT_FORKNUM); fill_seq_fork_with_data(rel, tuple, INIT_FORKNUM); diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 90a5238c4d4..3b9ff19bd8e 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -14795,7 +14795,7 @@ index_copy_data(Relation rel, RelFileLocator newrlocator) { SMgrRelation dstrel; - dstrel = smgropen(newrlocator, rel->rd_backend); + dstrel = smgropen(newrlocator, rel->rd_backend, rel->rd_rel->relpersistence); /* * Since we copy the file directly without looking at the shared buffers, diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index ee78a5749d2..01978c79f5c 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -235,6 +235,10 @@ BufferUsageAdd(BufferUsage *dst, const BufferUsage *add) dst->local_blks_written += add->local_blks_written; dst->temp_blks_read += add->temp_blks_read; dst->temp_blks_written += add->temp_blks_written; + dst->prefetch.hits += add->prefetch.hits; + dst->prefetch.misses += add->prefetch.misses; + dst->prefetch.expired += add->prefetch.expired; + dst->prefetch.duplicates += add->prefetch.duplicates; INSTR_TIME_ADD(dst->blk_read_time, add->blk_read_time); INSTR_TIME_ADD(dst->blk_write_time, add->blk_write_time); INSTR_TIME_ADD(dst->temp_blk_read_time, add->temp_blk_read_time); @@ -257,6 +261,10 @@ BufferUsageAccumDiff(BufferUsage *dst, dst->local_blks_written += add->local_blks_written - sub->local_blks_written; dst->temp_blks_read += add->temp_blks_read - sub->temp_blks_read; dst->temp_blks_written += add->temp_blks_written - sub->temp_blks_written; + dst->prefetch.hits += add->prefetch.hits - sub->prefetch.hits; + dst->prefetch.misses += add->prefetch.misses - sub->prefetch.misses; + dst->prefetch.expired += add->prefetch.expired - sub->prefetch.expired; + dst->prefetch.duplicates += add->prefetch.duplicates - sub->prefetch.duplicates; INSTR_TIME_ACCUM_DIFF(dst->blk_read_time, add->blk_read_time, sub->blk_read_time); INSTR_TIME_ACCUM_DIFF(dst->blk_write_time, diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c index f35df0b8bfb..05c7244292f 100644 --- a/src/backend/executor/nodeBitmapHeapscan.c +++ b/src/backend/executor/nodeBitmapHeapscan.c @@ -149,37 +149,21 @@ BitmapHeapNext(BitmapHeapScanState *node) * multiple processes to iterate jointly. */ pstate->tbmiterator = tbm_prepare_shared_iterate(tbm); -#ifdef USE_PREFETCH - if (node->prefetch_maximum > 0) - { - pstate->prefetch_iterator = - tbm_prepare_shared_iterate(tbm); - - /* - * We don't need the mutex here as we haven't yet woke up - * others. - */ - pstate->prefetch_pages = 0; - pstate->prefetch_target = -1; - } -#endif /* We have initialized the shared state so wake up others. 
 			 */
 			BitmapDoneInitializingSharedState(pstate);
 		}
+#ifdef USE_PREFETCH
+		node->prefetch_head = 0;
+		node->prefetch_pages = 0;
+		node->prefetch_target = -1;
+#endif
 
 			/* Allocate a private iterator and attach the shared state to it */
 			node->shared_tbmiterator = shared_tbmiterator =
 				tbm_attach_shared_iterate(dsa, pstate->tbmiterator);
 			node->tbmres = tbmres = NULL;
 
-#ifdef USE_PREFETCH
-			if (node->prefetch_maximum > 0)
-			{
-				node->shared_prefetch_iterator =
-					tbm_attach_shared_iterate(dsa, pstate->prefetch_iterator);
-			}
-#endif							/* USE_PREFETCH */
 		}
 		node->initialized = true;
 	}
@@ -196,9 +180,25 @@ BitmapHeapNext(BitmapHeapScanState *node)
 		if (tbmres == NULL)
 		{
 			if (!pstate)
-				node->tbmres = tbmres = tbm_iterate(tbmiterator);
+				tbmres = tbm_iterate(tbmiterator);
 			else
-				node->tbmres = tbmres = tbm_shared_iterate(shared_tbmiterator);
+			{
+				if (node->prefetch_pages != 0)
+				{
+					tbmres = (TBMIterateResult *)&node->prefetch_requests[node->prefetch_head];
+					node->prefetch_pages -= 1;
+					node->prefetch_head = (node->prefetch_head + 1) % MAX_IO_CONCURRENCY;
+				}
+				else
+					tbmres = tbm_shared_iterate(shared_tbmiterator);
+				if (tbmres)
+				{
+					/* Need to copy the result, because the iterator can be reused for prefetching and the vacant slot in the prefetch ring buffer can also be reused */
+					memcpy(&node->tbmres_copy, tbmres, offsetof(TBMIterateResult, offsets) + sizeof(OffsetNumber)*Max(tbmres->ntuples, 0));
+					tbmres = (TBMIterateResult *)&node->tbmres_copy;
+				}
+			}
+			node->tbmres = tbmres;
 			if (tbmres == NULL)
 			{
 				/* no more entries in the bitmap */
@@ -237,7 +237,6 @@ BitmapHeapNext(BitmapHeapScanState *node)
 				/* AM doesn't think this block is valid, skip */
 				continue;
 			}
-
 			if (tbmres->ntuples >= 0)
 				node->exact_pages++;
 			else
@@ -258,19 +257,8 @@ BitmapHeapNext(BitmapHeapScanState *node)
 		 * Try to prefetch at least a few pages even before we get to the
 		 * second page if we don't stop reading after the first tuple.
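
For parallel bitmap heap scans the shared prefetch iterator is gone; instead each worker pulls ahead on the single shared iterator and parks the results in a small private ring buffer, then consumes that ring before touching the shared iterator again (the enqueue side is in BitmapPrefetch() further down). The new executor-state fields come from execnodes.h changes that are not part of this excerpt, so the layout below is only an assumed sketch, sized by MAX_IO_CONCURRENCY as in the code above, and the slot type name is invented.

    /*
     * Assumed executor-state additions. Each ring slot reserves room for a
     * full page of offsets, because TBMIterateResult ends in a flexible
     * array member.
     */
    typedef struct PrefetchSlot
    {
        TBMIterateResult tbmres;
        OffsetNumber offsets[MaxHeapTuplesPerPage];
    } PrefetchSlot;

    PrefetchSlot prefetch_requests[MAX_IO_CONCURRENCY];  /* ring buffer */
    PrefetchSlot tbmres_copy;       /* stable copy handed to the scan */
    int          prefetch_head;     /* index of the oldest queued entry */
    int          prefetch_pages;    /* number of queued entries */

    /* Enqueue (BitmapPrefetch): next free slot is (head + count) % size.  */
    /* Dequeue (BitmapHeapNext): consume slot "head", then:                */
    /*     prefetch_head = (prefetch_head + 1) % MAX_IO_CONCURRENCY;       */
    /*     prefetch_pages--;                                               */
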
*/ - if (!pstate) - { - if (node->prefetch_target < node->prefetch_maximum) - node->prefetch_target++; - } - else if (pstate->prefetch_target < node->prefetch_maximum) - { - /* take spinlock while updating shared state */ - SpinLockAcquire(&pstate->mutex); - if (pstate->prefetch_target < node->prefetch_maximum) - pstate->prefetch_target++; - SpinLockRelease(&pstate->mutex); - } + if (node->prefetch_target < node->prefetch_maximum) + node->prefetch_target++; #endif /* USE_PREFETCH */ } @@ -361,54 +349,24 @@ BitmapAdjustPrefetchIterator(BitmapHeapScanState *node, TBMIterateResult *tbmres) { #ifdef USE_PREFETCH - ParallelBitmapHeapState *pstate = node->pstate; + TBMIterator *prefetch_iterator = node->prefetch_iterator; - if (pstate == NULL) - { - TBMIterator *prefetch_iterator = node->prefetch_iterator; - - if (node->prefetch_pages > 0) - { - /* The main iterator has closed the distance by one page */ - node->prefetch_pages--; - } - else if (prefetch_iterator) - { - /* Do not let the prefetch iterator get behind the main one */ - TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator); - - if (tbmpre == NULL || tbmpre->blockno != tbmres->blockno) - elog(ERROR, "prefetch and main iterators are out of sync"); - } + /* NEON: we are not using prefetch iterator for parallel plan so no need to adjust it */ + if (node->pstate != NULL) return; - } - if (node->prefetch_maximum > 0) + if (node->prefetch_pages > 0) { - TBMSharedIterator *prefetch_iterator = node->shared_prefetch_iterator; - - SpinLockAcquire(&pstate->mutex); - if (pstate->prefetch_pages > 0) - { - pstate->prefetch_pages--; - SpinLockRelease(&pstate->mutex); - } - else - { - /* Release the mutex before iterating */ - SpinLockRelease(&pstate->mutex); + /* The main iterator has closed the distance by one page */ + node->prefetch_pages--; + } + else if (prefetch_iterator) + { + /* Do not let the prefetch iterator get behind the main one */ + TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator); - /* - * In case of shared mode, we can not ensure that the current - * blockno of the main iterator and that of the prefetch iterator - * are same. It's possible that whatever blockno we are - * prefetching will be processed by another process. Therefore, - * we don't validate the blockno here as we do in non-parallel - * case. - */ - if (prefetch_iterator) - tbm_shared_iterate(prefetch_iterator); - } + if (tbmpre == NULL || tbmpre->blockno != tbmres->blockno) + elog(ERROR, "prefetch and main iterators are out of sync"); } #endif /* USE_PREFETCH */ } @@ -425,35 +383,14 @@ static inline void BitmapAdjustPrefetchTarget(BitmapHeapScanState *node) { #ifdef USE_PREFETCH - ParallelBitmapHeapState *pstate = node->pstate; - - if (pstate == NULL) - { - if (node->prefetch_target >= node->prefetch_maximum) - /* don't increase any further */ ; - else if (node->prefetch_target >= node->prefetch_maximum / 2) - node->prefetch_target = node->prefetch_maximum; - else if (node->prefetch_target > 0) - node->prefetch_target *= 2; - else - node->prefetch_target++; - return; - } - - /* Do an unlocked check first to save spinlock acquisitions. 
*/ - if (pstate->prefetch_target < node->prefetch_maximum) - { - SpinLockAcquire(&pstate->mutex); - if (pstate->prefetch_target >= node->prefetch_maximum) - /* don't increase any further */ ; - else if (pstate->prefetch_target >= node->prefetch_maximum / 2) - pstate->prefetch_target = node->prefetch_maximum; - else if (pstate->prefetch_target > 0) - pstate->prefetch_target *= 2; - else - pstate->prefetch_target++; - SpinLockRelease(&pstate->mutex); - } + if (node->prefetch_target >= node->prefetch_maximum) + /* don't increase any further */ ; + else if (node->prefetch_target >= node->prefetch_maximum / 2) + node->prefetch_target = node->prefetch_maximum; + else if (node->prefetch_target > 0) + node->prefetch_target *= 2; + else + node->prefetch_target++; #endif /* USE_PREFETCH */ } @@ -507,56 +444,47 @@ BitmapPrefetch(BitmapHeapScanState *node, TableScanDesc scan) PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno); } } - - return; } - - if (pstate->prefetch_pages < pstate->prefetch_target) + else { - TBMSharedIterator *prefetch_iterator = node->shared_prefetch_iterator; - - if (prefetch_iterator) + while (1) { - while (1) - { - TBMIterateResult *tbmpre; - bool do_prefetch = false; - bool skip_fetch; + TBMIterateResult *tbmpre; + bool do_prefetch = false; + bool skip_fetch; - /* - * Recheck under the mutex. If some other process has already - * done enough prefetching then we need not to do anything. - */ - SpinLockAcquire(&pstate->mutex); - if (pstate->prefetch_pages < pstate->prefetch_target) - { - pstate->prefetch_pages++; - do_prefetch = true; - } - SpinLockRelease(&pstate->mutex); + if (node->prefetch_pages < node->prefetch_target) + { + Assert(node->prefetch_pages < MAX_IO_CONCURRENCY); + do_prefetch = true; + } - if (!do_prefetch) - return; + if (!do_prefetch) + return; - tbmpre = tbm_shared_iterate(prefetch_iterator); - if (tbmpre == NULL) - { - /* No more pages to prefetch */ - tbm_end_shared_iterate(prefetch_iterator); - node->shared_prefetch_iterator = NULL; - break; - } + tbmpre = tbm_shared_iterate(node->shared_tbmiterator); + if (tbmpre != NULL) + { + memcpy(&node->prefetch_requests[(node->prefetch_head + node->prefetch_pages) % MAX_IO_CONCURRENCY], + tbmpre, + offsetof(TBMIterateResult, offsets) + sizeof(OffsetNumber)*Max(tbmpre->ntuples, 0)); + node->prefetch_pages += 1; + } + else + { + /* No more pages to prefetch */ + break; + } - /* As above, skip prefetch if we expect not to need page */ - skip_fetch = (node->can_skip_fetch && - (node->tbmres ? 
!node->tbmres->recheck : false) && - VM_ALL_VISIBLE(node->ss.ss_currentRelation, - tbmpre->blockno, - &node->pvmbuffer)); + /* As above, skip prefetch if we expect not to need page */ + skip_fetch = (node->can_skip_fetch && + !tbmpre->recheck && + VM_ALL_VISIBLE(node->ss.ss_currentRelation, + tbmpre->blockno, + &node->pvmbuffer)); - if (!skip_fetch) - PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno); - } + if (!skip_fetch) + PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno); } } #endif /* USE_PREFETCH */ @@ -613,8 +541,6 @@ ExecReScanBitmapHeapScan(BitmapHeapScanState *node) tbm_end_iterate(node->prefetch_iterator); if (node->shared_tbmiterator) tbm_end_shared_iterate(node->shared_tbmiterator); - if (node->shared_prefetch_iterator) - tbm_end_shared_iterate(node->shared_prefetch_iterator); if (node->tbm) tbm_free(node->tbm); if (node->vmbuffer != InvalidBuffer) @@ -627,7 +553,6 @@ ExecReScanBitmapHeapScan(BitmapHeapScanState *node) node->prefetch_iterator = NULL; node->initialized = false; node->shared_tbmiterator = NULL; - node->shared_prefetch_iterator = NULL; node->vmbuffer = InvalidBuffer; node->pvmbuffer = InvalidBuffer; @@ -683,8 +608,6 @@ ExecEndBitmapHeapScan(BitmapHeapScanState *node) tbm_free(node->tbm); if (node->shared_tbmiterator) tbm_end_shared_iterate(node->shared_tbmiterator); - if (node->shared_prefetch_iterator) - tbm_end_shared_iterate(node->shared_prefetch_iterator); if (node->vmbuffer != InvalidBuffer) ReleaseBuffer(node->vmbuffer); if (node->pvmbuffer != InvalidBuffer) @@ -739,7 +662,6 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags) scanstate->pscan_len = 0; scanstate->initialized = false; scanstate->shared_tbmiterator = NULL; - scanstate->shared_prefetch_iterator = NULL; scanstate->pstate = NULL; /* @@ -794,8 +716,7 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags) * Maximum number of prefetches for the tablespace if configured, * otherwise the current value of the effective_io_concurrency GUC. */ - scanstate->prefetch_maximum = - get_tablespace_io_concurrency(currentRelation->rd_rel->reltablespace); + scanstate->prefetch_maximum = get_tablespace_io_concurrency(currentRelation->rd_rel->reltablespace); scanstate->ss.ss_currentRelation = currentRelation; diff --git a/src/backend/main/main.c b/src/backend/main/main.c index ed11e8be7fa..ebef569d1d8 100644 --- a/src/backend/main/main.c +++ b/src/backend/main/main.c @@ -32,6 +32,7 @@ #include "bootstrap/bootstrap.h" #include "common/username.h" +#include "miscadmin.h" #include "port/atomics.h" #include "postmaster/postmaster.h" #include "storage/spin.h" @@ -51,6 +52,41 @@ static void init_locale(const char *categoryname, int category, const char *loca static void help(const char *progname); static void check_root(const char *progname); +typedef int (*MainFunc) (int argc, char *argv[]); + +static int +CallExtMain(char *library_name, char *main_func_name, int argc, char *argv[], bool load_config) +{ + MainFunc main_func; + + /* + * Perform just enough initialization that we can load external libraries + */ + InitStandaloneProcess(argv[0]); + + SetProcessingMode(InitProcessing); + + /* + * Set default values for command-line options. + */ + InitializeGUCOptions(); + + /* Acquire configuration parameters */ + if (load_config && !SelectConfigFiles(NULL, progname)) + exit(1); + + /* + * Imitate we are early in bootstrap loading shared_preload_libraries; + * neon extension sets PGC_POSTMASTER gucs requiring this. 
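
CallExtMain() above performs just enough startup work to load a shared library and jump to an entry point it exports; the main() dispatch below routes postgres --wal-redo and --sync-safekeepers through it. The real WalRedoMain()/WalProposerSync() live in the Neon extensions and are not shown here; the only contract visible in this diff is the MainFunc typedef, so an extension's entry point would look roughly like this:

    /* Must match the MainFunc typedef used by CallExtMain(). */
    PGDLLEXPORT int
    WalRedoMain(int argc, char *argv[])
    {
        /*
         * Runs in place of PostmasterMain()/PostgresSingleUserMain() when
         * the server is started with "postgres --wal-redo"; argc/argv are
         * passed through unchanged.
         */
        return 0;
    }
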
+	 */
+	process_shared_preload_libraries_in_progress = true;
+
+	main_func = load_external_function(library_name, main_func_name, true, NULL);
+
+	process_shared_preload_libraries_in_progress = false;
+
+	return main_func(argc, argv);
+}
 
 /*
  * Any Postgres server process begins execution here.
@@ -194,6 +230,10 @@ main(int argc, char *argv[])
 	else if (argc > 1 && strcmp(argv[1], "--single") == 0)
 		PostgresSingleUserMain(argc, argv,
 							   strdup(get_user_name_or_exit(progname)));
+	else if (argc > 1 && strcmp(argv[1], "--wal-redo") == 0)
+		CallExtMain("neon_walredo", "WalRedoMain", argc, argv, false);
+	else if (argc > 1 && strcmp(argv[1], "--sync-safekeepers") == 0)
+		CallExtMain("neon", "WalProposerSync", argc, argv, true);
 	else
 		PostmasterMain(argc, argv); /* the functions above should not return */
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index ef475d95a18..964c05be9dc 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -153,6 +153,9 @@ bool		enable_parallel_hash = true;
 bool		enable_partition_pruning = true;
 bool		enable_presorted_aggregate = true;
 bool		enable_async_append = true;
+bool		enable_seqscan_prefetch = true;
+bool		enable_indexscan_prefetch = true;
+bool		enable_indexonlyscan_prefetch = true;
 
 typedef struct
 {
diff --git a/src/backend/replication/logical/logical.c b/src/backend/replication/logical/logical.c
index 41243d0187a..8f56bc804ef 100644
--- a/src/backend/replication/logical/logical.c
+++ b/src/backend/replication/logical/logical.c
@@ -1236,6 +1236,8 @@ message_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn,
 	if (ctx->callbacks.message_cb == NULL)
 		return;
+	if (strncmp(prefix, "neon-file", strlen("neon-file")) == 0)
+		return;
 
 	/* Push callback + info on the error context stack */
 	state.ctx = ctx;
@@ -1551,6 +1553,8 @@ stream_message_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn,
 	/* this callback is optional */
 	if (ctx->callbacks.stream_message_cb == NULL)
 		return;
+	if (strncmp(prefix, "neon-file", strlen("neon-file")) == 0)
+		return;
 
 	/* Push callback + info on the error context stack */
 	state.ctx = ctx;
diff --git a/src/backend/replication/logical/message.c b/src/backend/replication/logical/message.c
index c5de14afc65..b9470afc68d 100644
--- a/src/backend/replication/logical/message.c
+++ b/src/backend/replication/logical/message.c
@@ -31,6 +31,8 @@
 #include "postgres.h"
 
+#include <unistd.h>
+
 #include "access/xact.h"
 #include "access/xloginsert.h"
 #include "miscadmin.h"
@@ -87,3 +89,51 @@ logicalmsg_redo(XLogReaderState *record)
 
 	/* This is only interesting for logical decoding, see decode.c. */
 }
+
+/*
+ * NEON: persist file in WAL to save it in persistent storage.
+ * If fd < 0, then remove entry from page server.
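
The wallog_file_descriptor()/wallog_file() helpers below wrap LogLogicalMessage() with a "neon-file:<path>" prefix: a non-empty payload carries the file's current contents, while an empty payload asks the page server to drop the file, and the message_cb/stream_message_cb filters added above keep these bookkeeping messages out of ordinary logical decoding output. Later hunks use the same mechanism (pgstat.c goes through wallog_file(); origin.c, snapbuild.c and slot.c call LogLogicalMessage() with the prefix directly); restated as plain calls:

    /* Persist a file into the WAL so the page server can reconstruct it. */
    wallog_file("pg_logical/replorigin_checkpoint");

    /* A negative fd produces an empty payload, i.e. "remove this file". */
    wallog_file_descriptor("pg_logical/replorigin_checkpoint", -1);
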
+ */ +void +wallog_file_descriptor(char const* path, int fd) +{ + char prefix[MAXPGPATH]; + snprintf(prefix, sizeof(prefix), "neon-file:%s", path); + if (fd < 0) + { + elog(DEBUG1, "neon: deleting contents of rewrite file %s", path); + /* unlink file */ + LogLogicalMessage(prefix, NULL, 0, false); + } + else + { + off_t size = lseek(fd, 0, SEEK_END); + char* buf; + elog(DEBUG1, "neon: writing contents of rewrite file %s, size %ld", path, (long)size); + if (size < 0) + elog(ERROR, "Failed to get size of mapping file: %m"); + buf = palloc((size_t)size); + lseek(fd, 0, SEEK_SET); + if (read(fd, buf, (size_t)size) != size) + elog(ERROR, "Failed to read mapping file: %m"); + LogLogicalMessage(prefix, buf, (size_t)size, false); + pfree(buf); + } +} + +void +wallog_file(char const* path) +{ + int fd = OpenTransientFile(path, O_RDONLY | PG_BINARY); + if (fd < 0) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", path))); + } + else + { + wallog_file_descriptor(path, fd); + CloseTransientFile(fd); + } +} diff --git a/src/backend/replication/logical/origin.c b/src/backend/replication/logical/origin.c index b0255ffd25a..c24f93b08c2 100644 --- a/src/backend/replication/logical/origin.c +++ b/src/backend/replication/logical/origin.c @@ -83,6 +83,7 @@ #include "nodes/execnodes.h" #include "pgstat.h" #include "replication/logical.h" +#include "replication/message.h" #include "replication/origin.h" #include "storage/condition_variable.h" #include "storage/copydir.h" @@ -578,10 +579,14 @@ CheckPointReplicationOrigin(void) int i; uint32 magic = REPLICATION_STATE_MAGIC; pg_crc32c crc; + char *buf; + size_t chkp_size; if (max_replication_slots == 0) return; + buf = palloc(sizeof(magic) + max_replication_slots*sizeof(ReplicationStateOnDisk) + sizeof(crc)); + INIT_CRC32C(crc); /* make sure no old temp file is remaining */ @@ -615,6 +620,9 @@ CheckPointReplicationOrigin(void) errmsg("could not write to file \"%s\": %m", tmppath))); } + memcpy(buf, &magic, sizeof magic); + chkp_size = sizeof(magic); + COMP_CRC32C(crc, &magic, sizeof(magic)); /* prevent concurrent creations/drops */ @@ -657,6 +665,8 @@ CheckPointReplicationOrigin(void) errmsg("could not write to file \"%s\": %m", tmppath))); } + memcpy(buf + chkp_size, &disk_state, sizeof(disk_state)); + chkp_size += sizeof(disk_state); COMP_CRC32C(crc, &disk_state, sizeof(disk_state)); } @@ -676,6 +686,15 @@ CheckPointReplicationOrigin(void) errmsg("could not write to file \"%s\": %m", tmppath))); } + if (chkp_size != sizeof(magic)) /* has some valid origins */ + { + memcpy(buf + chkp_size, &crc, sizeof crc); + chkp_size += sizeof(crc); + + /* NEON specific: persist snapshot in storage using logical message */ + LogLogicalMessage("neon-file:pg_logical/replorigin_checkpoint", buf, chkp_size, false); + } + pfree(buf); if (CloseTransientFile(tmpfd) != 0) ereport(PANIC, diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c index 7a7aba33e16..488e24b2b75 100644 --- a/src/backend/replication/logical/snapbuild.c +++ b/src/backend/replication/logical/snapbuild.c @@ -127,6 +127,7 @@ #include "miscadmin.h" #include "pgstat.h" #include "replication/logical.h" +#include "replication/message.h" #include "replication/reorderbuffer.h" #include "replication/snapbuild.h" #include "storage/block.h" /* debugging output */ @@ -1609,6 +1610,7 @@ SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn) int fd; char tmppath[MAXPGPATH]; char path[MAXPGPATH]; + char prefix[MAXPGPATH]; int ret; 
struct stat stat_buf; Size sz; @@ -1752,6 +1754,10 @@ SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn) (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", tmppath))); + /* NEON specific: persist snapshot in storage using logical message */ + snprintf(prefix, sizeof(prefix), "neon-file:%s", path); + LogLogicalMessage(prefix, (char*)ondisk, needed_length, false); + errno = 0; pgstat_report_wait_start(WAIT_EVENT_SNAPBUILD_WRITE); if ((write(fd, ondisk, needed_length)) != needed_length) @@ -2054,6 +2060,7 @@ CheckPointSnapBuild(void) DIR *snap_dir; struct dirent *snap_de; char path[MAXPGPATH + 21]; + char prefix[MAXPGPATH + 31]; /* * We start off with a minimum of the last redo pointer. No new @@ -2113,6 +2120,10 @@ CheckPointSnapBuild(void) { elog(DEBUG1, "removing snapbuild snapshot %s", path); + /* NEON specific: delete file from storage using logical message */ + snprintf(prefix, sizeof(prefix), "neon-file:%s", path); + LogLogicalMessage(prefix, NULL, 0, false); + /* * It's not particularly harmful, though strange, if we can't * remove the file here. Don't prevent the checkpoint from diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c index bb09c4010f8..7934d51f3c6 100644 --- a/src/backend/replication/slot.c +++ b/src/backend/replication/slot.c @@ -47,6 +47,7 @@ #include "miscadmin.h" #include "pgstat.h" #include "replication/slot.h" +#include "replication/message.h" #include "storage/fd.h" #include "storage/ipc.h" #include "storage/proc.h" @@ -685,6 +686,15 @@ ReplicationSlotDropPtr(ReplicationSlot *slot) sprintf(path, "pg_replslot/%s", NameStr(slot->data.name)); sprintf(tmppath, "pg_replslot/%s.tmp", NameStr(slot->data.name)); + if (SlotIsLogical(slot)) + { + /* NEON specific: delete slot from storage using logical message */ + char prefix[MAXPGPATH]; + snprintf(prefix, sizeof(prefix), "neon-file:%s/state", path); + elog(LOG, "Drop replication slot %s", path); + XLogFlush(LogLogicalMessage(prefix, NULL, 0, false)); + } + /* * Rename the slot directory on disk, so that we'll no longer recognize * this as a valid slot. Note that if this fails, we've got to mark the @@ -1795,6 +1805,15 @@ SaveSlotToPath(ReplicationSlot *slot, const char *dir, int elevel) ReplicationSlotOnDiskChecksummedSize); FIN_CRC32C(cp.checksum); + if (SlotIsLogical(slot) && cp.slotdata.restart_lsn != InvalidXLogRecPtr) + { + /* NEON specific: persist slot in storage using logical message */ + char prefix[MAXPGPATH]; + snprintf(prefix, sizeof(prefix), "neon-file:%s", path); + elog(LOG, "Save replication slot at %s restart_lsn=%X/%X", path, LSN_FORMAT_ARGS(cp.slotdata.restart_lsn)); + XLogFlush(LogLogicalMessage(prefix, (char*)&cp, sizeof cp, false)); + } + errno = 0; pgstat_report_wait_start(WAIT_EVENT_REPLICATION_SLOT_WRITE); if ((write(fd, &cp, sizeof(cp))) != sizeof(cp)) diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c index feff7094351..63cd3d44d77 100644 --- a/src/backend/replication/walreceiver.c +++ b/src/backend/replication/walreceiver.c @@ -523,6 +523,13 @@ WalReceiverMain(void) if (endofwal) break; + /* + * Update WAL statistics, which are produced inside + * issue_xlog_fsync function. This is useful for counting + * WAL flushes, by querying pg_stat_wal. + */ + pgstat_report_wal(true); + /* Find the soonest wakeup time, to limit our nap. 
*/ nextWakeup = TIMESTAMP_INFINITY; for (int i = 0; i < NUM_WALRCV_WAKEUPS; ++i) diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 4c53de08b9b..ebb9ddc9b9b 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -54,6 +54,7 @@ #include "access/transam.h" #include "access/xact.h" #include "access/xlog_internal.h" +#include "access/xloginsert.h" #include "access/xlogreader.h" #include "access/xlogrecovery.h" #include "access/xlogutils.h" @@ -130,6 +131,11 @@ bool log_replication_commands = false; */ bool wake_wal_senders = false; +/* + * Backpressure hook, detecting how much we should delay. + */ +uint64 (*delay_backend_us)(void) = NULL; + /* * xlogreader used for replication. Note that a WAL sender doing physical * replication does not need xlogreader to read WAL, but it needs one to @@ -253,8 +259,6 @@ static void WalSndWriteData(LogicalDecodingContext *ctx, XLogRecPtr lsn, Transac static void WalSndUpdateProgress(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid, bool skipped_xact); static XLogRecPtr WalSndWaitForWal(XLogRecPtr loc); -static void LagTrackerWrite(XLogRecPtr lsn, TimestampTz local_flush_time); -static TimeOffset LagTrackerRead(int head, XLogRecPtr lsn, TimestampTz now); static bool TransactionIdInRecentPast(TransactionId xid, uint32 epoch); static void WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo, @@ -2031,7 +2035,7 @@ ProcessStandbyMessage(void) /* * Remember that a walreceiver just confirmed receipt of lsn `lsn`. */ -static void +void PhysicalConfirmReceivedLocation(XLogRecPtr lsn) { bool changed = false; @@ -2070,21 +2074,34 @@ ProcessStandbyReplyMessage(void) flushPtr, applyPtr; bool replyRequested; - TimeOffset writeLag, - flushLag, - applyLag; - bool clearLagTimes; - TimestampTz now; TimestampTz replyTime; - static bool fullyAppliedLastTime = false; - /* the caller already consumed the msgtype byte */ writePtr = pq_getmsgint64(&reply_message); flushPtr = pq_getmsgint64(&reply_message); applyPtr = pq_getmsgint64(&reply_message); replyTime = pq_getmsgint64(&reply_message); replyRequested = pq_getmsgbyte(&reply_message); + ProcessStandbyReply(writePtr, + flushPtr, + applyPtr, + replyTime, + replyRequested); +} + +void +ProcessStandbyReply(XLogRecPtr writePtr, + XLogRecPtr flushPtr, + XLogRecPtr applyPtr, + TimestampTz replyTime, + bool replyRequested) +{ + TimeOffset writeLag, + flushLag, + applyLag; + bool clearLagTimes; + TimestampTz now; + static bool fullyAppliedLastTime = false; if (message_level_is_interesting(DEBUG2)) { @@ -2267,7 +2284,16 @@ ProcessStandbyHSFeedbackMessage(void) feedbackEpoch = pq_getmsgint(&reply_message, 4); feedbackCatalogXmin = pq_getmsgint(&reply_message, 4); feedbackCatalogEpoch = pq_getmsgint(&reply_message, 4); + ProcessStandbyHSFeedback(replyTime, feedbackXmin, feedbackEpoch, feedbackCatalogXmin, feedbackCatalogEpoch); +} +void +ProcessStandbyHSFeedback(TimestampTz replyTime, + TransactionId feedbackXmin, + uint32 feedbackEpoch, + TransactionId feedbackCatalogXmin, + uint32 feedbackCatalogEpoch) +{ if (message_level_is_interesting(DEBUG2)) { char *replyTimeStr; @@ -2952,9 +2978,12 @@ XLogSendPhysical(void) /* * OK to read and send the slice. 
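
walsender.c now exposes delay_backend_us (declared near the top of this file's hunks), a hook through which an extension can tell ordinary backends how long to stall when the WAL they generate runs too far ahead of what the safekeepers or page server have durably consumed; the WaitEventTimeout enum gains WAIT_EVENT_BACK_PRESSURE further down for exactly this wait. The code that actually sleeps on the hook is not in this excerpt, so the consumer side below is only an assumed sketch, and the function names are illustrative.

    /* Extension side: report the stall time in microseconds (0 = no delay). */
    static uint64
    backpressure_lag_us(void)
    {
        /* e.g. compare the last written LSN with the LSN confirmed remotely */
        return 0;
    }

    void
    _PG_init(void)
    {
        delay_backend_us = backpressure_lag_us;
    }

    /* Assumed consumer somewhere in the WAL-producing path (not in this diff). */
    static void
    maybe_throttle(void)
    {
        uint64  delay;

        while (delay_backend_us != NULL && (delay = delay_backend_us()) > 0)
            WaitLatch(MyLatch, WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
                      delay / 1000, WAIT_EVENT_BACK_PRESSURE);
    }
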
*/ - resetStringInfo(&output_message); - pq_sendbyte(&output_message, 'w'); + if (output_message.data) + resetStringInfo(&output_message); + else + initStringInfo(&output_message); + pq_sendbyte(&output_message, 'w'); pq_sendint64(&output_message, startptr); /* dataStart */ pq_sendint64(&output_message, SendRqstPtr); /* walEnd */ pq_sendint64(&output_message, 0); /* sendtime, filled in last */ @@ -3140,8 +3169,8 @@ WalSndDone(WalSndSendDataCallback send_data) * flush location if valid, write otherwise. Tools like pg_receivewal will * usually (unless in synchronous mode) return an invalid flush location. */ - replicatedPtr = XLogRecPtrIsInvalid(MyWalSnd->flush) ? - MyWalSnd->write : MyWalSnd->flush; + // XXX Zenith uses flush_lsn to pass extra payload, so use write_lsn here + replicatedPtr = MyWalSnd->write; if (WalSndCaughtUp && sentPtr == replicatedPtr && !pq_is_send_pending()) @@ -3753,7 +3782,7 @@ WalSndKeepaliveIfNecessary(void) * eventually reported to have been written, flushed and applied by the * standby in a reply message. */ -static void +void LagTrackerWrite(XLogRecPtr lsn, TimestampTz local_flush_time) { bool buffer_full; @@ -3818,7 +3847,7 @@ LagTrackerWrite(XLogRecPtr lsn, TimestampTz local_flush_time) * Return -1 if no new sample data is available, and otherwise the elapsed * time in microseconds. */ -static TimeOffset +TimeOffset LagTrackerRead(int head, XLogRecPtr lsn, TimestampTz now) { TimestampTz time = 0; diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c index 0057443f0c6..f2b4c0e4945 100644 --- a/src/backend/storage/buffer/buf_init.c +++ b/src/backend/storage/buffer/buf_init.c @@ -24,6 +24,14 @@ ConditionVariableMinimallyPadded *BufferIOCVArray; WritebackContext BackendWritebackContext; CkptSortItem *CkptBufferIds; +/* + * Buffer with target WAL redo page. + * We must not evict this page from the buffer pool, but we cannot just keep it pinned because + * some WAL redo functions expect the page to not be pinned. So we have a special check in + * localbuf.c to prevent this buffer from being evicted. + */ +Buffer wal_redo_buffer; +bool am_wal_redo_postgres = false; /* * Data Structures: diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index e066a3f888f..4ecfb5db8c2 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -52,6 +52,7 @@ #include "storage/proc.h" #include "storage/smgr.h" #include "storage/standby.h" +#include "replication/walsender.h" #include "utils/memdebug.h" #include "utils/ps_status.h" #include "utils/rel.h" @@ -160,6 +161,10 @@ int checkpoint_flush_after = DEFAULT_CHECKPOINT_FLUSH_AFTER; int bgwriter_flush_after = DEFAULT_BGWRITER_FLUSH_AFTER; int backend_flush_after = DEFAULT_BACKEND_FLUSH_AFTER; +/* Evict unpinned pages (for better test coverage) */ +bool zenith_test_evict = false; + + /* local state for LockBufferForCleanup */ static BufferDesc *PinCountWaitBuf = NULL; @@ -798,7 +803,8 @@ ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forkNum, { bool hit; - SMgrRelation smgr = smgropen(rlocator, InvalidBackendId); + SMgrRelation smgr = smgropen(rlocator, InvalidBackendId, + RELPERSISTENCE_PERMANENT); return ReadBuffer_common(smgr, permanent ? 
RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED, forkNum, blockNum, @@ -998,7 +1004,11 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, bool found; IOContext io_context; IOObject io_object; - bool isLocalBuf = SmgrIsTemp(smgr); + /* + * wal_redo postgres is working in single user mode, we do not need to + * synchronize access to shared buffer, so let's use local buffers instead. + */ + bool isLocalBuf = SmgrIsTemp(smgr) || am_wal_redo_postgres; *hit = false; @@ -1993,6 +2003,10 @@ ExtendBufferedRelShared(BufferManagerRelation bmr, buffers[i] = BufferDescriptorGetBuffer(existing_hdr); buf_block = BufHdrGetBlock(existing_hdr); + /* + * NEON: In earlier Neon PostgreSQL versions, we zeroed the pages + * and DEBUG1-logged this instead. + */ if (valid && !PageIsNew((Page) buf_block)) ereport(ERROR, (errmsg("unexpected data beyond EOF in block %u of relation %s", @@ -2455,6 +2469,32 @@ UnpinBuffer(BufferDesc *buf) UnlockBufHdr(buf, buf_state); } ForgetPrivateRefCountEntry(ref); + + if (zenith_test_evict && !InRecovery) + { + buf_state = LockBufHdr(buf); + if (BUF_STATE_GET_REFCOUNT(buf_state) == 0) + { + if (buf_state & BM_DIRTY) + { + ReservePrivateRefCountEntry(); + PinBuffer_Locked(buf); + if (LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf), + LW_SHARED)) + { + FlushOneBuffer(b); + LWLockRelease(BufferDescriptorGetContentLock(buf)); + } + UnpinBuffer(buf); + } + else + { + InvalidateBuffer(buf); + } + } + else + UnlockBufHdr(buf, buf_state); + } } } @@ -3373,7 +3413,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, /* Find smgr relation for buffer */ if (reln == NULL) - reln = smgropen(BufTagGetRelFileLocator(&buf->tag), InvalidBackendId); + reln = smgropen(BufTagGetRelFileLocator(&buf->tag), InvalidBackendId, 0); TRACE_POSTGRESQL_BUFFER_FLUSH_START(BufTagGetForkNum(&buf->tag), buf->tag.blockNum, @@ -4277,7 +4317,8 @@ RelationCopyStorageUsingBuffer(RelFileLocator srclocator, use_wal = XLogIsNeeded() && (permanent || forkNum == INIT_FORKNUM); /* Get number of blocks in the source relation. */ - nblocks = smgrnblocks(smgropen(srclocator, InvalidBackendId), + nblocks = smgrnblocks(smgropen(srclocator, InvalidBackendId, + permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED), forkNum); /* Nothing to copy; just return. */ @@ -4289,8 +4330,9 @@ RelationCopyStorageUsingBuffer(RelFileLocator srclocator, * relation before starting to copy block by block. */ memset(buf.data, 0, BLCKSZ); - smgrextend(smgropen(dstlocator, InvalidBackendId), forkNum, nblocks - 1, - buf.data, true); + smgrextend(smgropen(dstlocator, InvalidBackendId, + permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED), + forkNum, nblocks - 1, buf.data, true); /* This is a bulk operation, so use buffer access strategies. 
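
The buf_init.c/bufmgr.c/localbuf.c changes in this area carve out a special mode for Neon's single-user WAL-redo process: am_wal_redo_postgres redirects reads into local buffers (no shared memory needed), and wal_redo_buffer remembers the page currently being replayed so GetLocalVictimBuffer() will not evict it even though it is not kept pinned. How the redo loop in the neon_walredo extension drives this is not part of the diff; the fragment below is only an assumed outline, with rlocator/forknum/blkno standing in for the request parameters.

    /* At startup of the wal-redo process (assumed; not part of this diff): */
    am_wal_redo_postgres = true;    /* every ReadBuffer goes to local buffers */

    /* For each redo request: read the target page and remember it. */
    wal_redo_buffer = ReadBufferWithoutRelcache(rlocator, forknum, blkno,
                                                RBM_ZERO_ON_ERROR, NULL, true);
    ReleaseBuffer(wal_redo_buffer); /* unpinned, but protected from eviction
                                     * by the wal_redo_buffer check in localbuf.c */

    /* ... apply the record with the rmgr redo routine, read the image back ... */
    wal_redo_buffer = InvalidBuffer;
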
*/ bstrategy_src = GetAccessStrategy(BAS_BULKREAD); @@ -4371,9 +4413,9 @@ CreateAndCopyRelationData(RelFileLocator src_rlocator, for (ForkNumber forkNum = MAIN_FORKNUM + 1; forkNum <= MAX_FORKNUM; forkNum++) { - if (smgrexists(smgropen(src_rlocator, InvalidBackendId), forkNum)) + if (smgrexists(smgropen(src_rlocator, InvalidBackendId, relpersistence), forkNum)) { - smgrcreate(smgropen(dst_rlocator, InvalidBackendId), forkNum, false); + smgrcreate(smgropen(dst_rlocator, InvalidBackendId, relpersistence), forkNum, false); /* * WAL log creation if the relation is persistent, or this is the @@ -5562,7 +5604,7 @@ IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context) i += ahead; /* and finally tell the kernel to write the data to storage */ - reln = smgropen(currlocator, InvalidBackendId); + reln = smgropen(currlocator, InvalidBackendId, 0); smgrwriteback(reln, BufTagGetForkNum(&tag), tag.blockNum, nblocks); } diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index 55953c3e3e8..b6247ea7bc4 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -18,6 +18,7 @@ #include "access/parallel.h" #include "catalog/catalog.h" #include "executor/instrument.h" +#include "miscadmin.h" #include "pgstat.h" #include "storage/buf_internals.h" #include "storage/bufmgr.h" @@ -25,6 +26,8 @@ #include "utils/memutils.h" #include "utils/resowner_private.h" +/* NEON: prevent eviction of the buffer of target page */ +extern Buffer wal_redo_buffer; /*#define LBDEBUG*/ @@ -198,6 +201,12 @@ GetLocalVictimBuffer(void) if (LocalRefCount[victim_bufid] == 0) { + if (-victim_bufid - 1 == wal_redo_buffer) + { + /* ZENITH: Prevent eviction of the buffer with target wal redo page */ + continue; + } + buf_state = pg_atomic_read_u32(&bufHdr->state); if (BUF_STATE_GET_USAGECOUNT(buf_state) > 0) @@ -239,7 +248,10 @@ GetLocalVictimBuffer(void) Page localpage = (char *) LocalBufHdrGetBlock(bufHdr); /* Find smgr relation for buffer */ - oreln = smgropen(BufTagGetRelFileLocator(&bufHdr->tag), MyBackendId); + if (am_wal_redo_postgres && MyBackendId == InvalidBackendId) + oreln = smgropen(BufTagGetRelFileLocator(&bufHdr->tag), MyBackendId, RELPERSISTENCE_PERMANENT); + else + oreln = smgropen(BufTagGetRelFileLocator(&bufHdr->tag), MyBackendId, RELPERSISTENCE_TEMP); PageSetChecksumInplace(localpage, bufHdr->tag.blockNum); diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt index 6c7cf6c2956..b4652c33ff6 100644 --- a/src/backend/storage/lmgr/lwlocknames.txt +++ b/src/backend/storage/lmgr/lwlocknames.txt @@ -53,3 +53,4 @@ XactTruncationLock 44 # 45 was XactTruncationLock until removal of BackendRandomLock WrapLimitsVacuumLock 46 NotifyQueueTailLock 47 +LastWrittenLsnLock 48 diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index fdecbad1709..cad698ec65d 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -1257,7 +1257,7 @@ DropRelationFiles(RelFileLocator *delrels, int ndelrels, bool isRedo) srels = palloc(sizeof(SMgrRelation) * ndelrels); for (i = 0; i < ndelrels; i++) { - SMgrRelation srel = smgropen(delrels[i], InvalidBackendId); + SMgrRelation srel = smgropen(delrels[i], InvalidBackendId, 0); if (isRedo) { @@ -1541,7 +1541,7 @@ _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) int mdsyncfiletag(const FileTag *ftag, char *path) { - SMgrRelation reln = smgropen(ftag->rlocator, InvalidBackendId); + SMgrRelation reln = 
smgropen(ftag->rlocator, InvalidBackendId, 0); File file; instr_time io_start; bool need_to_close; diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index 5d0f3d515c3..69eb94587c6 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -18,6 +18,7 @@ #include "postgres.h" #include "access/xlogutils.h" +#include "catalog/pg_tablespace.h" #include "lib/ilist.h" #include "storage/bufmgr.h" #include "storage/fd.h" @@ -28,69 +29,26 @@ #include "utils/inval.h" -/* - * This struct of function pointers defines the API between smgr.c and - * any individual storage manager module. Note that smgr subfunctions are - * generally expected to report problems via elog(ERROR). An exception is - * that smgr_unlink should use elog(WARNING), rather than erroring out, - * because we normally unlink relations during post-commit/abort cleanup, - * and so it's too late to raise an error. Also, various conditions that - * would normally be errors should be allowed during bootstrap and/or WAL - * recovery --- see comments in md.c for details. - */ -typedef struct f_smgr -{ - void (*smgr_init) (void); /* may be NULL */ - void (*smgr_shutdown) (void); /* may be NULL */ - void (*smgr_open) (SMgrRelation reln); - void (*smgr_close) (SMgrRelation reln, ForkNumber forknum); - void (*smgr_create) (SMgrRelation reln, ForkNumber forknum, - bool isRedo); - bool (*smgr_exists) (SMgrRelation reln, ForkNumber forknum); - void (*smgr_unlink) (RelFileLocatorBackend rlocator, ForkNumber forknum, - bool isRedo); - void (*smgr_extend) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, const void *buffer, bool skipFsync); - void (*smgr_zeroextend) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, int nblocks, bool skipFsync); - bool (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum); - void (*smgr_read) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, void *buffer); - void (*smgr_write) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, const void *buffer, bool skipFsync); - void (*smgr_writeback) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, BlockNumber nblocks); - BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum); - void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum, - BlockNumber nblocks); - void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum); -} f_smgr; - -static const f_smgr smgrsw[] = { +static const f_smgr smgr_md = { /* magnetic disk */ - { - .smgr_init = mdinit, - .smgr_shutdown = NULL, - .smgr_open = mdopen, - .smgr_close = mdclose, - .smgr_create = mdcreate, - .smgr_exists = mdexists, - .smgr_unlink = mdunlink, - .smgr_extend = mdextend, - .smgr_zeroextend = mdzeroextend, - .smgr_prefetch = mdprefetch, - .smgr_read = mdread, - .smgr_write = mdwrite, - .smgr_writeback = mdwriteback, - .smgr_nblocks = mdnblocks, - .smgr_truncate = mdtruncate, - .smgr_immedsync = mdimmedsync, - } + .smgr_init = mdinit, + .smgr_shutdown = NULL, + .smgr_open = mdopen, + .smgr_close = mdclose, + .smgr_create = mdcreate, + .smgr_exists = mdexists, + .smgr_unlink = mdunlink, + .smgr_extend = mdextend, + .smgr_zeroextend = mdzeroextend, + .smgr_prefetch = mdprefetch, + .smgr_read = mdread, + .smgr_write = mdwrite, + .smgr_writeback = mdwriteback, + .smgr_nblocks = mdnblocks, + .smgr_truncate = mdtruncate, + .smgr_immedsync = mdimmedsync, }; -static const int NSmgr = lengthof(smgrsw); - /* * Each backend has a hashtable that stores all extant 
SMgrRelation objects. * In addition, "unowned" SMgrRelation objects are chained together in a list. @@ -100,7 +58,7 @@ static HTAB *SMgrRelationHash = NULL; static dlist_head unowned_relns; /* local function prototypes */ -static void smgrshutdown(int code, Datum arg); +/* static void smgrshutdown(int code, Datum arg); */ /* @@ -114,40 +72,73 @@ static void smgrshutdown(int code, Datum arg); void smgrinit(void) { - int i; + (*smgr_init_hook)(); +} - for (i = 0; i < NSmgr; i++) - { - if (smgrsw[i].smgr_init) - smgrsw[i].smgr_init(); - } +/* Hook for plugins to get control in smgr */ +smgr_hook_type smgr_hook = NULL; +smgr_init_hook_type smgr_init_hook = smgr_init_standard; +smgr_shutdown_hook_type smgr_shutdown_hook = NULL; - /* register the shutdown proc */ - on_proc_exit(smgrshutdown, 0); +const f_smgr * +smgr_standard(BackendId backend, RelFileLocator rlocator) +{ + return &smgr_md; } -/* - * on_proc_exit hook for smgr cleanup during backend shutdown - */ -static void -smgrshutdown(int code, Datum arg) +///* +// * TODO: NEON v15- REMOVED this? +// * on_proc_exit hook for smgr cleanup during backend shutdown +// */ +//static void +//smgrshutdown(int code, Datum arg) +//{ +// int i; +// +// for (i = 0; i < NSmgr; i++) +// { +// if (smgrsw[i].smgr_shutdown) +// smgrsw[i].smgr_shutdown(); +// } +//} + +void +smgr_init_standard(void) { - int i; + mdinit(); +} - for (i = 0; i < NSmgr; i++) - { - if (smgrsw[i].smgr_shutdown) - smgrsw[i].smgr_shutdown(); - } +void +smgr_shutdown_standard(void) +{ + /* no-op */ +} + +const f_smgr * +smgr(BackendId backend, RelFileLocator rlocator) +{ + const f_smgr *result; + + if (smgr_hook) + result = (*smgr_hook)(backend, rlocator); + else + result = smgr_standard(backend, rlocator); + + return result; } /* * smgropen() -- Return an SMgrRelation object, creating it if need be. * * This does not attempt to actually open the underlying file. + * + * The caller should pass the value of pg_class.relpersistence, if they know + * it, or 0 if unknown. Some operations, like smgrwrite() and smgrunlink() + * are allowed when relpersistence is not known, but others like smgrread() + * require it. */ SMgrRelation -smgropen(RelFileLocator rlocator, BackendId backend) +smgropen(RelFileLocator rlocator, BackendId backend, char relpersistence) { RelFileLocatorBackend brlocator; SMgrRelation reln; @@ -178,16 +169,34 @@ smgropen(RelFileLocator rlocator, BackendId backend) /* hash_search already filled in the lookup key */ reln->smgr_owner = NULL; reln->smgr_targblock = InvalidBlockNumber; + reln->smgr_relpersistence = relpersistence; + for (int i = 0; i <= MAX_FORKNUM; ++i) reln->smgr_cached_nblocks[i] = InvalidBlockNumber; - reln->smgr_which = 0; /* we only have md.c at present */ + + reln->smgr = smgr(backend, rlocator); /* implementation-specific initialization */ - smgrsw[reln->smgr_which].smgr_open(reln); + (*reln->smgr).smgr_open(reln); /* it has no owner yet */ dlist_push_tail(&unowned_relns, &reln->node); } + else + { + /* + * If the caller passed a valid 'relpersistence', and it was unknown + * before, update it. 
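
With the f_smgr callback table and the smgr_hook/smgr_init_hook indirection above, a storage manager can now be supplied by an extension instead of being hard-wired to md.c. The skeleton below is only a registration sketch: the dispatch policy, the callback names and the omitted prototypes are illustrative, and the real Neon smgr does considerably more.

    /* Prototypes of the neon_* callbacks omitted for brevity. */
    static const f_smgr neon_smgr = {
        .smgr_init = neon_smgr_init,
        .smgr_open = neon_open,
        .smgr_close = neon_close,
        .smgr_exists = neon_exists,
        .smgr_read = neon_read,
        .smgr_write = neon_write,
        .smgr_nblocks = neon_nblocks,
        /* ... remaining callbacks, mirroring smgr_md ... */
    };

    static const f_smgr *
    neon_smgr_selector(BackendId backend, RelFileLocator rlocator)
    {
        /* illustrative policy: keep md.c for backend-local (temp) relations */
        if (backend != InvalidBackendId)
            return smgr_standard(backend, rlocator);
        return &neon_smgr;
    }

    void
    _PG_init(void)
    {
        smgr_hook = neon_smgr_selector;
        smgr_init_hook = neon_smgr_init;    /* should also call smgr_init_standard() */
    }

smgropen() callers now also pass pg_class.relpersistence, or 0 when it is unknown; as the code continuing below shows, the value is remembered on the first open and cross-checked on later ones.
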
+ */ + if (reln->smgr_relpersistence == 0) + reln->smgr_relpersistence = relpersistence; + else + { + if (!(relpersistence == 0 || reln->smgr_relpersistence == relpersistence)) + elog(ERROR, "relpersistence mismatch: smgropen %c vs SmgrRelation %c", + relpersistence, reln->smgr_relpersistence); + } + } return reln; } @@ -250,7 +259,7 @@ smgrclearowner(SMgrRelation *owner, SMgrRelation reln) bool smgrexists(SMgrRelation reln, ForkNumber forknum) { - return smgrsw[reln->smgr_which].smgr_exists(reln, forknum); + return (*reln->smgr).smgr_exists(reln, forknum); } /* @@ -263,7 +272,7 @@ smgrclose(SMgrRelation reln) ForkNumber forknum; for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) - smgrsw[reln->smgr_which].smgr_close(reln, forknum); + (*reln->smgr).smgr_close(reln, forknum); owner = reln->smgr_owner; @@ -293,7 +302,7 @@ smgrrelease(SMgrRelation reln) { for (ForkNumber forknum = 0; forknum <= MAX_FORKNUM; forknum++) { - smgrsw[reln->smgr_which].smgr_close(reln, forknum); + (*reln->smgr).smgr_close(reln, forknum); reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber; } reln->smgr_targblock = InvalidBlockNumber; @@ -373,7 +382,7 @@ smgrcloserellocator(RelFileLocatorBackend rlocator) void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo) { - smgrsw[reln->smgr_which].smgr_create(reln, forknum, isRedo); + (*reln->smgr).smgr_create(reln, forknum, isRedo); } /* @@ -401,12 +410,10 @@ smgrdosyncall(SMgrRelation *rels, int nrels) */ for (i = 0; i < nrels; i++) { - int which = rels[i]->smgr_which; - for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) { - if (smgrsw[which].smgr_exists(rels[i], forknum)) - smgrsw[which].smgr_immedsync(rels[i], forknum); + if ((*rels[i]->smgr).smgr_exists(rels[i], forknum)) + (*rels[i]->smgr).smgr_immedsync(rels[i], forknum); } } } @@ -445,13 +452,12 @@ smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo) for (i = 0; i < nrels; i++) { RelFileLocatorBackend rlocator = rels[i]->smgr_rlocator; - int which = rels[i]->smgr_which; rlocators[i] = rlocator; /* Close the forks at smgr level */ for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) - smgrsw[which].smgr_close(rels[i], forknum); + (*rels[i]->smgr).smgr_close(rels[i], forknum); } /* @@ -475,10 +481,8 @@ smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo) for (i = 0; i < nrels; i++) { - int which = rels[i]->smgr_which; - for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) - smgrsw[which].smgr_unlink(rlocators[i], forknum, isRedo); + (*rels[i]->smgr).smgr_unlink(rlocators[i], forknum, isRedo); } pfree(rlocators); @@ -498,8 +502,7 @@ void smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync) { - smgrsw[reln->smgr_which].smgr_extend(reln, forknum, blocknum, - buffer, skipFsync); + (*reln->smgr).smgr_extend(reln, forknum, blocknum, buffer, skipFsync); /* * Normally we expect this to increase nblocks by one, but if the cached @@ -523,8 +526,7 @@ void smgrzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks, bool skipFsync) { - smgrsw[reln->smgr_which].smgr_zeroextend(reln, forknum, blocknum, - nblocks, skipFsync); + (*reln->smgr).smgr_zeroextend(reln, forknum, blocknum, nblocks, skipFsync); /* * Normally we expect this to increase the fork size by nblocks, but if @@ -547,7 +549,7 @@ smgrzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, bool smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) { - return smgrsw[reln->smgr_which].smgr_prefetch(reln, forknum, 
blocknum); + return (*reln->smgr).smgr_prefetch(reln, forknum, blocknum); } /* @@ -562,7 +564,7 @@ void smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void *buffer) { - smgrsw[reln->smgr_which].smgr_read(reln, forknum, blocknum, buffer); + (*reln->smgr).smgr_read(reln, forknum, blocknum, buffer); } /* @@ -584,8 +586,8 @@ void smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync) { - smgrsw[reln->smgr_which].smgr_write(reln, forknum, blocknum, - buffer, skipFsync); + (*reln->smgr).smgr_write(reln, forknum, blocknum, + buffer, skipFsync); } @@ -597,8 +599,7 @@ void smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks) { - smgrsw[reln->smgr_which].smgr_writeback(reln, forknum, blocknum, - nblocks); + (*reln->smgr).smgr_writeback(reln, forknum, blocknum, nblocks); } /* @@ -615,7 +616,7 @@ smgrnblocks(SMgrRelation reln, ForkNumber forknum) if (result != InvalidBlockNumber) return result; - result = smgrsw[reln->smgr_which].smgr_nblocks(reln, forknum); + result = (*reln->smgr).smgr_nblocks(reln, forknum); reln->smgr_cached_nblocks[forknum] = result; @@ -681,7 +682,7 @@ smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nb /* Make the cached size is invalid if we encounter an error. */ reln->smgr_cached_nblocks[forknum[i]] = InvalidBlockNumber; - smgrsw[reln->smgr_which].smgr_truncate(reln, forknum[i], nblocks[i]); + (*reln->smgr).smgr_truncate(reln, forknum[i], nblocks[i]); /* * We might as well update the local smgr_cached_nblocks values. The @@ -720,9 +721,50 @@ smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nb void smgrimmedsync(SMgrRelation reln, ForkNumber forknum) { - smgrsw[reln->smgr_which].smgr_immedsync(reln, forknum); + (*reln->smgr).smgr_immedsync(reln, forknum); +} + +/* + * Neon-added functions to mark the phases of an unlogged index build. + */ +void +smgr_start_unlogged_build(SMgrRelation reln) +{ + if ((*reln->smgr).smgr_start_unlogged_build) + (*reln->smgr).smgr_start_unlogged_build(reln); +} + +void +smgr_finish_unlogged_build_phase_1(SMgrRelation reln) +{ + if ((*reln->smgr).smgr_finish_unlogged_build_phase_1) + (*reln->smgr).smgr_finish_unlogged_build_phase_1(reln); +} + +void +smgr_end_unlogged_build(SMgrRelation reln) +{ + if ((*reln->smgr).smgr_end_unlogged_build) + (*reln->smgr).smgr_end_unlogged_build(reln); } +/* + * NEON: we do not want to include large pg_xact/multixact files in basebackup and prefer + * to download them on demand to reduce startup time. + * If SLRU segment is not found, we try to download it from page server + * + * This function returns number of blocks in segment. Usually it should be SLRU_PAGES_PER_SEGMENT but in case + * of partial segment, it can be smaller. Zero value means that segment doesn't exist. + * From Postgres point of view empty segment is the same as absent segment. + */ +int +smgr_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buffer) +{ + return (*reln->smgr).smgr_read_slru_segment ? 
(*reln->smgr).smgr_read_slru_segment(reln, path, segno, buffer) : 0; +} + + + /* * AtEOXact_SMgr * diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 36cc99ec9cf..a0c3474f291 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -169,6 +169,8 @@ static ProcSignalReason RecoveryConflictReason; static MemoryContext row_description_context = NULL; static StringInfoData row_description_buf; +process_interrupts_callback_t ProcessInterruptsCallback; + /* ---------------------------------------------------------------- * decls for routines only used in this file * ---------------------------------------------------------------- @@ -3200,6 +3202,7 @@ ProcessInterrupts(void) return; InterruptPending = false; +retry: if (ProcDiePending) { ProcDiePending = false; @@ -3447,6 +3450,13 @@ ProcessInterrupts(void) if (ParallelApplyMessagePending) HandleParallelApplyMessages(); + + /* Call registered callback if any */ + if (ProcessInterruptsCallback) + { + if (ProcessInterruptsCallback()) + goto retry; + } } /* diff --git a/src/backend/utils/activity/pgstat.c b/src/backend/utils/activity/pgstat.c index a220cb876d8..e25419e116d 100644 --- a/src/backend/utils/activity/pgstat.c +++ b/src/backend/utils/activity/pgstat.c @@ -98,6 +98,7 @@ #include "lib/dshash.h" #include "pgstat.h" #include "port/atomics.h" +#include "replication/message.h" #include "storage/fd.h" #include "storage/ipc.h" #include "storage/lwlock.h" @@ -1464,7 +1465,8 @@ pgstat_write_statsfile(void) errmsg("could not rename temporary statistics file \"%s\" to \"%s\": %m", tmpfile, statfile))); unlink(tmpfile); - } + } else if (XLogInsertAllowed()) + wallog_file(statfile); } /* helpers for pgstat_read_statsfile() */ diff --git a/src/backend/utils/activity/wait_event.c b/src/backend/utils/activity/wait_event.c index 05b71f9bc2a..2f5b18fe95e 100644 --- a/src/backend/utils/activity/wait_event.c +++ b/src/backend/utils/activity/wait_event.c @@ -515,6 +515,9 @@ pgstat_get_wait_timeout(WaitEventTimeout w) case WAIT_EVENT_VACUUM_TRUNCATE: event_name = "VacuumTruncate"; break; + case WAIT_EVENT_BACK_PRESSURE: + event_name = "BackPressure"; + break; /* no default case, so that compiler will warn */ } diff --git a/src/backend/utils/adt/dbsize.c b/src/backend/utils/adt/dbsize.c index e5c0f1c45b6..39f7107ed9f 100644 --- a/src/backend/utils/adt/dbsize.c +++ b/src/backend/utils/adt/dbsize.c @@ -24,6 +24,7 @@ #include "commands/tablespace.h" #include "miscadmin.h" #include "storage/fd.h" +#include "storage/smgr.h" #include "utils/acl.h" #include "utils/builtins.h" #include "utils/numeric.h" @@ -112,6 +113,8 @@ db_dir_size(const char *path) return dirsize; } +dbsize_hook_type dbsize_hook = NULL; + /* * calculate size of database in all tablespaces */ @@ -141,6 +144,13 @@ calculate_database_size(Oid dbOid) /* Include pg_default storage */ snprintf(pathname, sizeof(pathname), "base/%u", dbOid); + + if (dbsize_hook) + { + totalsize = (*dbsize_hook)(dbOid); + return totalsize; + } + totalsize = db_dir_size(pathname); /* Scan the non-default tablespaces */ @@ -306,41 +316,18 @@ pg_tablespace_size_name(PG_FUNCTION_ARGS) * is no check here or at the call sites for that. 
*/ static int64 -calculate_relation_size(RelFileLocator *rfn, BackendId backend, ForkNumber forknum) +calculate_relation_size(RelFileLocator *rfn, BackendId backend, + ForkNumber forknum, char relpersistence) { - int64 totalsize = 0; - char *relationpath; - char pathname[MAXPGPATH]; - unsigned int segcount = 0; + SMgrRelation srel = smgropen(*rfn, backend, relpersistence); - relationpath = relpathbackend(*rfn, backend, forknum); - - for (segcount = 0;; segcount++) + if (smgrexists(srel, forknum)) { - struct stat fst; - - CHECK_FOR_INTERRUPTS(); - - if (segcount == 0) - snprintf(pathname, MAXPGPATH, "%s", - relationpath); - else - snprintf(pathname, MAXPGPATH, "%s.%u", - relationpath, segcount); - - if (stat(pathname, &fst) < 0) - { - if (errno == ENOENT) - break; - else - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not stat file \"%s\": %m", pathname))); - } - totalsize += fst.st_size; + BlockNumber n = smgrnblocks(srel, forknum); + return (int64) n * BLCKSZ; } - return totalsize; + return 0; } Datum @@ -364,7 +351,8 @@ pg_relation_size(PG_FUNCTION_ARGS) PG_RETURN_NULL(); size = calculate_relation_size(&(rel->rd_locator), rel->rd_backend, - forkname_to_number(text_to_cstring(forkName))); + forkname_to_number(text_to_cstring(forkName)), + rel->rd_rel->relpersistence); relation_close(rel, AccessShareLock); @@ -389,7 +377,8 @@ calculate_toast_table_size(Oid toastrelid) /* toast heap size, including FSM and VM size */ for (forkNum = 0; forkNum <= MAX_FORKNUM; forkNum++) size += calculate_relation_size(&(toastRel->rd_locator), - toastRel->rd_backend, forkNum); + toastRel->rd_backend, forkNum, + toastRel->rd_rel->relpersistence); /* toast index size, including FSM and VM size */ indexlist = RelationGetIndexList(toastRel); @@ -403,7 +392,8 @@ calculate_toast_table_size(Oid toastrelid) AccessShareLock); for (forkNum = 0; forkNum <= MAX_FORKNUM; forkNum++) size += calculate_relation_size(&(toastIdxRel->rd_locator), - toastIdxRel->rd_backend, forkNum); + toastIdxRel->rd_backend, forkNum, + toastIdxRel->rd_rel->relpersistence); relation_close(toastIdxRel, AccessShareLock); } @@ -432,7 +422,7 @@ calculate_table_size(Relation rel) */ for (forkNum = 0; forkNum <= MAX_FORKNUM; forkNum++) size += calculate_relation_size(&(rel->rd_locator), rel->rd_backend, - forkNum); + forkNum, rel->rd_rel->relpersistence); /* * Size of toast relation @@ -472,7 +462,8 @@ calculate_indexes_size(Relation rel) for (forkNum = 0; forkNum <= MAX_FORKNUM; forkNum++) size += calculate_relation_size(&(idxRel->rd_locator), idxRel->rd_backend, - forkNum); + forkNum, + idxRel->rd_rel->relpersistence); relation_close(idxRel, AccessShareLock); } diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index 8e08ca1c680..6e49533885b 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -3783,7 +3783,7 @@ RelationSetNewRelfilenumber(Relation relation, char persistence) * fails at this stage, the new cluster will need to be recreated * anyway. 
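
Looping back to the smgr additions a few hunks up: smgr_start_unlogged_build(), smgr_finish_unlogged_build_phase_1() and smgr_end_unlogged_build() let a storage manager treat an index build specially (for example, buffering it locally instead of WAL-logging every page). Their call sites are elsewhere in the patch series, so the bracketing below is only an assumed illustration of how an index build would use them.

    /* Assumed usage around an index build (call sites are not in this diff). */
    SMgrRelation srel = RelationGetSmgr(indexrel);

    smgr_start_unlogged_build(srel);
    /* ... phase 1: build the index without WAL-logging individual pages ... */
    smgr_finish_unlogged_build_phase_1(srel);
    /* ... phase 2: WAL-log the finished result, e.g. via log_newpage_range() ... */
    smgr_end_unlogged_build(srel);
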
*/ - srel = smgropen(relation->rd_locator, relation->rd_backend); + srel = smgropen(relation->rd_locator, relation->rd_backend, persistence); smgrdounlinkall(&srel, 1, false); smgrclose(srel); } diff --git a/src/backend/utils/fmgr/dfmgr.c b/src/backend/utils/fmgr/dfmgr.c index b85d52c913c..2fdfa90e28d 100644 --- a/src/backend/utils/fmgr/dfmgr.c +++ b/src/backend/utils/fmgr/dfmgr.c @@ -36,6 +36,7 @@ #include "storage/shmem.h" #include "utils/hsearch.h" +download_extension_file_hook_type download_extension_file_hook = NULL; /* signature for PostgreSQL-specific library init function */ typedef void (*PG_init_t) (void); @@ -79,11 +80,13 @@ static void *internal_load_library(const char *libname); static void incompatible_module_error(const char *libname, const Pg_magic_struct *module_magic_data) pg_attribute_noreturn(); static bool file_exists(const char *name); -static char *expand_dynamic_library_name(const char *name); +static char *expand_dynamic_library_name(const char *name, bool *is_found); static void check_restricted_library_name(const char *name); static char *substitute_libpath_macro(const char *name); static char *find_in_dynamic_libpath(const char *basename); +static void neon_try_load(const char *name); + /* Magic structure that module needs to match to be accepted */ static const Pg_magic_struct magic_data = PG_MODULE_MAGIC_DATA; @@ -108,9 +111,20 @@ load_external_function(const char *filename, const char *funcname, char *fullname; void *lib_handle; void *retval; + bool is_found = true; /* Expand the possibly-abbreviated filename to an exact path name */ - fullname = expand_dynamic_library_name(filename); + fullname = expand_dynamic_library_name(filename, &is_found); + + // if file is not found, try to download it from compute_ctl + if (!is_found && download_extension_file_hook != NULL) + { + // try to download the file + elog(DEBUG3, "load_external_function: try to download file: %s", fullname); + neon_try_load(fullname); + // try to find file locally once again + fullname = expand_dynamic_library_name(filename, &is_found); + } /* Load the shared library, unless we already did */ lib_handle = internal_load_library(fullname); @@ -132,6 +146,47 @@ load_external_function(const char *filename, const char *funcname, return retval; } +void +neon_try_load(const char *name) +{ + bool have_slash; + char *request_name; + + // add .so suffix if it is not present + if (strstr(name, DLSUFFIX) == NULL) + { + request_name = psprintf("%s%s", name, DLSUFFIX); + elog(DEBUG3, "neon_try_load: add DLSUFFIX: %s", request_name); + } + else + { + request_name = pstrdup(name); + elog(DEBUG3, "neon_try_load: DLSUFFIX already present: %s", request_name); + } + + have_slash = (first_dir_separator(request_name) != NULL); + + if (strncmp(request_name, "$libdir/", strlen("$libdir/")) == 0) + { + char *new_request_name = psprintf("%s", request_name + strlen("$libdir/")); + pfree(request_name); + request_name = new_request_name; + + elog(DEBUG3, "neon_try_load: omit $libdir/: %s", request_name); + } + else if (have_slash) + { + ereport(ERROR, + (errcode(ERRCODE_INVALID_NAME), + errmsg("unexpected path in dynamic library name: %s", + name))); + } + + elog(DEBUG3, "neon_try_load: final request_name: %s", request_name); + + download_extension_file_hook(request_name, true); +} + /* * This function loads a shlib file without looking up any particular * function in it. 
If the same shlib has previously been loaded, @@ -144,13 +199,24 @@ void load_file(const char *filename, bool restricted) { char *fullname; + bool is_found = true; /* Apply security restriction if requested */ if (restricted) check_restricted_library_name(filename); /* Expand the possibly-abbreviated filename to an exact path name */ - fullname = expand_dynamic_library_name(filename); + fullname = expand_dynamic_library_name(filename, &is_found); + + // if file is not found, try to download it from compute_ctl + if (!is_found && download_extension_file_hook != NULL) + { + // try to download the file + elog(DEBUG3, "load_file: try to download file: %s", fullname); + neon_try_load(fullname); + // try to find file locally once again + fullname = expand_dynamic_library_name(filename, &is_found); + } /* Load the shared library */ (void) internal_load_library(fullname); @@ -168,7 +234,6 @@ lookup_external_function(void *filehandle, const char *funcname) return dlsym(filehandle, funcname); } - /* * Load the specified dynamic-link library file, unless it already is * loaded. Return the pg_dl* handle for the file. @@ -209,6 +274,7 @@ internal_load_library(const char *libname) errmsg("could not access file \"%s\": %m", libname))); + for (file_scanner = file_list; file_scanner != NULL && !SAME_INODE(stat_buf, *file_scanner); @@ -428,7 +494,7 @@ file_exists(const char *name) * The result will always be freshly palloc'd. */ static char * -expand_dynamic_library_name(const char *name) +expand_dynamic_library_name(const char *name, bool *is_found) { bool have_slash; char *new; @@ -474,9 +540,11 @@ expand_dynamic_library_name(const char *name) * If we can't find the file, just return the string as-is. The ensuing * load attempt will fail and report a suitable message. */ + *is_found = false; return pstrdup(name); } + /* * Check a restricted library name. 
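Since expand_dynamic_library_name() now reports whether the library was actually found, load_external_function() and load_file() can ask a control-plane process to fetch a missing file and then retry the lookup. The hook type is declared in fmgr.h further down in this patch; the snippet below is only a schematic implementation, with the download itself left as a comment:

/* Hypothetical extension code: fetching missing libraries on demand. */
#include "postgres.h"
#include "fmgr.h"

static bool
my_download_extension_file(const char *filename, bool is_library)
{
	/*
	 * A real implementation would download "filename" (a shared library
	 * when is_library is true, an extension script otherwise) into the
	 * expected installation directory and return true on success.
	 */
	elog(DEBUG1, "would download %s \"%s\"",
		 is_library ? "library" : "extension file", filename);
	return false;
}

void
_PG_init(void)
{
	download_extension_file_hook = my_download_extension_file;
}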
It must begin with "$libdir/plugins/" * and there must not be any directory separators after that (this is diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index b078b934a79..5c4fea511c8 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -31,6 +31,7 @@ #include "access/toast_compression.h" #include "access/twophase.h" #include "access/xlog_internal.h" +#include "access/xloginsert.h" #include "access/xlogprefetcher.h" #include "access/xlogrecovery.h" #include "archive/archive_module.h" @@ -69,6 +70,7 @@ #include "storage/large_object.h" #include "storage/pg_shmem.h" #include "storage/predicate.h" +#include "storage/smgr.h" #include "storage/standby.h" #include "tcop/tcopprot.h" #include "tsearch/ts_cache.h" @@ -800,6 +802,36 @@ StaticAssertDecl(lengthof(config_type_names) == (PGC_ENUM + 1), struct config_bool ConfigureNamesBool[] = { + { + {"enable_seqscan_prefetch", PGC_USERSET, RESOURCES_ASYNCHRONOUS, + gettext_noop("Enables prefetching of next pages in sequential scans."), + NULL, + GUC_EXPLAIN + }, + &enable_seqscan_prefetch, + true, + NULL, NULL, NULL + }, + { + {"enable_indexscan_prefetch", PGC_USERSET, RESOURCES_ASYNCHRONOUS, + gettext_noop("Enables prefetching of heap pages in index scans."), + NULL, + GUC_EXPLAIN + }, + &enable_indexscan_prefetch, + true, + NULL, NULL, NULL + }, + { + {"enable_indexonlyscan_prefetch", PGC_USERSET, RESOURCES_ASYNCHRONOUS, + gettext_noop("Enables prefetching of leaf pages in index-only scans."), + NULL, + GUC_EXPLAIN + }, + &enable_indexonlyscan_prefetch, + true, + NULL, NULL, NULL + }, { {"enable_seqscan", PGC_USERSET, QUERY_TUNING_METHOD, gettext_noop("Enables the planner's use of sequential-scan plans."), @@ -1997,6 +2029,16 @@ struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, + { + {"neon_test_evict", PGC_POSTMASTER, UNGROUPED, + gettext_noop("Evict unpinned pages (for better test coverage)."), + }, + &zenith_test_evict, + false, + NULL, NULL, NULL + }, + + /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL, NULL @@ -2263,6 +2305,16 @@ struct config_int ConfigureNamesInt[] = NULL, NULL, NULL }, + { + {"lsn_cache_size", PGC_POSTMASTER, UNGROUPED, + gettext_noop("Size of the last-written LSN cache used by Neon."), + NULL + }, + &lastWrittenLsnCacheSize, + 128*1024, 1024, INT_MAX, + NULL, NULL, NULL + }, + { {"temp_buffers", PGC_USERSET, RESOURCES_MEM, gettext_noop("Sets the maximum number of temporary buffers used by each session."), @@ -2811,6 +2863,42 @@ struct config_int ConfigureNamesInt[] = NULL, NULL, NULL }, + { + {"max_replication_apply_lag", PGC_POSTMASTER, REPLICATION_SENDING, + gettext_noop("Maximal apply lag between master and replicas."), + gettext_noop("When the lag between the minimal apply position of a replica and the current LSN exceeds this value, " + "backends are blocked."), + GUC_UNIT_MB, + }, + &max_replication_apply_lag, + -1, -1, INT_MAX, /* it should not be smaller than the maximal size of a WAL record */ + NULL, NULL, NULL + }, + + { + {"max_replication_flush_lag", PGC_POSTMASTER, REPLICATION_SENDING, + gettext_noop("Maximal flush lag between master and replicas."), + gettext_noop("When the lag between the minimal flush position of a replica and the current LSN exceeds this value, " + "backends are blocked."), + GUC_UNIT_MB, + }, + &max_replication_flush_lag, + -1, -1, INT_MAX, /* it should not be smaller than the maximal size of a WAL record */ + NULL, NULL, NULL + }, + + { + {"max_replication_write_lag", PGC_POSTMASTER, REPLICATION_SENDING, +
gettext_noop("Maximal write lag between master and replicas."), + gettext_noop("When lag between minimal write position of replica and current LSN exceeds this value," + "backends are blocked"), + GUC_UNIT_MB, + }, + &max_replication_write_lag, + -1, -1, INT_MAX, /* it should not be smaller than maximal size of WAL record */ + NULL, NULL, NULL + }, + { {"max_slot_wal_keep_size", PGC_SIGHUP, REPLICATION_SENDING, gettext_noop("Sets the maximum WAL size that can be reserved by replication slots."), diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index 3afe14cb4b7..a4b5d6c4590 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -3092,6 +3092,7 @@ main(int argc, char *argv[]) {"locale-provider", required_argument, NULL, 15}, {"icu-locale", required_argument, NULL, 16}, {"icu-rules", required_argument, NULL, 17}, + {"sysid", required_argument, NULL, 18}, {NULL, 0, NULL, 0} }; @@ -3271,6 +3272,9 @@ main(int argc, char *argv[]) case 17: icu_rules = pg_strdup(optarg); break; + case 18: + boot_options = psprintf("%s -s %s", boot_options, optarg); + break; default: /* getopt_long already emitted a complaint */ pg_log_error_hint("Try \"%s --help\" for more information.", progname); diff --git a/src/bin/pg_waldump/pg_waldump.c b/src/bin/pg_waldump/pg_waldump.c index 96845e1a1ae..221aec84a21 100644 --- a/src/bin/pg_waldump/pg_waldump.c +++ b/src/bin/pg_waldump/pg_waldump.c @@ -29,9 +29,12 @@ #include "common/logging.h" #include "common/relpath.h" #include "getopt_long.h" +#include "port/pg_bitutils.h" #include "rmgrdesc.h" #include "storage/bufpage.h" +#define OFFSET_INVALID ((size_t)-1) + /* * NOTE: For any code change or issue fix here, it is highly recommended to * give a thought about doing the same in pg_walinspect contrib module as well. 
@@ -50,8 +53,10 @@ typedef struct XLogDumpPrivate XLogRecPtr startptr; XLogRecPtr endptr; bool endptr_reached; + char* input_filename; } XLogDumpPrivate; + typedef struct XLogDumpConfig { /* display options */ @@ -63,6 +68,8 @@ typedef struct XLogDumpConfig bool stats; bool stats_per_record; + bool ignore_format_errors; + /* filter options */ bool filter_by_rmgr[RM_MAX_ID + 1]; bool filter_by_rmgr_enabled; @@ -94,6 +101,34 @@ sigint_handler(SIGNAL_ARGS) } #endif +/* calculate ceil(log base 2) of num */ +static int +my_log2(long num) +{ + /* + * guard against too-large input, which would be invalid for + * pg_ceil_log2_*() + */ + if (num > LONG_MAX / 2) + num = LONG_MAX / 2; + +#if SIZEOF_LONG < 8 + return pg_ceil_log2_32(num); +#else + return pg_ceil_log2_64(num); +#endif +} + +/* calculate first power of 2 >= num, bounded to what will fit in an int */ +static int +next_pow2_int(long num) +{ + if (num > INT_MAX / 2) + num = INT_MAX / 2; + return 1 << my_log2(num); +} + + static void print_rmgr_list(void) { @@ -337,6 +372,18 @@ WALDumpOpenSegment(XLogReaderState *state, XLogSegNo nextSegNo, TimeLineID tli = *tli_p; char fname[MAXPGPATH]; int tries; + XLogDumpPrivate *private = state->private_data; + + if(private->input_filename) + { + Assert(nextSegNo == 0); + + state->seg.ws_file = open_file_in_directory(state->segcxt.ws_dir, private->input_filename); + if (state->seg.ws_file >= 0) + return; + + pg_fatal("could not open file \"%s\": %m", private->input_filename); + } XLogFileName(fname, tli, nextSegNo, state->segcxt.ws_segsize); @@ -407,6 +454,7 @@ WALDumpReadPage(XLogReaderState *state, XLogRecPtr targetPagePtr, int reqLen, { WALOpenSegment *seg = &errinfo.wre_seg; char fname[MAXPGPATH]; + char *actual_fname = private->input_filename ? private->input_filename : fname; XLogFileName(fname, seg->ws_tli, seg->ws_segno, state->segcxt.ws_segsize); @@ -415,11 +463,11 @@ WALDumpReadPage(XLogReaderState *state, XLogRecPtr targetPagePtr, int reqLen, { errno = errinfo.wre_errno; pg_fatal("could not read from file %s, offset %d: %m", - fname, errinfo.wre_off); + actual_fname, errinfo.wre_off); } else pg_fatal("could not read from file %s, offset %d: read %d of %d", - fname, errinfo.wre_off, errinfo.wre_read, + actual_fname, errinfo.wre_off, errinfo.wre_read, errinfo.wre_req); } @@ -495,7 +543,7 @@ XLogRecordSaveFPWs(XLogReaderState *record, const char *savepath) char forkname[FORKNAMECHARS + 2]; /* _ + terminating zero */ FILE *file; BlockNumber blk; - RelFileLocator rnode; + RelFileLocator rlocator; ForkNumber fork; if (!XLogRecHasBlockRef(record, block_id)) @@ -511,7 +559,7 @@ XLogRecordSaveFPWs(XLogReaderState *record, const char *savepath) pg_fatal("%s", record->errormsg_buf); (void) XLogRecGetBlockTagExtended(record, block_id, - &rnode, &fork, &blk, NULL); + &rlocator, &fork, &blk, NULL); if (fork >= 0 && fork <= MAX_FORKNUM) sprintf(forkname, "_%s", forkNames[fork]); @@ -521,7 +569,7 @@ XLogRecordSaveFPWs(XLogReaderState *record, const char *savepath) snprintf(filename, MAXPGPATH, "%s/%08X-%08X-%08X.%u.%u.%u.%u%s", savepath, record->seg.ws_tli, LSN_FORMAT_ARGS(record->ReadRecPtr), - rnode.spcOid, rnode.dbOid, rnode.relNumber, blk, forkname); + rlocator.spcOid, rlocator.dbOid, rlocator.relNumber, blk, forkname); file = fopen(filename, PG_BINARY_W); if (!file) @@ -547,16 +595,26 @@ XLogDumpDisplayRecord(XLogDumpConfig *config, XLogReaderState *record) uint32 fpi_len; uint8 info = XLogRecGetInfo(record); XLogRecPtr xl_prev = XLogRecGetPrev(record); + XLogDumpPrivate *private = 
record->private_data; StringInfoData s; XLogRecGetLen(record, &rec_len, &fpi_len); - printf("rmgr: %-11s len (rec/tot): %6u/%6u, tx: %10u, lsn: %X/%08X, prev %X/%08X, ", - desc->rm_name, - rec_len, XLogRecGetTotalLen(record), - XLogRecGetXid(record), - LSN_FORMAT_ARGS(record->ReadRecPtr), - LSN_FORMAT_ARGS(xl_prev)); + if(private->input_filename) + printf("rmgr: %-11s len (rec/tot): %6u/%6u, tx: %10u, offset: 0x%lX, prev %X/%08X, ", + desc->rm_name, + rec_len, XLogRecGetTotalLen(record), + XLogRecGetXid(record), + record->ReadRecPtr, + LSN_FORMAT_ARGS(xl_prev)); + else + printf("rmgr: %-11s len (rec/tot): %6u/%6u, tx: %10u, lsn: %X/%08X, prev %X/%08X, ", + desc->rm_name, + rec_len, XLogRecGetTotalLen(record), + XLogRecGetXid(record), + LSN_FORMAT_ARGS(record->ReadRecPtr), + LSN_FORMAT_ARGS(xl_prev)); + id = desc->rm_identify(info); if (id == NULL) @@ -762,7 +820,10 @@ usage(void) printf(_(" -f, --follow keep retrying after reaching end of WAL\n")); printf(_(" -F, --fork=FORK only show records that modify blocks in fork FORK;\n" " valid names are main, fsm, vm, init\n")); + printf(_(" -i, --ignore ignore format errors, skip invalid structures\n")); + printf(_(" -N, --file=FNAME dump log records from a single file\n")); printf(_(" -n, --limit=N number of records to display\n")); + printf(_(" -o, --offset=OFFSET offset of the first record to in a file to dump\n")); printf(_(" -p, --path=PATH directory in which to find WAL segment files or a\n" " directory with a ./pg_wal that contains such files\n" " (default: current directory, ./pg_wal, $PGDATA/pg_wal)\n")); @@ -797,6 +858,9 @@ main(int argc, char **argv) XLogRecPtr first_record; char *waldir = NULL; char *errormsg; + char *fName = NULL; + bool single_file = false; + size_t start_offset = OFFSET_INVALID; static struct option long_options[] = { {"bkp-details", no_argument, NULL, 'b'}, @@ -804,6 +868,9 @@ main(int argc, char **argv) {"end", required_argument, NULL, 'e'}, {"follow", no_argument, NULL, 'f'}, {"fork", required_argument, NULL, 'F'}, + {"file", required_argument, NULL, 'N'}, + {"ignore", no_argument, NULL, 'i'}, + {"offset", required_argument, NULL, 'o'}, {"fullpage", no_argument, NULL, 'w'}, {"help", no_argument, NULL, '?'}, {"limit", required_argument, NULL, 'n'}, @@ -853,6 +920,7 @@ main(int argc, char **argv) private.startptr = InvalidXLogRecPtr; private.endptr = InvalidXLogRecPtr; private.endptr_reached = false; + private.input_filename = NULL; config.quiet = false; config.bkp_details = false; @@ -871,6 +939,7 @@ main(int argc, char **argv) config.save_fullpage_path = NULL; config.stats = false; config.stats_per_record = false; + config.ignore_format_errors = false; stats.startptr = InvalidXLogRecPtr; stats.endptr = InvalidXLogRecPtr; @@ -881,7 +950,7 @@ main(int argc, char **argv) goto bad_argument; } - while ((option = getopt_long(argc, argv, "bB:e:fF:n:p:qr:R:s:t:wx:z", + while ((option = getopt_long(argc, argv, "bB:e:fF:in:N:o:p:qr:R:s:t:wx:z", long_options, &optindex)) != -1) { switch (option) @@ -920,6 +989,13 @@ main(int argc, char **argv) } config.filter_by_extended = true; break; + case 'N': + fName = pg_strdup(optarg); + single_file = true; + break; + case 'i': + config.ignore_format_errors = true; + break; case 'n': if (sscanf(optarg, "%d", &config.stop_after_records) != 1) { @@ -927,6 +1003,13 @@ main(int argc, char **argv) goto bad_argument; } break; + case 'o': + if (sscanf(optarg, "%zu", &start_offset) != 1) + { + pg_log_error("could not parse offset \"%s\"", optarg); + goto bad_argument; + } + break; case 'p': 
waldir = pg_strdup(optarg); break; @@ -1092,6 +1175,73 @@ main(int argc, char **argv) goto bad_argument; } + if (start_offset != OFFSET_INVALID) + { + if(!XLogRecPtrIsInvalid(private.startptr) || !XLogRecPtrIsInvalid(private.endptr)) + { + pg_log_error("either file offset or start/end pointers should be specified"); + goto bad_argument; + } + + if(!single_file) + { + pg_log_error("offset option could only be used with filename option"); + goto bad_argument; + } + + /* Log records are maxaligned, start at the closest next position */ + private.startptr = MAXALIGN(start_offset); + } + + if(single_file) + { + char *directory = NULL; + int fd; + struct stat stat; + + if(config.follow) + { + pg_log_error("Follow could not be used in file dump mode"); + goto bad_argument; + } + + if (waldir != NULL) + { + pg_log_error("either single file or wal directory should be specified"); + goto bad_argument; + } + + split_path(fName, &directory, &private.input_filename); + waldir = directory; + + if(waldir == NULL) + { + char *cwd = malloc(MAXPGPATH); + + if (!getcwd(cwd, MAXPGPATH)) + pg_fatal("could identify current directory: %m"); + + waldir = cwd; + } + + if (!verify_directory(waldir)) + pg_fatal("could not open directory \"%s\": %m", waldir); + + fd = open_file_in_directory(waldir, private.input_filename); + if (fd < 0) + pg_fatal("could not open file \"%s\"", private.input_filename); + + if(fstat(fd, &stat) != 0) + pg_fatal("could not stat file \"%s\"", private.input_filename); + + private.endptr = stat.st_size; + + /* Round up segment size to next power of 2 or 1MB */ + WalSegSz = Max(next_pow2_int(private.endptr), 1024 * 1024); + + close(fd); + } + if (waldir != NULL) { /* validate path points to directory */ @@ -1113,6 +1263,12 @@ main(int argc, char **argv) int fd; XLogSegNo segno; + if(single_file) + { + pg_log_error("either single file or start/end boundaries should be specified"); + goto bad_argument; + } + split_path(argv[optind], &directory, &fname); if (waldir == NULL && directory != NULL) @@ -1185,10 +1341,11 @@ main(int argc, char **argv) } } else - waldir = identify_target_directory(waldir, NULL); + if (!single_file) + waldir = identify_target_directory(waldir, NULL); /* we don't know what to print */ - if (XLogRecPtrIsInvalid(private.startptr)) + if (XLogRecPtrIsInvalid(private.startptr) && !single_file) { pg_log_error("no start WAL location given"); goto bad_argument; @@ -1206,12 +1363,27 @@ main(int argc, char **argv) if (!xlogreader_state) pg_fatal("out of memory while allocating a WAL reading processor"); - /* first find a valid recptr to start from */ - first_record = XLogFindNextRecord(xlogreader_state, private.startptr); + if(single_file) + { + if(config.ignore_format_errors) + { + xlogreader_state->skip_page_validation = true; + xlogreader_state->skip_invalid_records = true; + } + + xlogreader_state->skip_lsn_checks = true; + first_record = private.startptr; + XLogBeginRead(xlogreader_state, first_record); + } + else + { + /* first find a valid recptr to start from */ + first_record = XLogFindNextRecord(xlogreader_state, private.startptr); - if (first_record == InvalidXLogRecPtr) - pg_fatal("could not find a valid record after %X/%X", - LSN_FORMAT_ARGS(private.startptr)); + if (first_record == InvalidXLogRecPtr) + pg_fatal("could not find a valid record after %X/%X", + LSN_FORMAT_ARGS(private.startptr)); + } /* * Display a message that we're skipping data if `from` wasn't a pointer diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 
faf50265191..bab88d9c01e 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -72,6 +72,10 @@ typedef struct HeapScanDescData */ ParallelBlockTableScanWorkerData *rs_parallelworkerdata; + /* prefetch info */ + int rs_prefetch_maximum; /* io_concurrency of tablespace */ + int rs_prefetch_target; /* current readahead target */ + /* these fields only used in page-at-a-time mode and for bitmap scans */ int rs_cindex; /* current tuple's index in vistuples */ int rs_ntuples; /* number of visible tuples on page */ diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 9020abebc92..6a50f7a03ba 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -1076,6 +1076,22 @@ typedef struct BTScanOpaqueData /* keep these last in struct for efficiency */ BTScanPosData currPos; /* current position data */ BTScanPosData markPos; /* marked position, if any */ + + /* Neon: prefetch state */ + int prefetch_maximum; /* maximal number of prefetch requests */ + + /* Prefetch of referenced heap pages for index scans */ + /* To minimize wasted prefetch requests we start with prefetch distance 0 + * and increase it until it reaches prefetch_maximum + */ + int current_prefetch_distance; + + /* Prefetch of B-Tree leaf pages for index-only scans */ + int n_prefetch_requests; /* number of active prefetch requests */ + int n_prefetch_blocks; /* number of elements in prefetch_blocks */ + int last_prefetch_index; /* current position in prefetch_blocks (prefetch_blocks[0..last_prefetch_index] are already requested) */ + BlockNumber next_parent; /* pointer to next parent page */ + BlockNumber prefetch_blocks[MaxTIDsPerBTreePage + 1]; /* leaves + parent page */ } BTScanOpaqueData; typedef BTScanOpaqueData *BTScanOpaque; @@ -1242,6 +1258,7 @@ extern int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber extern bool _bt_first(IndexScanDesc scan, ScanDirection dir); extern bool _bt_next(IndexScanDesc scan, ScanDirection dir); extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost, + BlockNumber* parent, Snapshot snapshot); /* diff --git a/src/include/access/neon_xlog.h b/src/include/access/neon_xlog.h new file mode 100644 index 00000000000..465e1752b95 --- /dev/null +++ b/src/include/access/neon_xlog.h @@ -0,0 +1,132 @@ +#ifndef NEON_XLOG_H +#define NEON_XLOG_H + +/* + * The RMGR id of the Neon RMGR + * + * Reserved at https://wiki.postgresql.org/wiki/CustomWALResourceManagers + */ +#define RM_NEON_ID 134 + +#define XLOG_NEON_INIT_PAGE 0x80 +#define XLOG_NEON_OPMASK ((~XLOG_NEON_INIT_PAGE) & XLR_RMGR_INFO_MASK) + +/* from XLOG_HEAP_* */ +#define XLOG_NEON_HEAP_INSERT 0x00 +#define XLOG_NEON_HEAP_DELETE 0x10 +#define XLOG_NEON_HEAP_UPDATE 0x20 +#define XLOG_NEON_HEAP_HOT_UPDATE 0x30 +#define XLOG_NEON_HEAP_LOCK 0x40 +/* from XLOG_HEAP2_* */ +#define XLOG_NEON_HEAP_MULTI_INSERT 0x50 +/* 2 variants available */ + +/* */ +typedef struct xl_neon_heap_header { + uint16 t_infomask2; + uint16 t_infomask; + uint32 t_cid; + uint8 t_hoff; +} xl_neon_heap_header; + +#define SizeOfNeonHeapHeader (offsetof(xl_neon_heap_header, t_hoff) + sizeof(uint8)) + +/* This is what we need to know about insert */ +typedef struct xl_neon_heap_insert +{ + OffsetNumber offnum; /* inserted tuple's offset */ + uint8 flags; + + /* xl_neon_heap_header & TUPLE DATA in backup block 0 */ +} xl_neon_heap_insert; + +#define SizeOfNeonHeapInsert (offsetof(xl_neon_heap_insert, flags) + sizeof(uint8)) + +/* This is what we need to know about delete */ +typedef
struct xl_neon_heap_delete +{ + TransactionId xmax; /* xmax of the deleted tuple */ + OffsetNumber offnum; /* deleted tuple's offset */ + uint8 infobits_set; /* infomask bits */ + uint8 flags; + uint32 t_cid; +} xl_neon_heap_delete; + +#define SizeOfNeonHeapDelete (offsetof(xl_neon_heap_delete, t_cid) + sizeof(uint32)) + +/* + * This is what we need to know about update|hot_update + * + * Backup blk 0: new page + * + * If XLH_UPDATE_PREFIX_FROM_OLD or XLH_UPDATE_SUFFIX_FROM_OLD flags are set, + * the prefix and/or suffix come first, as one or two uint16s. + * + * After that, xl_neon_heap_header and new tuple data follow. The new tuple + * data doesn't include the prefix and suffix, which are copied from the + * old tuple on replay. + * + * If XLH_UPDATE_CONTAINS_NEW_TUPLE flag is given, the tuple data is + * included even if a full-page image was taken. + * + * Backup blk 1: old page, if different. (no data, just a reference to the blk) + */ +typedef struct xl_neon_heap_update +{ + TransactionId old_xmax; /* xmax of the old tuple */ + OffsetNumber old_offnum; /* old tuple's offset */ + uint8 old_infobits_set; /* infomask bits to set on old tuple */ + uint8 flags; + uint32 t_cid; + TransactionId new_xmax; /* xmax of the new tuple */ + OffsetNumber new_offnum; /* new tuple's offset */ + + /* + * If XLH_UPDATE_CONTAINS_OLD_TUPLE or XLH_UPDATE_CONTAINS_OLD_KEY flags + * are set, xl_neon_heap_header and tuple data for the old tuple follow. + */ +} xl_neon_heap_update; +#define SizeOfNeonHeapUpdate (offsetof(xl_neon_heap_update, new_offnum) + sizeof(OffsetNumber)) + +typedef struct xl_neon_heap_lock +{ + TransactionId xmax; /* might be a MultiXactId */ + uint32 t_cid; + OffsetNumber offnum; /* locked tuple's offset on page */ + uint8 infobits_set; /* infomask and infomask2 bits to set */ + uint8 flags; /* XLH_LOCK_* flag bits */ +} xl_neon_heap_lock; +#define SizeOfNeonHeapLock (offsetof(xl_neon_heap_lock, flags) + sizeof(uint8)) + +/* + * This is what we need to know about a multi-insert. + * + * The main data of the record consists of this xl_neon_heap_multi_insert header. + * 'offsets' array is omitted if the whole page is reinitialized + * (XLOG_HEAP_INIT_PAGE). + * + * In block 0's data portion, there is an xl_neon_multi_insert_tuple struct, + * followed by the tuple data for each tuple. There is padding to align + * each xl_neon_multi_insert_tuple struct. 
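For orientation, the xl_neon_heap_insert and xl_neon_heap_header records defined above are meant to be emitted through the ordinary XLogInsert() machinery under the custom RM_NEON_ID resource manager. The following is a simplified sketch, not code from this patch; flag handling, the command id, and error paths are glossed over:

/* Hypothetical sketch: assembling an XLOG_NEON_HEAP_INSERT record. */
#include "postgres.h"
#include "access/htup_details.h"
#include "access/neon_xlog.h"
#include "access/xloginsert.h"
#include "storage/bufmgr.h"

static XLogRecPtr
emit_neon_heap_insert(Buffer buffer, OffsetNumber offnum, HeapTuple tuple)
{
	xl_neon_heap_insert xlrec;
	xl_neon_heap_header xlhdr;

	xlrec.offnum = offnum;
	xlrec.flags = 0;

	xlhdr.t_infomask2 = tuple->t_data->t_infomask2;
	xlhdr.t_infomask = tuple->t_data->t_infomask;
	xlhdr.t_hoff = tuple->t_data->t_hoff;
	xlhdr.t_cid = 0;			/* simplified: real code would record the command id */

	XLogBeginInsert();
	XLogRegisterData((char *) &xlrec, SizeOfNeonHeapInsert);

	/* The header and tuple body ride along with backup block 0. */
	XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
	XLogRegisterBufData(0, (char *) &xlhdr, SizeOfNeonHeapHeader);
	XLogRegisterBufData(0, (char *) tuple->t_data + SizeofHeapTupleHeader,
						tuple->t_len - SizeofHeapTupleHeader);

	return XLogInsert(RM_NEON_ID, XLOG_NEON_HEAP_INSERT);
}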
+ */ +typedef struct xl_neon_heap_multi_insert +{ + uint8 flags; + uint16 ntuples; + uint32 t_cid; + OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER]; +} xl_neon_heap_multi_insert; + +#define SizeOfNeonHeapMultiInsert offsetof(xl_neon_heap_multi_insert, offsets) + +typedef struct xl_neon_multi_insert_tuple +{ + uint16 datalen; /* size of tuple data that follows */ + uint16 t_infomask2; + uint16 t_infomask; + uint8 t_hoff; + /* TUPLE DATA FOLLOWS AT END OF STRUCT */ +} xl_neon_multi_insert_tuple; +#define SizeOfNeonMultiInsertTuple (offsetof(xl_neon_multi_insert_tuple, t_hoff) + sizeof(uint8)) + +#endif //NEON_XLOG_H diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 48ca8523810..7643bb0c85f 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -16,6 +16,8 @@ #include "datatype/timestamp.h" #include "lib/stringinfo.h" #include "nodes/pg_list.h" +#include "storage/block.h" +#include "storage/relfilelocator.h" /* Sync methods */ @@ -30,6 +32,15 @@ extern PGDLLIMPORT XLogRecPtr ProcLastRecPtr; extern PGDLLIMPORT XLogRecPtr XactLastRecEnd; extern PGDLLIMPORT XLogRecPtr XactLastCommitEnd; +/* + * Pseudo block number used to associate LSN with relation metadata (relation size) + */ +#define REL_METADATA_PSEUDO_BLOCKNO InvalidBlockNumber + +extern bool ZenithRecoveryRequested; +extern XLogRecPtr zenithLastRec; +extern bool zenithWriteOk; + /* these variables are GUC parameters related to XLOG */ extern PGDLLIMPORT int wal_segment_size; extern PGDLLIMPORT int min_wal_size_mb; @@ -53,6 +64,8 @@ extern PGDLLIMPORT bool track_wal_io_timing; extern PGDLLIMPORT int wal_decode_buffer_size; extern PGDLLIMPORT int CheckPointSegments; +extern int lastWrittenLsnCacheSize; + /* Archive modes */ typedef enum ArchiveMode @@ -246,6 +259,21 @@ extern XLogRecPtr GetFlushRecPtr(TimeLineID *insertTLI); extern TimeLineID GetWALInsertionTimeLine(void); extern XLogRecPtr GetLastImportantRecPtr(void); +/* neon specifics */ + +extern void SetLastWrittenLSNForBlock(XLogRecPtr lsn, RelFileLocator relfilenode, ForkNumber forknum, BlockNumber blkno); +extern void SetLastWrittenLSNForBlockRange(XLogRecPtr lsn, RelFileLocator relfilenode, ForkNumber forknum, BlockNumber from, BlockNumber n_blocks); +extern void SetLastWrittenLSNForDatabase(XLogRecPtr lsn); +extern void SetLastWrittenLSNForRelation(XLogRecPtr lsn, RelFileLocator relfilenode, ForkNumber forknum); +extern XLogRecPtr GetLastWrittenLSN(RelFileLocator relfilenode, ForkNumber forknum, BlockNumber blkno); + +extern void SetRedoStartLsn(XLogRecPtr RedoStartLSN); +extern XLogRecPtr GetRedoStartLsn(void); + +extern void SetZenithCurrentClusterSize(uint64 size); +extern uint64 GetZenithCurrentClusterSize(void); + + extern void SetWalWriterSleeping(bool sleeping); /* @@ -296,6 +324,8 @@ extern SessionBackupState get_backup_status(void); #define TABLESPACE_MAP "tablespace_map" #define TABLESPACE_MAP_OLD "tablespace_map.old" +#define ZENITH_SIGNAL_FILE "zenith.signal" + /* files to signal promotion to primary */ #define PROMOTE_SIGNAL_FILE "promote" diff --git a/src/include/access/xloginsert.h b/src/include/access/xloginsert.h index 31785dc578f..e29c27345ce 100644 --- a/src/include/access/xloginsert.h +++ b/src/include/access/xloginsert.h @@ -38,6 +38,10 @@ #define REGBUF_KEEP_DATA 0x10 /* include data even if a full-page image * is taken */ +extern int max_replication_apply_lag; +extern int max_replication_flush_lag; +extern int max_replication_write_lag; + /* prototypes for public functions in xloginsert.c: */ extern void 
XLogBeginInsert(void); extern void XLogSetRecordFlags(uint8 flags); diff --git a/src/include/access/xlogreader.h b/src/include/access/xlogreader.h index da32c7db772..35c25054f17 100644 --- a/src/include/access/xlogreader.h +++ b/src/include/access/xlogreader.h @@ -216,6 +216,10 @@ struct XLogReaderState /* Set when XLP_FIRST_IS_OVERWRITE_CONTRECORD is found */ XLogRecPtr overwrittenRecPtr; + /* Disable validation to allow dumping corrupt WAL */ + bool skip_page_validation; + bool skip_invalid_records; + bool skip_lsn_checks; /* ---------------------------------------- * Decoded representation of current record @@ -441,4 +445,7 @@ extern bool XLogRecGetBlockTagExtended(XLogReaderState *record, uint8 block_id, BlockNumber *blknum, Buffer *prefetch_buffer); +extern DecodedXLogRecord * +XLogReadRecordAlloc(XLogReaderState *state, size_t xl_tot_len, bool allow_oversized); + #endif /* XLOGREADER_H */ diff --git a/src/include/access/xlogrecovery.h b/src/include/access/xlogrecovery.h index 47c29350f5d..becbf4a3735 100644 --- a/src/include/access/xlogrecovery.h +++ b/src/include/access/xlogrecovery.h @@ -136,6 +136,7 @@ extern void ShutdownWalRecovery(void); extern void RemovePromoteSignalFiles(void); extern bool HotStandbyActive(void); +extern void XLogWaitForReplayOf(XLogRecPtr redoEndRecPtr); extern XLogRecPtr GetXLogReplayRecPtr(TimeLineID *replayTLI); extern RecoveryPauseState GetRecoveryPauseState(void); extern void SetRecoveryPause(bool recoveryPause); @@ -155,4 +156,6 @@ extern void RecoveryRequiresIntParameter(const char *param_name, int currValue, extern void xlog_outdesc(StringInfo buf, XLogReaderState *record); +extern void XLogUpdateWalBuffers(char* data, XLogRecPtr start, size_t len); + #endif /* XLOGRECOVERY_H */ diff --git a/src/include/access/xlogutils.h b/src/include/access/xlogutils.h index 5b77b11f508..153ab3e1398 100644 --- a/src/include/access/xlogutils.h +++ b/src/include/access/xlogutils.h @@ -81,6 +81,12 @@ typedef struct ReadLocalXLogPageNoWaitPrivate bool end_of_wal; /* true, when end of WAL is reached */ } ReadLocalXLogPageNoWaitPrivate; +/* + * Returns true if we shouldn't do REDO on that block in record indicated by + * block_id; false otherwise. 
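Looking back at the xlog.h hunk above: the last-written LSN cache (SetLastWrittenLSNForBlock() and friends, sized by the new lsn_cache_size GUC) records, per block, the newest WAL that touched it, so that a pageserver-backed smgr can request the page at a sufficiently recent LSN. A minimal sketch of the bookkeeping, using only the functions declared in this patch:

/* Hypothetical sketch: track and query the last-written LSN of one block. */
#include "postgres.h"
#include "access/xlog.h"
#include "common/relpath.h"
#include "storage/block.h"
#include "storage/relfilelocator.h"

static XLogRecPtr
remember_block_lsn(RelFileLocator rlocator, ForkNumber forknum,
				   BlockNumber blkno, XLogRecPtr written_lsn)
{
	/* Remember the LSN of the WAL record that last modified this block... */
	SetLastWrittenLSNForBlock(written_lsn, rlocator, forknum, blkno);

	/* ...so a later page request can be made at (at least) that LSN. */
	return GetLastWrittenLSN(rlocator, forknum, blkno);
}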
+ */ +extern bool (*redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id); + extern XLogRedoAction XLogReadBufferForRedo(XLogReaderState *record, uint8 block_id, Buffer *buf); extern Buffer XLogInitBufferForRedo(XLogReaderState *record, uint8 block_id); diff --git a/src/include/commands/explain.h b/src/include/commands/explain.h index 3d3e632a0cc..e17e10b1ea4 100644 --- a/src/include/commands/explain.h +++ b/src/include/commands/explain.h @@ -46,6 +46,7 @@ typedef struct ExplainState bool timing; /* print detailed node timing */ bool summary; /* print total planning and execution timing */ bool settings; /* print modified settings */ + bool prefetch; /* print prefetch statistics */ bool generic; /* generate a generic plan */ ExplainFormat format; /* output format */ /* state for output formatting --- not reset for each new plan tree */ diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index 87e5e2183bd..3164c04dacc 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -15,6 +15,14 @@ #include "portability/instr_time.h" +/* Prefetch statistics */ +typedef struct +{ + int64 hits; + int64 misses; + int64 expired; + int64 duplicates; +} PrefetchInfo; /* * BufferUsage and WalUsage counters keep being incremented infinitely, @@ -37,6 +45,7 @@ typedef struct BufferUsage instr_time blk_write_time; /* time spent writing blocks */ instr_time temp_blk_read_time; /* time spent reading temp blocks */ instr_time temp_blk_write_time; /* time spent writing temp blocks */ + PrefetchInfo prefetch; /* prefetch statistics */ } BufferUsage; /* diff --git a/src/include/fmgr.h b/src/include/fmgr.h index b120f5e7fef..be711d2fcfc 100644 --- a/src/include/fmgr.h +++ b/src/include/fmgr.h @@ -797,4 +797,10 @@ extern PGDLLIMPORT fmgr_hook_type fmgr_hook; #define FmgrHookIsNeeded(fn_oid) \ (!needs_fmgr_hook ? false : (*needs_fmgr_hook)(fn_oid)) + + +/* download_extension_file_hook(filename, is_library) */ +typedef bool (*download_extension_file_hook_type) (const char *, bool); +extern PGDLLIMPORT download_extension_file_hook_type download_extension_file_hook; + #endif /* FMGR_H */ diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 14bd574fc24..95d19a761fc 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -107,6 +107,10 @@ extern PGDLLIMPORT volatile uint32 CritSectionCount; /* in tcop/postgres.c */ extern void ProcessInterrupts(void); +/* Callback called by ProcessInterrupts() in a loop for as long as it returns true.
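The redo_read_buffer_filter pointer at the top of this hunk lets a WAL-redo helper skip blocks it is not interested in while replaying a record. A sketch of such a filter, restricted to one target block, could look like this (the target_* variables and installation are hypothetical):

/* Hypothetical sketch: redo only the single block the caller asked about. */
#include "postgres.h"
#include "access/xlogreader.h"
#include "access/xlogutils.h"
#include "storage/relfilelocator.h"

static RelFileLocator target_rlocator;
static ForkNumber	target_forknum;
static BlockNumber	target_blkno;

static bool
skip_unwanted_blocks(XLogReaderState *record, uint8 block_id)
{
	RelFileLocator rlocator;
	ForkNumber	forknum;
	BlockNumber blkno;

	if (!XLogRecGetBlockTagExtended(record, block_id,
									&rlocator, &forknum, &blkno, NULL))
		return true;			/* no such block reference: nothing to redo */

	/* Returning true means "skip redo" for every block except the target. */
	return !(RelFileLocatorEquals(rlocator, target_rlocator) &&
			 forknum == target_forknum &&
			 blkno == target_blkno);
}

/* Installed once at startup of the redo helper, e.g.: */
/* redo_read_buffer_filter = skip_unwanted_blocks; */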
*/ +typedef bool (*process_interrupts_callback_t)(void); +extern process_interrupts_callback_t ProcessInterruptsCallback; + /* Test whether an interrupt is pending */ #ifndef WIN32 #define INTERRUPTS_PENDING_CONDITION() \ @@ -504,4 +508,7 @@ extern void RestoreClientConnectionInfo(char *conninfo); /* in executor/nodeHash.c */ extern size_t get_hash_memory_limit(void); +/* in storage/buffer/buf_init.c */ +extern bool am_wal_redo_postgres; + #endif /* MISCADMIN_H */ diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 869465d6f80..c467dbf8d70 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -38,6 +38,7 @@ #include "nodes/plannodes.h" #include "nodes/tidbitmap.h" #include "partitioning/partdefs.h" +#include "storage/bufmgr.h" #include "storage/condition_variable.h" #include "utils/hsearch.h" #include "utils/queryenvironment.h" @@ -1698,6 +1699,15 @@ typedef struct ParallelBitmapHeapState char phs_snapshot_data[FLEXIBLE_ARRAY_MEMBER]; } ParallelBitmapHeapState; +typedef struct TBMIteratePrefetchResult +{ + BlockNumber blockno; /* page number containing tuples */ + int ntuples; /* -1 indicates lossy result */ + bool recheck; /* should the tuples be rechecked? */ + /* Note: recheck is always true if ntuples < 0 */ + OffsetNumber offsets[MaxHeapTuplesPerPage]; +} TBMIteratePrefetchResult; + /* ---------------- * BitmapHeapScanState information * @@ -1718,7 +1728,6 @@ typedef struct ParallelBitmapHeapState * pscan_len size of the shared memory for parallel bitmap * initialized is node is ready to iterate * shared_tbmiterator shared iterator - * shared_prefetch_iterator shared iterator for prefetching * pstate shared state for parallel bitmap scan * ---------------- */ @@ -1742,7 +1751,10 @@ typedef struct BitmapHeapScanState Size pscan_len; bool initialized; TBMSharedIterator *shared_tbmiterator; - TBMSharedIterator *shared_prefetch_iterator; + /* parallel worker private ring buffer with prefetch requests: it allows to access prefetch result from the same worker */ + TBMIteratePrefetchResult prefetch_requests[MAX_IO_CONCURRENCY]; + TBMIteratePrefetchResult tbmres_copy; /* copy of current iterator result */ + int prefetch_head; /* head position in ring buffer */ ParallelBitmapHeapState *pstate; } BitmapHeapScanState; diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index 6cf49705d3a..7b17d328fec 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -70,6 +70,10 @@ extern PGDLLIMPORT bool enable_parallel_hash; extern PGDLLIMPORT bool enable_partition_pruning; extern PGDLLIMPORT bool enable_presorted_aggregate; extern PGDLLIMPORT bool enable_async_append; +extern PGDLLIMPORT bool enable_seqscan_prefetch; +extern PGDLLIMPORT bool enable_indexscan_prefetch; +extern PGDLLIMPORT bool enable_indexonlyscan_prefetch; + extern PGDLLIMPORT int constraint_exclusion; extern double index_pages_fetched(double tuples_fetched, BlockNumber pages, diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 174544630e3..a304f7ac15f 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -274,6 +274,9 @@ /* Define if you have a function readline library */ #undef HAVE_LIBREADLINE +/* Define to 1 if you have the `seccomp' library (-lseccomp). */ +#undef HAVE_LIBSECCOMP + /* Define to 1 if you have the `selinux' library (-lselinux). 
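The prefetch_requests array that replaces shared_prefetch_iterator in BitmapHeapScanState above is a per-worker ring buffer; the bump of MAX_IO_CONCURRENCY to 1024 further down keeps its size a power of two so positions can wrap with a mask rather than a division. Purely as an illustration of that indexing (the real enqueue/dequeue logic lives in nodeBitmapHeapscan.c and is not shown in this diff):

/* Illustrative only: ring-buffer indexing over prefetch_requests. */
#include "postgres.h"
#include "nodes/execnodes.h"

#define PREFETCH_RING_MASK	(MAX_IO_CONCURRENCY - 1)	/* valid because 1024 is a power of two */

static inline TBMIteratePrefetchResult *
prefetch_slot(BitmapHeapScanState *node, int distance)
{
	/* the slot 'distance' entries ahead of the consumer position */
	return &node->prefetch_requests[(node->prefetch_head + distance) & PREFETCH_RING_MASK];
}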
*/ #undef HAVE_LIBSELINUX diff --git a/src/include/pg_config_manual.h b/src/include/pg_config_manual.h index a1a93ad706e..2870e31d087 100644 --- a/src/include/pg_config_manual.h +++ b/src/include/pg_config_manual.h @@ -57,7 +57,7 @@ * version. Example: "ACME Postgres/1.2". Note that the string will appear * in a user-facing error message if an ABI mismatch is detected. */ -#define FMGR_ABI_EXTRA "PostgreSQL" +#define FMGR_ABI_EXTRA "Neon Postgres" /* * Maximum number of columns in an index. There is little point in making diff --git a/src/include/replication/message.h b/src/include/replication/message.h index 6ce7f2038b2..ee39b8573df 100644 --- a/src/include/replication/message.h +++ b/src/include/replication/message.h @@ -32,6 +32,9 @@ typedef struct xl_logical_message extern XLogRecPtr LogLogicalMessage(const char *prefix, const char *message, size_t size, bool transactional); +extern void wallog_file(char const* path); +extern void wallog_file_descriptor(char const* path, int fd); + /* RMGR API */ #define XLOG_LOGICAL_MESSAGE 0x00 extern void logicalmsg_redo(XLogReaderState *record); diff --git a/src/include/replication/walsender.h b/src/include/replication/walsender.h index 9df7e50f943..87447c14afa 100644 --- a/src/include/replication/walsender.h +++ b/src/include/replication/walsender.h @@ -12,6 +12,7 @@ #ifndef _WALSENDER_H #define _WALSENDER_H +#include "access/xlog.h" #include /* @@ -48,6 +49,25 @@ extern void WalSndWaitStopping(void); extern void HandleWalSndInitStopping(void); extern void WalSndRqstFileReload(void); +/* + * Hook to check for WAL receiving backpressure. + * Return value in microseconds + */ +extern uint64 (*delay_backend_us)(void); + +/* expose these so that they can be reused by the neon walproposer extension */ +extern void LagTrackerWrite(XLogRecPtr lsn, TimestampTz local_flush_time); +extern TimeOffset LagTrackerRead(int head, XLogRecPtr lsn, TimestampTz now); +extern void ProcessStandbyReply(XLogRecPtr writePtr, XLogRecPtr flushPtr, + XLogRecPtr applyPtr, TimestampTz replyTime, + bool replyRequested); +extern void PhysicalConfirmReceivedLocation(XLogRecPtr lsn); +extern void ProcessStandbyHSFeedback(TimestampTz replyTime, + TransactionId feedbackXmin, + uint32 feedbackEpoch, + TransactionId feedbackCatalogXmin, + uint32 feedbackCatalogEpoch); + /* * Remember that we want to wakeup walsenders later * diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index 98cd2499098..4e412727d7f 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -310,6 +310,8 @@ extern PGDLLIMPORT BufferDescPadded *BufferDescriptors; extern PGDLLIMPORT ConditionVariableMinimallyPadded *BufferIOCVArray; extern PGDLLIMPORT WritebackContext BackendWritebackContext; +extern Buffer wal_redo_buffer; + /* in localbuf.c */ extern PGDLLIMPORT BufferDesc *LocalBufferDescriptors; diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index b379c76e273..6727de5d0a0 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -137,6 +137,8 @@ extern PGDLLIMPORT int checkpoint_flush_after; extern PGDLLIMPORT int backend_flush_after; extern PGDLLIMPORT int bgwriter_flush_after; +extern bool zenith_test_evict; + /* in buf_init.c */ extern PGDLLIMPORT char *BufferBlocks; @@ -145,8 +147,8 @@ extern PGDLLIMPORT int NLocBuffer; extern PGDLLIMPORT Block *LocalBufferBlockPointers; extern PGDLLIMPORT int32 *LocalRefCount; -/* upper limit for effective_io_concurrency */ -#define MAX_IO_CONCURRENCY 
1000 +/* upper limit for effective_io_concurrency (better to be a power of 2) */ +#define MAX_IO_CONCURRENCY 1024 /* special block number for ReadBuffer() */ #define P_NEW InvalidBlockNumber /* grow the file to get a new page */ diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index a9a179aabac..0200b49b598 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -18,6 +18,14 @@ #include "storage/block.h" #include "storage/relfilelocator.h" +struct f_smgr; + +/* + * Neon: extended SMGR API. + * This define can be used by extensions to determine that they are built for Neon. + */ +#define NEON_SMGR 1 + /* * smgr.c maintains a table of SMgrRelation objects, which are essentially * cached file handles. An SMgrRelation is created (if not already present) @@ -41,6 +49,9 @@ typedef struct SMgrRelationData /* rlocator is the hashtable lookup key, so it must be first! */ RelFileLocatorBackend smgr_rlocator; /* relation physical identifier */ + /* copy of pg_class.relpersistence, or 0 if not known */ + char smgr_relpersistence; + /* pointer to owning pointer, or NULL if none */ struct SMgrRelationData **smgr_owner; @@ -59,7 +70,7 @@ typedef struct SMgrRelationData * Fields below here are intended to be private to smgr.c and its * submodules. Do not touch them from elsewhere. */ - int smgr_which; /* storage manager selector */ + const struct f_smgr *smgr; /* * for md.c; per-fork arrays of the number of open segments @@ -77,8 +88,71 @@ typedef SMgrRelationData *SMgrRelation; #define SmgrIsTemp(smgr) \ RelFileLocatorBackendIsTemp((smgr)->smgr_rlocator) +/* + * This struct of function pointers defines the API between smgr.c and + * any individual storage manager module. Note that smgr subfunctions are + * generally expected to report problems via elog(ERROR). An exception is + * that smgr_unlink should use elog(WARNING), rather than erroring out, + * because we normally unlink relations during post-commit/abort cleanup, + * and so it's too late to raise an error. Also, various conditions that + * would normally be errors should be allowed during bootstrap and/or WAL + * recovery --- see comments in md.c for details.
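The f_smgr table of callbacks defined in the hunk that follows, together with smgr_hook and smgr_standard(), is the extension point that lets an alternative storage manager be substituted while md.c stays in place for everything else. A hedged sketch of the extension side (the callback table is left empty here and the selection rule is only an example):

/* Hypothetical extension code: choosing an smgr implementation per relation. */
#include "postgres.h"
#include "storage/backendid.h"
#include "storage/smgr.h"

static smgr_hook_type prev_smgr_hook = NULL;
static f_smgr custom_smgr_functions;	/* a real extension fills this with its smgr_* callbacks */

static const f_smgr *
choose_smgr(BackendId backend, RelFileLocator rlocator)
{
	/* Example rule: keep backend-local (temporary) relations on md.c. */
	if (backend != InvalidBackendId)
		return prev_smgr_hook ? prev_smgr_hook(backend, rlocator)
							  : smgr_standard(backend, rlocator);

	return &custom_smgr_functions;
}

void
_PG_init(void)
{
	prev_smgr_hook = smgr_hook;
	smgr_hook = choose_smgr;
}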
+ */ +typedef struct f_smgr +{ + void (*smgr_init) (void); /* may be NULL */ + void (*smgr_shutdown) (void); /* may be NULL */ + void (*smgr_open) (SMgrRelation reln); + void (*smgr_close) (SMgrRelation reln, ForkNumber forknum); + void (*smgr_create) (SMgrRelation reln, ForkNumber forknum, + bool isRedo); + bool (*smgr_exists) (SMgrRelation reln, ForkNumber forknum); + void (*smgr_unlink) (RelFileLocatorBackend rlocator, ForkNumber forknum, + bool isRedo); + void (*smgr_extend) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, const void *buffer, bool skipFsync); + void (*smgr_zeroextend) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, int nblocks, + bool skipFsync); + bool (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum); + void (*smgr_read) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, void *buffer); + void (*smgr_write) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, const void *buffer, bool skipFsync); + void (*smgr_writeback) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks); + BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum); + void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum, + BlockNumber nblocks); + void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum); + + void (*smgr_start_unlogged_build) (SMgrRelation reln); + void (*smgr_finish_unlogged_build_phase_1) (SMgrRelation reln); + void (*smgr_end_unlogged_build) (SMgrRelation reln); + + int (*smgr_read_slru_segment) (SMgrRelation reln, const char *path, int segno, void* buffer); +} f_smgr; + +typedef void (*smgr_init_hook_type) (void); +typedef void (*smgr_shutdown_hook_type) (void); +extern PGDLLIMPORT smgr_init_hook_type smgr_init_hook; +extern PGDLLIMPORT smgr_shutdown_hook_type smgr_shutdown_hook; +extern void smgr_init_standard(void); +extern void smgr_shutdown_standard(void); + +// Alternative implementation of calculate_database_size() +typedef int64 (*dbsize_hook_type) (Oid dbOid); +extern PGDLLIMPORT dbsize_hook_type dbsize_hook; + +typedef const f_smgr *(*smgr_hook_type) (BackendId backend, RelFileLocator rlocator); +extern PGDLLIMPORT smgr_hook_type smgr_hook; +extern const f_smgr *smgr_standard(BackendId backend, RelFileLocator rlocator); + +extern const f_smgr *smgr(BackendId backend, RelFileLocator rlocator); + extern void smgrinit(void); -extern SMgrRelation smgropen(RelFileLocator rlocator, BackendId backend); +extern SMgrRelation smgropen(RelFileLocator rlocator, BackendId backend, char relpersistence); extern bool smgrexists(SMgrRelation reln, ForkNumber forknum); extern void smgrsetowner(SMgrRelation *owner, SMgrRelation reln); extern void smgrclearowner(SMgrRelation *owner, SMgrRelation reln); @@ -110,4 +184,11 @@ extern void smgrimmedsync(SMgrRelation reln, ForkNumber forknum); extern void AtEOXact_SMgr(void); extern bool ProcessBarrierSmgrRelease(void); +/* Neon: Change relpersistence for unlogged index builds */ +extern void smgr_start_unlogged_build(SMgrRelation reln); +extern void smgr_finish_unlogged_build_phase_1(SMgrRelation reln); +extern void smgr_end_unlogged_build(SMgrRelation reln); + +extern int smgr_read_slru_segment(SMgrRelation reln, const char *path, int segno, void* buffer); + #endif /* SMGR_H */ diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h index 1426a353cd0..c4174da1b1b 100644 --- a/src/include/utils/rel.h +++ b/src/include/utils/rel.h @@ -572,7 +572,7 @@ static inline SMgrRelation RelationGetSmgr(Relation 
rel) { if (unlikely(rel->rd_smgr == NULL)) - smgrsetowner(&(rel->rd_smgr), smgropen(rel->rd_locator, rel->rd_backend)); + smgrsetowner(&(rel->rd_smgr), smgropen(rel->rd_locator, rel->rd_backend, rel->rd_rel->relpersistence)); return rel->rd_smgr; } diff --git a/src/include/utils/wait_event.h b/src/include/utils/wait_event.h index 2adc5df2e79..a02c162b27e 100644 --- a/src/include/utils/wait_event.h +++ b/src/include/utils/wait_event.h @@ -150,7 +150,8 @@ typedef enum WAIT_EVENT_REGISTER_SYNC_REQUEST, WAIT_EVENT_SPIN_DELAY, WAIT_EVENT_VACUUM_DELAY, - WAIT_EVENT_VACUUM_TRUNCATE + WAIT_EVENT_VACUUM_TRUNCATE, + WAIT_EVENT_BACK_PRESSURE } WaitEventTimeout; /* ---------- diff --git a/src/test/regress/expected/sequence.out b/src/test/regress/expected/sequence.out index 7cb2f7cc02b..f02f0205427 100644 --- a/src/test/regress/expected/sequence.out +++ b/src/test/regress/expected/sequence.out @@ -260,7 +260,7 @@ SELECT nextval('foo_seq_new'); -- log_cnt can be higher if there is a checkpoint just at the right -- time, so just test for the expected range -SELECT last_value, log_cnt IN (31, 32) AS log_cnt_ok, is_called FROM foo_seq_new; +SELECT last_value, log_cnt IN (0, 31, 32) AS log_cnt_ok, is_called FROM foo_seq_new; last_value | log_cnt_ok | is_called ------------+------------+----------- 2 | t | t diff --git a/src/test/regress/expected/spgist.out b/src/test/regress/expected/spgist.out index 2e911285600..c371e04a795 100644 --- a/src/test/regress/expected/spgist.out +++ b/src/test/regress/expected/spgist.out @@ -94,3 +94,6 @@ select box(point(i,j)) from generate_series(1,100,5) i, generate_series(1,10,5) j; -- leave this table around, to help in testing dump/restore +-- NEON: In Neon unlogged tables are wiped away on node restart +-- so drop the table to keep Neon tests clean. +drop table spgist_unlogged_tbl; diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index 001c6e7eb9d..10bf2101f00 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -118,7 +118,9 @@ select name, setting from pg_settings where name like 'enable%'; enable_hashjoin | on enable_incremental_sort | on enable_indexonlyscan | on + enable_indexonlyscan_prefetch | on enable_indexscan | on + enable_indexscan_prefetch | on enable_material | on enable_memoize | on enable_mergejoin | on @@ -130,9 +132,10 @@ select name, setting from pg_settings where name like 'enable%'; enable_partitionwise_join | off enable_presorted_aggregate | on enable_seqscan | on + enable_seqscan_prefetch | on enable_sort | on enable_tidscan | on -(21 rows) +(24 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. 
We can't test their contents in any great detail diff --git a/src/test/regress/expected/tablespace_1.out b/src/test/regress/expected/tablespace_1.out new file mode 100644 index 00000000000..5b9cefa6057 --- /dev/null +++ b/src/test/regress/expected/tablespace_1.out @@ -0,0 +1,974 @@ +-- relative tablespace locations are not allowed +CREATE TABLESPACE regress_tblspace LOCATION 'relative'; -- fail +ERROR: tablespace location must be an absolute path +-- empty tablespace locations are not usually allowed +CREATE TABLESPACE regress_tblspace LOCATION ''; -- fail +ERROR: tablespace location must be an absolute path +-- as a special developer-only option to allow us to use tablespaces +-- with streaming replication on the same server, an empty location +-- can be allowed as a way to say that the tablespace should be created +-- as a directory in pg_tblspc, rather than being a symlink +SET allow_in_place_tablespaces = true; +-- create a tablespace using WITH clause +CREATE TABLESPACE regress_tblspacewith LOCATION '' WITH (some_nonexistent_parameter = true); -- fail +ERROR: unrecognized parameter "some_nonexistent_parameter" +CREATE TABLESPACE regress_tblspacewith LOCATION '' WITH (random_page_cost = 3.0); -- ok +-- check to see the parameter was used +SELECT spcoptions FROM pg_tablespace WHERE spcname = 'regress_tblspacewith'; + spcoptions +------------------------ + {random_page_cost=3.0} +(1 row) + +-- drop the tablespace so we can re-use the location +DROP TABLESPACE regress_tblspacewith; +-- This returns a relative path as of an effect of allow_in_place_tablespaces, +-- masking the tablespace OID used in the path name. +SELECT regexp_replace(pg_tablespace_location(oid), '(pg_tblspc)/(\d+)', '\1/NNN') + FROM pg_tablespace WHERE spcname = 'regress_tblspace'; + regexp_replace +---------------- + pg_tblspc/NNN +(1 row) + +-- try setting and resetting some properties for the new tablespace +ALTER TABLESPACE regress_tblspace SET (random_page_cost = 1.0, seq_page_cost = 1.1); +ALTER TABLESPACE regress_tblspace SET (some_nonexistent_parameter = true); -- fail +ERROR: unrecognized parameter "some_nonexistent_parameter" +ALTER TABLESPACE regress_tblspace RESET (random_page_cost = 2.0); -- fail +ERROR: RESET must not include values for parameters +ALTER TABLESPACE regress_tblspace RESET (random_page_cost, effective_io_concurrency); -- ok +-- REINDEX (TABLESPACE) +-- catalogs and system tablespaces +-- system catalog, fail +REINDEX (TABLESPACE regress_tblspace) TABLE pg_am; +ERROR: cannot move system relation "pg_am_name_index" +REINDEX (TABLESPACE regress_tblspace) TABLE CONCURRENTLY pg_am; +ERROR: cannot reindex system catalogs concurrently +-- shared catalog, fail +REINDEX (TABLESPACE regress_tblspace) TABLE pg_authid; +ERROR: cannot move system relation "pg_authid_rolname_index" +REINDEX (TABLESPACE regress_tblspace) TABLE CONCURRENTLY pg_authid; +ERROR: cannot reindex system catalogs concurrently +-- toast relations, fail +REINDEX (TABLESPACE regress_tblspace) INDEX pg_toast.pg_toast_1260_index; +ERROR: cannot move system relation "pg_toast_1260_index" +REINDEX (TABLESPACE regress_tblspace) INDEX CONCURRENTLY pg_toast.pg_toast_1260_index; +ERROR: cannot reindex system catalogs concurrently +REINDEX (TABLESPACE regress_tblspace) TABLE pg_toast.pg_toast_1260; +ERROR: cannot move system relation "pg_toast_1260_index" +REINDEX (TABLESPACE regress_tblspace) TABLE CONCURRENTLY pg_toast.pg_toast_1260; +ERROR: cannot reindex system catalogs concurrently +-- system catalog, fail +REINDEX (TABLESPACE pg_global) 
TABLE pg_authid; +ERROR: cannot move system relation "pg_authid_rolname_index" +REINDEX (TABLESPACE pg_global) TABLE CONCURRENTLY pg_authid; +ERROR: cannot reindex system catalogs concurrently +-- table with toast relation +CREATE TABLE regress_tblspace_test_tbl (num1 bigint, num2 double precision, t text); +INSERT INTO regress_tblspace_test_tbl (num1, num2, t) + SELECT round(random()*100), random(), 'text' + FROM generate_series(1, 10) s(i); +CREATE INDEX regress_tblspace_test_tbl_idx ON regress_tblspace_test_tbl (num1); +-- move to global tablespace, fail +REINDEX (TABLESPACE pg_global) INDEX regress_tblspace_test_tbl_idx; +ERROR: only shared relations can be placed in pg_global tablespace +REINDEX (TABLESPACE pg_global) INDEX CONCURRENTLY regress_tblspace_test_tbl_idx; +ERROR: cannot move non-shared relation to tablespace "pg_global" +-- check transactional behavior of REINDEX (TABLESPACE) +BEGIN; +REINDEX (TABLESPACE regress_tblspace) INDEX regress_tblspace_test_tbl_idx; +REINDEX (TABLESPACE regress_tblspace) TABLE regress_tblspace_test_tbl; +ROLLBACK; +-- no relation moved to the new tablespace +SELECT c.relname FROM pg_class c, pg_tablespace s + WHERE c.reltablespace = s.oid AND s.spcname = 'regress_tblspace'; + relname +--------- +(0 rows) + +-- check that all indexes are moved to a new tablespace with different +-- relfilenode. +-- Save first the existing relfilenode for the toast and main relations. +SELECT relfilenode as main_filenode FROM pg_class + WHERE relname = 'regress_tblspace_test_tbl_idx' \gset +SELECT relfilenode as toast_filenode FROM pg_class + WHERE oid = + (SELECT i.indexrelid + FROM pg_class c, + pg_index i + WHERE i.indrelid = c.reltoastrelid AND + c.relname = 'regress_tblspace_test_tbl') \gset +REINDEX (TABLESPACE regress_tblspace) TABLE regress_tblspace_test_tbl; +SELECT c.relname FROM pg_class c, pg_tablespace s + WHERE c.reltablespace = s.oid AND s.spcname = 'regress_tblspace' + ORDER BY c.relname; + relname +------------------------------- + regress_tblspace_test_tbl_idx +(1 row) + +ALTER TABLE regress_tblspace_test_tbl SET TABLESPACE regress_tblspace; +ALTER TABLE regress_tblspace_test_tbl SET TABLESPACE pg_default; +SELECT c.relname FROM pg_class c, pg_tablespace s + WHERE c.reltablespace = s.oid AND s.spcname = 'regress_tblspace' + ORDER BY c.relname; + relname +------------------------------- + regress_tblspace_test_tbl_idx +(1 row) + +-- Move back to the default tablespace. 
+ALTER INDEX regress_tblspace_test_tbl_idx SET TABLESPACE pg_default; +SELECT c.relname FROM pg_class c, pg_tablespace s + WHERE c.reltablespace = s.oid AND s.spcname = 'regress_tblspace' + ORDER BY c.relname; + relname +--------- +(0 rows) + +REINDEX (TABLESPACE regress_tblspace, CONCURRENTLY) TABLE regress_tblspace_test_tbl; +SELECT c.relname FROM pg_class c, pg_tablespace s + WHERE c.reltablespace = s.oid AND s.spcname = 'regress_tblspace' + ORDER BY c.relname; + relname +------------------------------- + regress_tblspace_test_tbl_idx +(1 row) + +SELECT relfilenode = :main_filenode AS main_same FROM pg_class + WHERE relname = 'regress_tblspace_test_tbl_idx'; + main_same +----------- + f +(1 row) + +SELECT relfilenode = :toast_filenode as toast_same FROM pg_class + WHERE oid = + (SELECT i.indexrelid + FROM pg_class c, + pg_index i + WHERE i.indrelid = c.reltoastrelid AND + c.relname = 'regress_tblspace_test_tbl'); + toast_same +------------ + f +(1 row) + +DROP TABLE regress_tblspace_test_tbl; +-- REINDEX (TABLESPACE) with partitions +-- Create a partition tree and check the set of relations reindexed +-- with their new tablespace. +CREATE TABLE tbspace_reindex_part (c1 int, c2 int) PARTITION BY RANGE (c1); +CREATE TABLE tbspace_reindex_part_0 PARTITION OF tbspace_reindex_part + FOR VALUES FROM (0) TO (10) PARTITION BY list (c2); +CREATE TABLE tbspace_reindex_part_0_1 PARTITION OF tbspace_reindex_part_0 + FOR VALUES IN (1); +CREATE TABLE tbspace_reindex_part_0_2 PARTITION OF tbspace_reindex_part_0 + FOR VALUES IN (2); +-- This partitioned table will have no partitions. +CREATE TABLE tbspace_reindex_part_10 PARTITION OF tbspace_reindex_part + FOR VALUES FROM (10) TO (20) PARTITION BY list (c2); +-- Create some partitioned indexes +CREATE INDEX tbspace_reindex_part_index ON ONLY tbspace_reindex_part (c1); +CREATE INDEX tbspace_reindex_part_index_0 ON ONLY tbspace_reindex_part_0 (c1); +ALTER INDEX tbspace_reindex_part_index ATTACH PARTITION tbspace_reindex_part_index_0; +-- This partitioned index will have no partitions. +CREATE INDEX tbspace_reindex_part_index_10 ON ONLY tbspace_reindex_part_10 (c1); +ALTER INDEX tbspace_reindex_part_index ATTACH PARTITION tbspace_reindex_part_index_10; +CREATE INDEX tbspace_reindex_part_index_0_1 ON ONLY tbspace_reindex_part_0_1 (c1); +ALTER INDEX tbspace_reindex_part_index_0 ATTACH PARTITION tbspace_reindex_part_index_0_1; +CREATE INDEX tbspace_reindex_part_index_0_2 ON ONLY tbspace_reindex_part_0_2 (c1); +ALTER INDEX tbspace_reindex_part_index_0 ATTACH PARTITION tbspace_reindex_part_index_0_2; +SELECT relid, parentrelid, level FROM pg_partition_tree('tbspace_reindex_part_index') + ORDER BY relid, level; + relid | parentrelid | level +--------------------------------+------------------------------+------- + tbspace_reindex_part_index | | 0 + tbspace_reindex_part_index_0 | tbspace_reindex_part_index | 1 + tbspace_reindex_part_index_10 | tbspace_reindex_part_index | 1 + tbspace_reindex_part_index_0_1 | tbspace_reindex_part_index_0 | 2 + tbspace_reindex_part_index_0_2 | tbspace_reindex_part_index_0 | 2 +(5 rows) + +-- Track the original tablespace, relfilenode and OID of each index +-- in the tree. +CREATE TEMP TABLE reindex_temp_before AS + SELECT oid, relname, relfilenode, reltablespace + FROM pg_class + WHERE relname ~ 'tbspace_reindex_part_index'; +REINDEX (TABLESPACE regress_tblspace, CONCURRENTLY) TABLE tbspace_reindex_part; +-- REINDEX CONCURRENTLY changes the OID of the old relation, hence a check +-- based on the relation name below. 
+SELECT b.relname, + CASE WHEN a.relfilenode = b.relfilenode THEN 'relfilenode is unchanged' + ELSE 'relfilenode has changed' END AS filenode, + CASE WHEN a.reltablespace = b.reltablespace THEN 'reltablespace is unchanged' + ELSE 'reltablespace has changed' END AS tbspace + FROM reindex_temp_before b JOIN pg_class a ON b.relname = a.relname + ORDER BY 1; + relname | filenode | tbspace +--------------------------------+--------------------------+---------------------------- + tbspace_reindex_part_index | relfilenode is unchanged | reltablespace is unchanged + tbspace_reindex_part_index_0 | relfilenode is unchanged | reltablespace is unchanged + tbspace_reindex_part_index_0_1 | relfilenode has changed | reltablespace has changed + tbspace_reindex_part_index_0_2 | relfilenode has changed | reltablespace has changed + tbspace_reindex_part_index_10 | relfilenode is unchanged | reltablespace is unchanged +(5 rows) + +DROP TABLE tbspace_reindex_part; +-- create a schema we can use +CREATE SCHEMA testschema; +-- try a table +CREATE TABLE testschema.foo (i int) TABLESPACE regress_tblspace; +SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c + where c.reltablespace = t.oid AND c.relname = 'foo'; + relname | spcname +---------+------------------ + foo | regress_tblspace +(1 row) + +INSERT INTO testschema.foo VALUES(1); +INSERT INTO testschema.foo VALUES(2); +-- tables from dynamic sources +CREATE TABLE testschema.asselect TABLESPACE regress_tblspace AS SELECT 1; +SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c + where c.reltablespace = t.oid AND c.relname = 'asselect'; + relname | spcname +----------+------------------ + asselect | regress_tblspace +(1 row) + +PREPARE selectsource(int) AS SELECT $1; +CREATE TABLE testschema.asexecute TABLESPACE regress_tblspace + AS EXECUTE selectsource(2); +SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c + where c.reltablespace = t.oid AND c.relname = 'asexecute'; + relname | spcname +-----------+------------------ + asexecute | regress_tblspace +(1 row) + +-- index +CREATE INDEX foo_idx on testschema.foo(i) TABLESPACE regress_tblspace; +SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c + where c.reltablespace = t.oid AND c.relname = 'foo_idx'; + relname | spcname +---------+------------------ + foo_idx | regress_tblspace +(1 row) + +-- check \d output +\d testschema.foo + Table "testschema.foo" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + i | integer | | | +Indexes: + "foo_idx" btree (i), tablespace "regress_tblspace" +Tablespace: "regress_tblspace" + +\d testschema.foo_idx + Index "testschema.foo_idx" + Column | Type | Key? 
| Definition +--------+---------+------+------------ + i | integer | yes | i +btree, for table "testschema.foo" +Tablespace: "regress_tblspace" + +-- +-- partitioned table +-- +CREATE TABLE testschema.part (a int) PARTITION BY LIST (a); +SET default_tablespace TO pg_global; +CREATE TABLE testschema.part_1 PARTITION OF testschema.part FOR VALUES IN (1); +ERROR: only shared relations can be placed in pg_global tablespace +RESET default_tablespace; +CREATE TABLE testschema.part_1 PARTITION OF testschema.part FOR VALUES IN (1); +SET default_tablespace TO regress_tblspace; +CREATE TABLE testschema.part_2 PARTITION OF testschema.part FOR VALUES IN (2); +SET default_tablespace TO pg_global; +CREATE TABLE testschema.part_3 PARTITION OF testschema.part FOR VALUES IN (3); +ERROR: only shared relations can be placed in pg_global tablespace +ALTER TABLE testschema.part SET TABLESPACE regress_tblspace; +CREATE TABLE testschema.part_3 PARTITION OF testschema.part FOR VALUES IN (3); +CREATE TABLE testschema.part_4 PARTITION OF testschema.part FOR VALUES IN (4) + TABLESPACE pg_default; +CREATE TABLE testschema.part_56 PARTITION OF testschema.part FOR VALUES IN (5, 6) + PARTITION BY LIST (a); +ALTER TABLE testschema.part SET TABLESPACE pg_default; +CREATE TABLE testschema.part_78 PARTITION OF testschema.part FOR VALUES IN (7, 8) + PARTITION BY LIST (a); +ERROR: only shared relations can be placed in pg_global tablespace +CREATE TABLE testschema.part_910 PARTITION OF testschema.part FOR VALUES IN (9, 10) + PARTITION BY LIST (a) TABLESPACE regress_tblspace; +RESET default_tablespace; +CREATE TABLE testschema.part_78 PARTITION OF testschema.part FOR VALUES IN (7, 8) + PARTITION BY LIST (a); +SELECT relname, spcname FROM pg_catalog.pg_class c + JOIN pg_catalog.pg_namespace n ON (c.relnamespace = n.oid) + LEFT JOIN pg_catalog.pg_tablespace t ON c.reltablespace = t.oid + where c.relname LIKE 'part%' AND n.nspname = 'testschema' order by relname; + relname | spcname +----------+------------------ + part | + part_1 | + part_2 | regress_tblspace + part_3 | regress_tblspace + part_4 | + part_56 | regress_tblspace + part_78 | + part_910 | regress_tblspace +(8 rows) + +RESET default_tablespace; +DROP TABLE testschema.part; +-- partitioned index +CREATE TABLE testschema.part (a int) PARTITION BY LIST (a); +CREATE TABLE testschema.part1 PARTITION OF testschema.part FOR VALUES IN (1); +CREATE INDEX part_a_idx ON testschema.part (a) TABLESPACE regress_tblspace; +CREATE TABLE testschema.part2 PARTITION OF testschema.part FOR VALUES IN (2); +SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c + where c.reltablespace = t.oid AND c.relname LIKE 'part%_idx' ORDER BY relname; + relname | spcname +-------------+------------------ + part1_a_idx | regress_tblspace + part2_a_idx | regress_tblspace + part_a_idx | regress_tblspace +(3 rows) + +\d testschema.part + Partitioned table "testschema.part" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | +Partition key: LIST (a) +Indexes: + "part_a_idx" btree (a), tablespace "regress_tblspace" +Number of partitions: 2 (Use \d+ to list them.) 
+ +\d+ testschema.part + Partitioned table "testschema.part" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | integer | | | | plain | | +Partition key: LIST (a) +Indexes: + "part_a_idx" btree (a), tablespace "regress_tblspace" +Partitions: testschema.part1 FOR VALUES IN (1), + testschema.part2 FOR VALUES IN (2) + +\d testschema.part1 + Table "testschema.part1" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | +Partition of: testschema.part FOR VALUES IN (1) +Indexes: + "part1_a_idx" btree (a), tablespace "regress_tblspace" + +\d+ testschema.part1 + Table "testschema.part1" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | integer | | | | plain | | +Partition of: testschema.part FOR VALUES IN (1) +Partition constraint: ((a IS NOT NULL) AND (a = 1)) +Indexes: + "part1_a_idx" btree (a), tablespace "regress_tblspace" + +\d testschema.part_a_idx +Partitioned index "testschema.part_a_idx" + Column | Type | Key? | Definition +--------+---------+------+------------ + a | integer | yes | a +btree, for table "testschema.part" +Number of partitions: 2 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +\d+ testschema.part_a_idx + Partitioned index "testschema.part_a_idx" + Column | Type | Key? | Definition | Storage | Stats target +--------+---------+------+------------+---------+-------------- + a | integer | yes | a | plain | +btree, for table "testschema.part" +Partitions: testschema.part1_a_idx, + testschema.part2_a_idx +Tablespace: "regress_tblspace" + +-- partitioned rels cannot specify the default tablespace. 
These fail: +CREATE TABLE testschema.dflt (a int PRIMARY KEY) PARTITION BY LIST (a) TABLESPACE pg_default; +ERROR: cannot specify default tablespace for partitioned relations +CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX TABLESPACE pg_default) PARTITION BY LIST (a); +ERROR: cannot specify default tablespace for partitioned relations +SET default_tablespace TO 'pg_default'; +CREATE TABLE testschema.dflt (a int PRIMARY KEY) PARTITION BY LIST (a) TABLESPACE regress_tblspace; +ERROR: cannot specify default tablespace for partitioned relations +CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX TABLESPACE regress_tblspace) PARTITION BY LIST (a); +ERROR: cannot specify default tablespace for partitioned relations +-- but these work: +CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX TABLESPACE regress_tblspace) PARTITION BY LIST (a) TABLESPACE regress_tblspace; +SET default_tablespace TO ''; +CREATE TABLE testschema.dflt2 (a int PRIMARY KEY) PARTITION BY LIST (a); +DROP TABLE testschema.dflt, testschema.dflt2; +-- check that default_tablespace doesn't affect ALTER TABLE index rebuilds +CREATE TABLE testschema.test_default_tab(id bigint) TABLESPACE regress_tblspace; +INSERT INTO testschema.test_default_tab VALUES (1); +CREATE INDEX test_index1 on testschema.test_default_tab (id); +CREATE INDEX test_index2 on testschema.test_default_tab (id) TABLESPACE regress_tblspace; +ALTER TABLE testschema.test_default_tab ADD CONSTRAINT test_index3 PRIMARY KEY (id); +ALTER TABLE testschema.test_default_tab ADD CONSTRAINT test_index4 UNIQUE (id) USING INDEX TABLESPACE regress_tblspace; +\d testschema.test_index1 + Index "testschema.test_index1" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +btree, for table "testschema.test_default_tab" + +\d testschema.test_index2 + Index "testschema.test_index2" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_index3 + Index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab" + +\d testschema.test_index4 + Index "testschema.test_index4" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +-- use a custom tablespace for default_tablespace +SET default_tablespace TO regress_tblspace; +-- tablespace should not change if no rewrite +ALTER TABLE testschema.test_default_tab ALTER id TYPE bigint; +\d testschema.test_index1 + Index "testschema.test_index1" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +btree, for table "testschema.test_default_tab" + +\d testschema.test_index2 + Index "testschema.test_index2" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_index3 + Index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab" + +\d testschema.test_index4 + Index "testschema.test_index4" + Column | Type | Key? 
| Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +SELECT * FROM testschema.test_default_tab; + id +---- + 1 +(1 row) + +-- tablespace should not change even if there is an index rewrite +ALTER TABLE testschema.test_default_tab ALTER id TYPE int; +\d testschema.test_index1 + Index "testschema.test_index1" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +btree, for table "testschema.test_default_tab" + +\d testschema.test_index2 + Index "testschema.test_index2" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_index3 + Index "testschema.test_index3" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +primary key, btree, for table "testschema.test_default_tab" + +\d testschema.test_index4 + Index "testschema.test_index4" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +unique, btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +SELECT * FROM testschema.test_default_tab; + id +---- + 1 +(1 row) + +-- now use the default tablespace for default_tablespace +SET default_tablespace TO ''; +-- tablespace should not change if no rewrite +ALTER TABLE testschema.test_default_tab ALTER id TYPE int; +\d testschema.test_index1 + Index "testschema.test_index1" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +btree, for table "testschema.test_default_tab" + +\d testschema.test_index2 + Index "testschema.test_index2" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_index3 + Index "testschema.test_index3" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +primary key, btree, for table "testschema.test_default_tab" + +\d testschema.test_index4 + Index "testschema.test_index4" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +unique, btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +-- tablespace should not change even if there is an index rewrite +ALTER TABLE testschema.test_default_tab ALTER id TYPE bigint; +\d testschema.test_index1 + Index "testschema.test_index1" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +btree, for table "testschema.test_default_tab" + +\d testschema.test_index2 + Index "testschema.test_index2" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_index3 + Index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab" + +\d testschema.test_index4 + Index "testschema.test_index4" + Column | Type | Key? 
| Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +DROP TABLE testschema.test_default_tab; +-- check that default_tablespace doesn't affect ALTER TABLE index rebuilds +-- (this time with a partitioned table) +CREATE TABLE testschema.test_default_tab_p(id bigint, val bigint) + PARTITION BY LIST (id) TABLESPACE regress_tblspace; +CREATE TABLE testschema.test_default_tab_p1 PARTITION OF testschema.test_default_tab_p + FOR VALUES IN (1); +INSERT INTO testschema.test_default_tab_p VALUES (1); +CREATE INDEX test_index1 on testschema.test_default_tab_p (val); +CREATE INDEX test_index2 on testschema.test_default_tab_p (val) TABLESPACE regress_tblspace; +ALTER TABLE testschema.test_default_tab_p ADD CONSTRAINT test_index3 PRIMARY KEY (id); +ALTER TABLE testschema.test_default_tab_p ADD CONSTRAINT test_index4 UNIQUE (id) USING INDEX TABLESPACE regress_tblspace; +\d testschema.test_index1 +Partitioned index "testschema.test_index1" + Column | Type | Key? | Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index2 +Partitioned index "testschema.test_index2" + Column | Type | Key? | Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +\d testschema.test_index3 +Partitioned index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index4 +Partitioned index "testschema.test_index4" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +-- use a custom tablespace for default_tablespace +SET default_tablespace TO regress_tblspace; +-- tablespace should not change if no rewrite +ALTER TABLE testschema.test_default_tab_p ALTER val TYPE bigint; +\d testschema.test_index1 +Partitioned index "testschema.test_index1" + Column | Type | Key? | Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index2 +Partitioned index "testschema.test_index2" + Column | Type | Key? | Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +\d testschema.test_index3 +Partitioned index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index4 +Partitioned index "testschema.test_index4" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) 
+Tablespace: "regress_tblspace" + +SELECT * FROM testschema.test_default_tab_p; + id | val +----+----- + 1 | +(1 row) + +-- tablespace should not change even if there is an index rewrite +ALTER TABLE testschema.test_default_tab_p ALTER val TYPE int; +\d testschema.test_index1 +Partitioned index "testschema.test_index1" + Column | Type | Key? | Definition +--------+---------+------+------------ + val | integer | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index2 +Partitioned index "testschema.test_index2" + Column | Type | Key? | Definition +--------+---------+------+------------ + val | integer | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +\d testschema.test_index3 +Partitioned index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index4 +Partitioned index "testschema.test_index4" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +SELECT * FROM testschema.test_default_tab_p; + id | val +----+----- + 1 | +(1 row) + +-- now use the default tablespace for default_tablespace +SET default_tablespace TO ''; +-- tablespace should not change if no rewrite +ALTER TABLE testschema.test_default_tab_p ALTER val TYPE int; +\d testschema.test_index1 +Partitioned index "testschema.test_index1" + Column | Type | Key? | Definition +--------+---------+------+------------ + val | integer | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index2 +Partitioned index "testschema.test_index2" + Column | Type | Key? | Definition +--------+---------+------+------------ + val | integer | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +\d testschema.test_index3 +Partitioned index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index4 +Partitioned index "testschema.test_index4" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +-- tablespace should not change even if there is an index rewrite +ALTER TABLE testschema.test_default_tab_p ALTER val TYPE bigint; +\d testschema.test_index1 +Partitioned index "testschema.test_index1" + Column | Type | Key? | Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index2 +Partitioned index "testschema.test_index2" + Column | Type | Key? 
| Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +\d testschema.test_index3 +Partitioned index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index4 +Partitioned index "testschema.test_index4" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +DROP TABLE testschema.test_default_tab_p; +-- check that default_tablespace affects index additions in ALTER TABLE +CREATE TABLE testschema.test_tab(id int) TABLESPACE regress_tblspace; +INSERT INTO testschema.test_tab VALUES (1); +SET default_tablespace TO regress_tblspace; +ALTER TABLE testschema.test_tab ADD CONSTRAINT test_tab_unique UNIQUE (id); +SET default_tablespace TO ''; +ALTER TABLE testschema.test_tab ADD CONSTRAINT test_tab_pkey PRIMARY KEY (id); +\d testschema.test_tab_unique + Index "testschema.test_tab_unique" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +unique, btree, for table "testschema.test_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_tab_pkey + Index "testschema.test_tab_pkey" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +primary key, btree, for table "testschema.test_tab" + +SELECT * FROM testschema.test_tab; + id +---- + 1 +(1 row) + +DROP TABLE testschema.test_tab; +-- check that default_tablespace is handled correctly by multi-command +-- ALTER TABLE that includes a tablespace-preserving rewrite +CREATE TABLE testschema.test_tab(a int, b int, c int); +SET default_tablespace TO regress_tblspace; +ALTER TABLE testschema.test_tab ADD CONSTRAINT test_tab_unique UNIQUE (a); +CREATE INDEX test_tab_a_idx ON testschema.test_tab (a); +SET default_tablespace TO ''; +CREATE INDEX test_tab_b_idx ON testschema.test_tab (b); +\d testschema.test_tab_unique + Index "testschema.test_tab_unique" + Column | Type | Key? | Definition +--------+---------+------+------------ + a | integer | yes | a +unique, btree, for table "testschema.test_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_tab_a_idx + Index "testschema.test_tab_a_idx" + Column | Type | Key? | Definition +--------+---------+------+------------ + a | integer | yes | a +btree, for table "testschema.test_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_tab_b_idx + Index "testschema.test_tab_b_idx" + Column | Type | Key? | Definition +--------+---------+------+------------ + b | integer | yes | b +btree, for table "testschema.test_tab" + +ALTER TABLE testschema.test_tab ALTER b TYPE bigint, ADD UNIQUE (c); +\d testschema.test_tab_unique + Index "testschema.test_tab_unique" + Column | Type | Key? | Definition +--------+---------+------+------------ + a | integer | yes | a +unique, btree, for table "testschema.test_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_tab_a_idx + Index "testschema.test_tab_a_idx" + Column | Type | Key? 
| Definition +--------+---------+------+------------ + a | integer | yes | a +btree, for table "testschema.test_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_tab_b_idx + Index "testschema.test_tab_b_idx" + Column | Type | Key? | Definition +--------+--------+------+------------ + b | bigint | yes | b +btree, for table "testschema.test_tab" + +DROP TABLE testschema.test_tab; +-- let's try moving a table from one place to another +CREATE TABLE testschema.atable AS VALUES (1), (2); +CREATE UNIQUE INDEX anindex ON testschema.atable(column1); +ALTER TABLE testschema.atable SET TABLESPACE regress_tblspace; +ALTER INDEX testschema.anindex SET TABLESPACE regress_tblspace; +ALTER INDEX testschema.part_a_idx SET TABLESPACE pg_global; +ERROR: only shared relations can be placed in pg_global tablespace +ALTER INDEX testschema.part_a_idx SET TABLESPACE pg_default; +ALTER INDEX testschema.part_a_idx SET TABLESPACE regress_tblspace; +INSERT INTO testschema.atable VALUES(3); -- ok +INSERT INTO testschema.atable VALUES(1); -- fail (checks index) +ERROR: duplicate key value violates unique constraint "anindex" +DETAIL: Key (column1)=(1) already exists. +SELECT COUNT(*) FROM testschema.atable; -- checks heap + count +------- + 3 +(1 row) + +-- let's try moving a materialized view from one place to another +CREATE MATERIALIZED VIEW testschema.amv AS SELECT * FROM testschema.atable; +ALTER MATERIALIZED VIEW testschema.amv SET TABLESPACE regress_tblspace; +REFRESH MATERIALIZED VIEW testschema.amv; +SELECT COUNT(*) FROM testschema.amv; + count +------- + 3 +(1 row) + +-- Will fail with bad path +CREATE TABLESPACE regress_badspace LOCATION '/no/such/location'; +ERROR: directory "/no/such/location" does not exist +-- No such tablespace +CREATE TABLE bar (i int) TABLESPACE regress_nosuchspace; +ERROR: tablespace "regress_nosuchspace" does not exist +-- Fail, in use for some partitioned object +DROP TABLESPACE regress_tblspace; +ERROR: tablespace "regress_tblspace" cannot be dropped because some objects depend on it +DETAIL: tablespace for index testschema.part_a_idx +ALTER INDEX testschema.part_a_idx SET TABLESPACE pg_default; +-- Fail, not empty +DROP TABLESPACE regress_tblspace; +CREATE ROLE regress_tablespace_user1 login; +CREATE ROLE regress_tablespace_user2 login; +GRANT USAGE ON SCHEMA testschema TO regress_tablespace_user2; +ALTER TABLESPACE regress_tblspace OWNER TO regress_tablespace_user1; +ERROR: tablespace "regress_tblspace" does not exist +CREATE TABLE testschema.tablespace_acl (c int); +-- new owner lacks permission to create this index from scratch +CREATE INDEX k ON testschema.tablespace_acl (c) TABLESPACE regress_tblspace; +ERROR: tablespace "regress_tblspace" does not exist +ALTER TABLE testschema.tablespace_acl OWNER TO regress_tablespace_user2; +SET SESSION ROLE regress_tablespace_user2; +CREATE TABLE tablespace_table (i int) TABLESPACE regress_tblspace; -- fail +ERROR: tablespace "regress_tblspace" does not exist +ALTER TABLE testschema.tablespace_acl ALTER c TYPE bigint; +REINDEX (TABLESPACE regress_tblspace) TABLE tablespace_table; -- fail +ERROR: tablespace "regress_tblspace" does not exist +REINDEX (TABLESPACE regress_tblspace, CONCURRENTLY) TABLE tablespace_table; -- fail +ERROR: tablespace "regress_tblspace" does not exist +RESET ROLE; +ALTER TABLESPACE regress_tblspace RENAME TO regress_tblspace_renamed; +ERROR: tablespace "regress_tblspace" does not exist +ALTER TABLE ALL IN TABLESPACE regress_tblspace_renamed SET TABLESPACE pg_default; +ERROR: tablespace 
"regress_tblspace_renamed" does not exist +ALTER INDEX ALL IN TABLESPACE regress_tblspace_renamed SET TABLESPACE pg_default; +ERROR: tablespace "regress_tblspace_renamed" does not exist +ALTER MATERIALIZED VIEW ALL IN TABLESPACE regress_tblspace_renamed SET TABLESPACE pg_default; +ERROR: tablespace "regress_tblspace_renamed" does not exist +-- Should show notice that nothing was done +ALTER TABLE ALL IN TABLESPACE regress_tblspace_renamed SET TABLESPACE pg_default; +ERROR: tablespace "regress_tblspace_renamed" does not exist +ALTER MATERIALIZED VIEW ALL IN TABLESPACE regress_tblspace_renamed SET TABLESPACE pg_default; +ERROR: tablespace "regress_tblspace_renamed" does not exist +-- Should succeed +DROP TABLESPACE regress_tblspace_renamed; +ERROR: tablespace "regress_tblspace_renamed" does not exist +DROP SCHEMA testschema CASCADE; +NOTICE: drop cascades to 7 other objects +DETAIL: drop cascades to table testschema.foo +drop cascades to table testschema.asselect +drop cascades to table testschema.asexecute +drop cascades to table testschema.part +drop cascades to table testschema.atable +drop cascades to materialized view testschema.amv +drop cascades to table testschema.tablespace_acl +DROP ROLE regress_tablespace_user1; +DROP ROLE regress_tablespace_user2; diff --git a/src/test/regress/sql/sequence.sql b/src/test/regress/sql/sequence.sql index 674f5f1f668..793f1415f6b 100644 --- a/src/test/regress/sql/sequence.sql +++ b/src/test/regress/sql/sequence.sql @@ -137,7 +137,7 @@ SELECT nextval('foo_seq_new'); SELECT nextval('foo_seq_new'); -- log_cnt can be higher if there is a checkpoint just at the right -- time, so just test for the expected range -SELECT last_value, log_cnt IN (31, 32) AS log_cnt_ok, is_called FROM foo_seq_new; +SELECT last_value, log_cnt IN (0, 31, 32) AS log_cnt_ok, is_called FROM foo_seq_new; DROP SEQUENCE foo_seq_new; -- renaming serial sequences diff --git a/src/test/regress/sql/spgist.sql b/src/test/regress/sql/spgist.sql index 4828ede68c3..9d6394516a2 100644 --- a/src/test/regress/sql/spgist.sql +++ b/src/test/regress/sql/spgist.sql @@ -89,3 +89,6 @@ select box(point(i,j)) from generate_series(1,100,5) i, generate_series(1,10,5) j; -- leave this table around, to help in testing dump/restore +-- NEON: In Neon unlogged tables are wiped away on node restart +-- so drop the table to keep Neon tests clean. +drop table spgist_unlogged_tbl;