From a297d13b9ae183f0b34f2bc5ffbd0165414e5718 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Thu, 20 Jul 2023 11:59:14 +0200 Subject: [PATCH 01/13] Patch PostgreSQL v16 with squashed patchset This prepares PostgreSQL for compatibility with Neon's storage. Significant changes compared to the PostgreSQL 15 patchset include: - Backported changes for users and roles are no longer required - Use RM_NEON_ID for changes in WAL, instead of modifying core WAL records --- configure | 86 ++ configure.ac | 13 + contrib/pg_prewarm/autoprewarm.c | 36 + contrib/pg_prewarm/pg_prewarm.c | 18 +- contrib/pg_prewarm/pg_prewarm.control | 1 + contrib/pg_walinspect/pg_walinspect.c | 4 +- src/Makefile.global.in | 1 + src/backend/access/brin/brin_xlog.c | 3 +- src/backend/access/gin/gininsert.c | 8 + src/backend/access/gin/ginxlog.c | 19 +- src/backend/access/gist/gistbuild.c | 23 +- src/backend/access/gist/gistvacuum.c | 20 + src/backend/access/hash/hash.c | 16 + src/backend/access/heap/heapam.c | 252 ++++- src/backend/access/heap/heapam_handler.c | 2 +- src/backend/access/heap/vacuumlazy.c | 80 +- src/backend/access/heap/visibilitymap.c | 4 +- src/backend/access/nbtree/README | 44 + src/backend/access/nbtree/nbtinsert.c | 2 +- src/backend/access/nbtree/nbtree.c | 19 + src/backend/access/nbtree/nbtsearch.c | 221 ++++- src/backend/access/spgist/spginsert.c | 9 + src/backend/access/spgist/spgvacuum.c | 22 +- src/backend/access/transam/xlog.c | 410 +++++++- src/backend/access/transam/xloginsert.c | 12 + src/backend/access/transam/xlogprefetcher.c | 4 +- src/backend/access/transam/xlogreader.c | 121 ++- src/backend/access/transam/xlogrecovery.c | 166 +++- src/backend/access/transam/xlogutils.c | 19 +- src/backend/bootstrap/bootstrap.c | 13 +- src/backend/catalog/storage.c | 14 +- src/backend/commands/dbcommands.c | 37 +- src/backend/commands/explain.c | 38 +- src/backend/commands/extension.c | 18 + src/backend/commands/sequence.c | 6 +- src/backend/commands/tablecmds.c | 2 +- src/backend/executor/instrument.c | 8 + src/backend/executor/nodeBitmapHeapscan.c | 239 ++--- src/backend/main/main.c | 40 + src/backend/optimizer/path/costsize.c | 3 + src/backend/replication/walreceiver.c | 7 + src/backend/replication/walsender.c | 68 +- src/backend/storage/buffer/buf_init.c | 8 + src/backend/storage/buffer/bufmgr.c | 60 +- src/backend/storage/buffer/localbuf.c | 14 +- src/backend/storage/lmgr/lwlocknames.txt | 1 + src/backend/storage/smgr/md.c | 4 +- src/backend/storage/smgr/smgr.c | 244 ++--- src/backend/tcop/postgres.c | 10 + src/backend/utils/activity/wait_event.c | 3 + src/backend/utils/adt/dbsize.c | 61 +- src/backend/utils/cache/relcache.c | 2 +- src/backend/utils/fmgr/dfmgr.c | 78 +- src/backend/utils/misc/guc_tables.c | 88 ++ src/bin/initdb/initdb.c | 4 + src/bin/pg_waldump/pg_waldump.c | 210 ++++- src/include/access/heapam.h | 4 + src/include/access/nbtree.h | 17 + src/include/access/neon_xlog.h | 132 +++ src/include/access/xlog.h | 30 + src/include/access/xloginsert.h | 4 + src/include/access/xlogreader.h | 7 + src/include/access/xlogrecovery.h | 1 + src/include/access/xlogutils.h | 6 + src/include/commands/explain.h | 1 + src/include/executor/instrument.h | 9 + src/include/fmgr.h | 6 + src/include/miscadmin.h | 7 + src/include/nodes/execnodes.h | 16 +- src/include/optimizer/cost.h | 4 + src/include/pg_config.h.in | 3 + src/include/pg_config_manual.h | 2 +- src/include/replication/walsender.h | 20 + src/include/storage/buf_internals.h | 2 + src/include/storage/bufmgr.h | 6 +- src/include/storage/smgr.h | 
82 +- src/include/utils/rel.h | 2 +- src/include/utils/wait_event.h | 3 +- src/test/regress/expected/sequence.out | 2 +- src/test/regress/expected/spgist.out | 3 + src/test/regress/expected/sysviews.out | 5 +- src/test/regress/expected/tablespace_1.out | 974 ++++++++++++++++++++ src/test/regress/sql/sequence.sql | 2 +- src/test/regress/sql/spgist.sql | 3 + 84 files changed, 3753 insertions(+), 515 deletions(-) create mode 100644 src/include/access/neon_xlog.h create mode 100644 src/test/regress/expected/tablespace_1.out diff --git a/configure b/configure index 82e45657b21..579573f9de9 100755 --- a/configure +++ b/configure @@ -711,6 +711,7 @@ with_libxml with_uuid with_readline with_systemd +with_libseccomp with_selinux with_ldap with_krb_srvnam @@ -861,6 +862,7 @@ with_bsd_auth with_ldap with_bonjour with_selinux +with_libseccomp with_systemd with_readline with_libedit_preferred @@ -1571,6 +1573,7 @@ Optional Packages: --with-ldap build with LDAP support --with-bonjour build with Bonjour support --with-selinux build with SELinux support + --with-libseccomp build with libseccomp support --with-systemd build with systemd support --without-readline do not use GNU Readline nor BSD Libedit for editing --with-libedit-preferred @@ -8868,6 +8871,39 @@ fi { $as_echo "$as_me:${as_lineno-$LINENO}: result: $with_selinux" >&5 $as_echo "$with_selinux" >&6; } +# +# libseccomp +# +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to build with libseccomp support" >&5 +$as_echo_n "checking whether to build with libseccomp support... " >&6; } + + + +# Check whether --with-libseccomp was given. +if test "${with_libseccomp+set}" = set; then : + withval=$with_libseccomp; + case $withval in + yes) + : + ;; + no) + : + ;; + *) + as_fn_error $? "no argument expected for --with-libseccomp option" "$LINENO" 5 + ;; + esac + +else + with_libseccomp=no + +fi + + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $with_libseccomp" >&5 +$as_echo "$with_libseccomp" >&6; } + # # Systemd # @@ -14350,6 +14386,56 @@ else fi +fi + +if test "$with_libseccomp" = yes ; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for seccomp_init in -lseccomp" >&5 +$as_echo_n "checking for seccomp_init in -lseccomp... " >&6; } +if ${ac_cv_lib_seccomp_seccomp_init+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-lseccomp $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char seccomp_init (); +int +main () +{ +return seccomp_init (); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_lib_seccomp_seccomp_init=yes +else + ac_cv_lib_seccomp_seccomp_init=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_seccomp_seccomp_init" >&5 +$as_echo "$ac_cv_lib_seccomp_seccomp_init" >&6; } +if test "x$ac_cv_lib_seccomp_seccomp_init" = xyes; then : + cat >>confdefs.h <<_ACEOF +#define HAVE_LIBSECCOMP 1 +_ACEOF + + LIBS="-lseccomp $LIBS" + +else + as_fn_error $? 
"library 'libseccomp' is required for Seccomp BPF support" "$LINENO" 5 +fi + fi # for contrib/uuid-ossp diff --git a/configure.ac b/configure.ac index fcea0bcab42..ad8db6a14de 100644 --- a/configure.ac +++ b/configure.ac @@ -972,6 +972,14 @@ PGAC_ARG_BOOL(with, selinux, no, [build with SELinux support]) AC_SUBST(with_selinux) AC_MSG_RESULT([$with_selinux]) +# +# libseccomp +# +AC_MSG_CHECKING([whether to build with libseccomp support]) +PGAC_ARG_BOOL(with, libseccomp, no, [build with libseccomp support]) +AC_SUBST(with_libseccomp) +AC_MSG_RESULT([$with_libseccomp]) + # # Systemd # @@ -1624,6 +1632,11 @@ dnl If you want to use Apple's own Bonjour code on another platform, dnl just add -ldns_sd to LIBS manually. fi +if test "$with_libseccomp" = yes ; then + AC_CHECK_LIB(seccomp, seccomp_init, [], + [AC_MSG_ERROR([library 'libseccomp' is required for Seccomp BPF support])]) +fi + # for contrib/uuid-ossp if test "$with_uuid" = bsd ; then AC_CHECK_HEADERS(uuid.h, diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c index 93835449c0e..64cf5531838 100644 --- a/contrib/pg_prewarm/autoprewarm.c +++ b/contrib/pg_prewarm/autoprewarm.c @@ -54,6 +54,8 @@ #include "utils/rel.h" #include "utils/relfilenumbermap.h" #include "utils/resowner.h" +#include "utils/spccache.h" + #define AUTOPREWARM_FILE "autoprewarm.blocks" @@ -448,10 +450,12 @@ void autoprewarm_database_main(Datum main_arg) { int pos; + int io_concurrency; BlockInfoRecord *block_info; Relation rel = NULL; BlockNumber nblocks = 0; BlockInfoRecord *old_blk = NULL; + BlockInfoRecord *prefetch_blk = NULL; dsm_segment *seg; /* Establish signal handlers; once that's done, unblock signals. */ @@ -498,6 +502,7 @@ autoprewarm_database_main(Datum main_arg) { relation_close(rel, AccessShareLock); rel = NULL; + io_concurrency = -1; CommitTransactionCommand(); } @@ -517,6 +522,8 @@ autoprewarm_database_main(Datum main_arg) if (!rel) CommitTransactionCommand(); + else + io_concurrency = get_tablespace_maintenance_io_concurrency(rel->rd_rel->reltablespace); } if (!rel) { @@ -549,6 +556,35 @@ autoprewarm_database_main(Datum main_arg) continue; } + /* if prefetching is enabled for this relation */ + if (io_concurrency > 0) + { + /* make prefetch_blk catch up */ + if (blk > prefetch_blk) + { + prefetch_blk = blk; + } + + /* now, prefetch all following blocks */ + while (prefetch_blk <= &block_info[apw_state->prewarm_stop_idx]) + { + /* unless they're of a different relfilenode */ + if (prefetch_blk->filenumber != blk->filenumber || + prefetch_blk->forknum != blk->forknum || + prefetch_blk->blocknum >= nblocks) + break; + + /* or unless they are more than io_concurrency blocks ahead */ + if (blk + io_concurrency <= prefetch_blk) + break; + + PrefetchBuffer(rel, prefetch_blk->forknum, prefetch_blk->blocknum); + + /* continue with the next block */ + prefetch_blk++; + } + } + /* Prewarm buffer. 
*/ buf = ReadBufferExtended(rel, blk->forknum, blk->blocknum, RBM_NORMAL, NULL); diff --git a/contrib/pg_prewarm/pg_prewarm.c b/contrib/pg_prewarm/pg_prewarm.c index e464d0d4d2b..fd139b2cd73 100644 --- a/contrib/pg_prewarm/pg_prewarm.c +++ b/contrib/pg_prewarm/pg_prewarm.c @@ -18,12 +18,14 @@ #include "access/relation.h" #include "fmgr.h" #include "miscadmin.h" +#include "optimizer/cost.h" #include "storage/bufmgr.h" #include "storage/smgr.h" #include "utils/acl.h" #include "utils/builtins.h" #include "utils/lsyscache.h" #include "utils/rel.h" +#include "utils/spccache.h" PG_MODULE_MAGIC; @@ -183,14 +185,26 @@ pg_prewarm(PG_FUNCTION_ARGS) } else if (ptype == PREWARM_BUFFER) { + BlockNumber prefetch_block = first_block; + Oid nspOid; + int io_concurrency; + + nspOid = rel->rd_rel->reltablespace; + io_concurrency = get_tablespace_maintenance_io_concurrency(nspOid); + /* * In buffer mode, we actually pull the data into shared_buffers. */ for (block = first_block; block <= last_block; ++block) { - Buffer buf; - + Buffer buf; + BlockNumber prefetch_stop = block + Min(last_block - block + 1, + io_concurrency); CHECK_FOR_INTERRUPTS(); + while (prefetch_block < prefetch_stop) + { + PrefetchBuffer(rel, forkNumber, prefetch_block++); + } buf = ReadBufferExtended(rel, forkNumber, block, RBM_NORMAL, NULL); ReleaseBuffer(buf); ++blocks_done; diff --git a/contrib/pg_prewarm/pg_prewarm.control b/contrib/pg_prewarm/pg_prewarm.control index 40e3add4810..d40d1a000b7 100644 --- a/contrib/pg_prewarm/pg_prewarm.control +++ b/contrib/pg_prewarm/pg_prewarm.control @@ -3,3 +3,4 @@ comment = 'prewarm relation data' default_version = '1.2' module_pathname = '$libdir/pg_prewarm' relocatable = true +trusted = true diff --git a/contrib/pg_walinspect/pg_walinspect.c b/contrib/pg_walinspect/pg_walinspect.c index 796a74f322b..141018cfd2f 100644 --- a/contrib/pg_walinspect/pg_walinspect.c +++ b/contrib/pg_walinspect/pg_walinspect.c @@ -271,7 +271,7 @@ GetWALBlockInfo(FunctionCallInfo fcinfo, XLogReaderState *record, { DecodedBkpBlock *blk; BlockNumber blkno; - RelFileLocator rnode; + RelFileLocator rlocator; ForkNumber forknum; Datum values[PG_GET_WAL_BLOCK_INFO_COLS] = {0}; bool nulls[PG_GET_WAL_BLOCK_INFO_COLS] = {0}; @@ -286,7 +286,7 @@ GetWALBlockInfo(FunctionCallInfo fcinfo, XLogReaderState *record, blk = XLogRecGetBlock(record, block_id); (void) XLogRecGetBlockTagExtended(record, block_id, - &rnode, &forknum, &blkno, NULL); + &rlocator, &forknum, &blkno, NULL); /* Save block_data_len */ if (blk->has_data) diff --git a/src/Makefile.global.in b/src/Makefile.global.in index cc4dc6de91e..3db93d127fe 100644 --- a/src/Makefile.global.in +++ b/src/Makefile.global.in @@ -186,6 +186,7 @@ with_tcl = @with_tcl@ with_ssl = @with_ssl@ with_readline = @with_readline@ with_selinux = @with_selinux@ +with_libseccomp = @with_libseccomp@ with_systemd = @with_systemd@ with_gssapi = @with_gssapi@ with_krb_srvnam = @with_krb_srvnam@ diff --git a/src/backend/access/brin/brin_xlog.c b/src/backend/access/brin/brin_xlog.c index 89145b68f68..188045d224a 100644 --- a/src/backend/access/brin/brin_xlog.c +++ b/src/backend/access/brin/brin_xlog.c @@ -69,7 +69,8 @@ brin_xlog_insert_update(XLogReaderState *record, } /* need this page's blkno to store in revmap */ - regpgno = BufferGetBlockNumber(buffer); + //ZENITH XXX Don't use BufferGetBlockNumber because wal-redo doesn't pin buffer. 
+ XLogRecGetBlockTag(record, 0, NULL, NULL, ®pgno); /* insert the index item into the page */ if (action == BLK_NEEDS_REDO) diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index 56968b95acf..4d33e986c40 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -335,6 +335,8 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) elog(ERROR, "index \"%s\" already contains data", RelationGetRelationName(index)); + smgr_start_unlogged_build(index->rd_smgr); + initGinState(&buildstate.ginstate, index); buildstate.indtuples = 0; memset(&buildstate.buildStats, 0, sizeof(GinStatsData)); @@ -408,6 +410,8 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) buildstate.buildStats.nTotalPages = RelationGetNumberOfBlocks(index); ginUpdateStats(index, &buildstate.buildStats, true); + smgr_finish_unlogged_build_phase_1(index->rd_smgr); + /* * We didn't write WAL records as we built the index, so if WAL-logging is * required, write all pages to the WAL now. @@ -417,8 +421,12 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) log_newpage_range(index, MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index), true); + SetLastWrittenLSNForBlockRange(XactLastRecEnd, index->rd_smgr->smgr_rlocator.locator, MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index)); + SetLastWrittenLSNForRelation(XactLastRecEnd, index->rd_smgr->smgr_rlocator.locator, MAIN_FORKNUM); } + smgr_end_unlogged_build(index->rd_smgr); + /* * Return statistics */ diff --git a/src/backend/access/gin/ginxlog.c b/src/backend/access/gin/ginxlog.c index f7c84beef8d..3cac0957fef 100644 --- a/src/backend/access/gin/ginxlog.c +++ b/src/backend/access/gin/ginxlog.c @@ -407,6 +407,7 @@ ginRedoSplit(XLogReaderState *record) rootbuf; bool isLeaf = (data->flags & GIN_INSERT_ISLEAF) != 0; bool isRoot = (data->flags & GIN_SPLIT_ROOT) != 0; + XLogRedoAction action; /* * First clear incomplete-split flag on child page if this finishes a @@ -415,21 +416,27 @@ ginRedoSplit(XLogReaderState *record) if (!isLeaf) ginRedoClearIncompleteSplit(record, 3); - if (XLogReadBufferForRedo(record, 0, &lbuffer) != BLK_RESTORED) + action = XLogReadBufferForRedo(record, 0, &lbuffer); + if (action != BLK_RESTORED && action != BLK_DONE) elog(ERROR, "GIN split record did not contain a full-page image of left page"); - if (XLogReadBufferForRedo(record, 1, &rbuffer) != BLK_RESTORED) + action = XLogReadBufferForRedo(record, 1, &rbuffer); + if (action != BLK_RESTORED && action != BLK_DONE) elog(ERROR, "GIN split record did not contain a full-page image of right page"); if (isRoot) { - if (XLogReadBufferForRedo(record, 2, &rootbuf) != BLK_RESTORED) + action = XLogReadBufferForRedo(record, 2, &rootbuf); + if (action != BLK_RESTORED && action != BLK_DONE) elog(ERROR, "GIN split record did not contain a full-page image of root page"); - UnlockReleaseBuffer(rootbuf); + if (rootbuf != InvalidBuffer) + UnlockReleaseBuffer(rootbuf); } - UnlockReleaseBuffer(rbuffer); - UnlockReleaseBuffer(lbuffer); + if (rbuffer != InvalidBuffer) + UnlockReleaseBuffer(rbuffer); + if (lbuffer != InvalidBuffer) + UnlockReleaseBuffer(lbuffer); } /* diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c index 5e0c1447f92..b60cdbb627e 100644 --- a/src/backend/access/gist/gistbuild.c +++ b/src/backend/access/gist/gistbuild.c @@ -40,6 +40,7 @@ #include "access/tableam.h" #include "access/xloginsert.h" #include "catalog/index.h" +#include "catalog/storage.h" #include "miscadmin.h" #include 
"optimizer/optimizer.h" #include "storage/bufmgr.h" @@ -297,6 +298,8 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) Buffer buffer; Page page; + smgr_start_unlogged_build(index->rd_smgr); + /* initialize the root page */ buffer = gistNewBuffer(index, heap); Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO); @@ -329,6 +332,8 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) gistFreeBuildBuffers(buildstate.gfbb); } + smgr_finish_unlogged_build_phase_1(index->rd_smgr); + /* * We didn't write WAL records as we built the index, so if * WAL-logging is required, write all pages to the WAL now. @@ -338,7 +343,13 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) log_newpage_range(index, MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index), true); + SetLastWrittenLSNForBlockRange(XactLastRecEnd, + index->rd_smgr->smgr_rlocator.locator, + MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index)); + SetLastWrittenLSNForRelation(XactLastRecEnd, index->rd_smgr->smgr_rlocator.locator, MAIN_FORKNUM); } + + smgr_end_unlogged_build(index->rd_smgr); } /* okay, all heap tuples are indexed */ @@ -463,8 +474,16 @@ gist_indexsortbuild(GISTBuildState *state) smgrwrite(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, GIST_ROOT_BLKNO, levelstate->pages[0], true); if (RelationNeedsWAL(state->indexrel)) - log_newpage(&state->indexrel->rd_locator, MAIN_FORKNUM, GIST_ROOT_BLKNO, - levelstate->pages[0], true); + { + XLogRecPtr lsn; + + lsn = log_newpage(&state->indexrel->rd_locator, MAIN_FORKNUM, GIST_ROOT_BLKNO, + levelstate->pages[0], true); + + SetLastWrittenLSNForBlock(lsn, state->indexrel->rd_smgr->smgr_rlocator.locator, + MAIN_FORKNUM, GIST_ROOT_BLKNO); + SetLastWrittenLSNForRelation(lsn, state->indexrel->rd_smgr->smgr_rlocator.locator, MAIN_FORKNUM); + } pfree(levelstate->pages[0]); pfree(levelstate); diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c index 3f60d3274d2..5e1bc1ee193 100644 --- a/src/backend/access/gist/gistvacuum.c +++ b/src/backend/access/gist/gistvacuum.c @@ -23,6 +23,7 @@ #include "storage/indexfsm.h" #include "storage/lmgr.h" #include "utils/memutils.h" +#include "utils/spccache.h" /* Working state needed by gistbulkdelete */ typedef struct @@ -130,8 +131,14 @@ gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, BlockNumber num_pages; bool needLock; BlockNumber blkno; + BlockNumber prefetch_blkno; + int io_concurrency; MemoryContext oldctx; + io_concurrency = get_tablespace_maintenance_io_concurrency( + rel->rd_rel->reltablespace + ); + /* * Reset fields that track information about the entire index now. 
This * avoids double-counting in the case where a single VACUUM command @@ -209,6 +216,7 @@ gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, needLock = !RELATION_IS_LOCAL(rel); blkno = GIST_ROOT_BLKNO; + prefetch_blkno = blkno; for (;;) { /* Get the current relation length */ @@ -221,9 +229,21 @@ gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, /* Quit if we've scanned the whole relation */ if (blkno >= num_pages) break; + + if (prefetch_blkno < blkno) + prefetch_blkno = blkno; + for (; prefetch_blkno < num_pages && + prefetch_blkno < blkno + io_concurrency; prefetch_blkno++) + PrefetchBuffer(rel, MAIN_FORKNUM, prefetch_blkno); + /* Iterate over pages, then loop back to recheck length */ for (; blkno < num_pages; blkno++) + { + if (io_concurrency > 0 && prefetch_blkno < num_pages) + PrefetchBuffer(rel, MAIN_FORKNUM, prefetch_blkno++); + gistvacuumpage(&vstate, blkno, blkno); + } } /* diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index fc5d97f606e..35fe72f6dbc 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -32,6 +32,7 @@ #include "utils/builtins.h" #include "utils/index_selfuncs.h" #include "utils/rel.h" +#include "utils/spccache.h" /* Working state for hashbuild and its callback */ typedef struct @@ -467,13 +468,17 @@ hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, Bucket orig_maxbucket; Bucket cur_maxbucket; Bucket cur_bucket; + Bucket prf_bucket; Buffer metabuf = InvalidBuffer; HashMetaPage metap; HashMetaPage cachedmetap; + int io_concurrency; tuples_removed = 0; num_index_tuples = 0; + io_concurrency = get_tablespace_maintenance_io_concurrency(rel->rd_rel->reltablespace); + /* * We need a copy of the metapage so that we can use its hashm_spares[] * values to compute bucket page addresses, but a cached copy should be @@ -488,9 +493,14 @@ hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, /* Scan the buckets that we know exist */ cur_bucket = 0; + prf_bucket = cur_bucket; cur_maxbucket = orig_maxbucket; loop_top: + for (; prf_bucket <= cur_maxbucket && + prf_bucket < cur_bucket + io_concurrency; prf_bucket++) + PrefetchBuffer(rel, MAIN_FORKNUM, BUCKET_TO_BLKNO(cachedmetap, prf_bucket)); + while (cur_bucket <= cur_maxbucket) { BlockNumber bucket_blkno; @@ -501,6 +511,12 @@ hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, Page page; bool split_cleanup = false; + if (io_concurrency > 0 && prf_bucket <= cur_maxbucket) + { + PrefetchBuffer(rel, MAIN_FORKNUM, BUCKET_TO_BLKNO(cachedmetap, prf_bucket)); + prf_bucket++; + } + /* Get address of bucket's start page */ bucket_blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket); diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index ed0f612ab3e..813c199f300 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -54,6 +54,7 @@ #include "catalog/catalog.h" #include "commands/vacuum.h" #include "miscadmin.h" +#include "optimizer/cost.h" #include "pgstat.h" #include "port/atomics.h" #include "port/pg_bitutils.h" @@ -71,6 +72,7 @@ #include "utils/relcache.h" #include "utils/snapmgr.h" #include "utils/spccache.h" +#include "access/neon_xlog.h" static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup, @@ -320,6 +322,27 @@ initscan(HeapScanDesc scan, ScanKey key, bool keep_startblock) scan->rs_startblock = 0; } + if (enable_seqscan_prefetch) + { + /* + * Do not use tablespace setting for catalog scans, as we might have + * 
the tablespace settings in the catalogs locked already, which + * might result in a deadlock. + */ + if (IsCatalogRelation(scan->rs_base.rs_rd)) + scan->rs_prefetch_maximum = effective_io_concurrency; + else + scan->rs_prefetch_maximum = + get_tablespace_io_concurrency(scan->rs_base.rs_rd->rd_rel->reltablespace); + + scan->rs_prefetch_target = 1; + } + else + { + scan->rs_prefetch_maximum = -1; + scan->rs_prefetch_target = -1; + } + scan->rs_numblocks = InvalidBlockNumber; scan->rs_inited = false; scan->rs_ctup.t_data = NULL; @@ -401,6 +424,103 @@ heapgetpage(TableScanDesc sscan, BlockNumber block) */ CHECK_FOR_INTERRUPTS(); + /* Prefetch up to io_concurrency blocks ahead */ + if (scan->rs_prefetch_maximum > 0 && scan->rs_nblocks > 1) + { + int64 nblocks; + int64 rel_scan_start; + int64 rel_scan_end; /* blockno of end of scan (mod scan->rs_nblocks) */ + int64 scan_pageoff; /* page, but adjusted for scan position as above */ + + int64 prefetch_start; /* start block of prefetch requests this iteration */ + int64 prefetch_end; /* end block of prefetch requests this iteration, if applicable */ + ParallelBlockTableScanWorker pbscanwork = scan->rs_parallelworkerdata; + ParallelBlockTableScanDesc pbscandesc = (ParallelBlockTableScanDesc) sscan->rs_parallel; + + /* + * Parallel scans look like repeated sequential table scans for + * prefetching; with a scan start at nalloc + ch_remaining - ch_size + */ + if (pbscanwork != NULL) + { + uint64 start_offset, + end_offset; + + Assert(pbscandesc != NULL); + start_offset = pbscanwork->phsw_nallocated + + pbscanwork->phsw_chunk_remaining + 1 + - pbscanwork->phsw_chunk_size; + end_offset = Min(pbscanwork->phsw_nallocated + + pbscanwork->phsw_chunk_remaining + 1, + pbscandesc->phs_nblocks); + + rel_scan_start = (int64) (pbscandesc->phs_startblock) + start_offset; + rel_scan_end = (int64) (pbscandesc->phs_startblock) + end_offset; + nblocks = pbscandesc->phs_nblocks; + } + else + { + rel_scan_start = scan->rs_startblock; + rel_scan_end = scan->rs_startblock + scan->rs_nblocks; + nblocks = scan->rs_nblocks; + } + + prefetch_end = rel_scan_end; + + if ((uint64) block < rel_scan_start) + scan_pageoff = block + nblocks; + else + scan_pageoff = block; + + Assert(rel_scan_start <= scan_pageoff && scan_pageoff <= rel_scan_end); + + /* + * If this is the first page of this seqscan, initiate prefetch of + * pages page..page + n. On each subsequent call, prefetch the next + * page that we haven't prefetched yet, at page + n. 
+ * If this is the last page of the prefetch, + */ + if (rel_scan_start != block) + { + prefetch_start = scan_pageoff + (int64) scan->rs_prefetch_target - 1; + prefetch_end = prefetch_start + 1; + } + else + { + prefetch_start = scan_pageoff; + prefetch_end = rel_scan_end; + } + + /* do not prefetch if the only page we're trying to prefetch is past the end of our scan window */ + if (prefetch_start > rel_scan_end) + prefetch_end = 0; + + if (prefetch_end > prefetch_start + scan->rs_prefetch_target) + prefetch_end = prefetch_start + scan->rs_prefetch_target; + + if (prefetch_end > rel_scan_end) + prefetch_end = rel_scan_end; + + while (prefetch_start < prefetch_end) + { + BlockNumber blckno = (prefetch_start % nblocks); + Assert(blckno < nblocks); + Assert(blckno < INT_MAX); + PrefetchBuffer(scan->rs_base.rs_rd, MAIN_FORKNUM, blckno); + prefetch_start += 1; + } + + /* + * Use exponential growth of readahead up to prefetch_maximum, to + * make sure that a low LIMIT does not result in high IO overhead, + * but operations in general are still very fast. + */ + if (scan->rs_prefetch_target < scan->rs_prefetch_maximum / 2) + scan->rs_prefetch_target *= 2; + else if (scan->rs_prefetch_target < scan->rs_prefetch_maximum) + scan->rs_prefetch_target = scan->rs_prefetch_maximum; + } + /* read page using selected strategy */ scan->rs_cbuf = ReadBufferExtended(scan->rs_base.rs_rd, MAIN_FORKNUM, block, RBM_NORMAL, scan->rs_strategy); @@ -1912,11 +2032,11 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, /* XLOG stuff */ if (RelationNeedsWAL(relation)) { - xl_heap_insert xlrec; - xl_heap_header xlhdr; + xl_neon_heap_insert xlrec; + xl_neon_heap_header xlhdr; XLogRecPtr recptr; Page page = BufferGetPage(buffer); - uint8 info = XLOG_HEAP_INSERT; + uint8 info = XLOG_NEON_HEAP_INSERT; int bufflags = 0; /* @@ -1934,7 +2054,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, if (ItemPointerGetOffsetNumber(&(heaptup->t_self)) == FirstOffsetNumber && PageGetMaxOffsetNumber(page) == FirstOffsetNumber) { - info |= XLOG_HEAP_INIT_PAGE; + info |= XLOG_NEON_INIT_PAGE; bufflags |= REGBUF_WILL_INIT; } @@ -1962,11 +2082,12 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, } XLogBeginInsert(); - XLogRegisterData((char *) &xlrec, SizeOfHeapInsert); + XLogRegisterData((char *) &xlrec, SizeOfNeonHeapInsert); xlhdr.t_infomask2 = heaptup->t_data->t_infomask2; xlhdr.t_infomask = heaptup->t_data->t_infomask; xlhdr.t_hoff = heaptup->t_data->t_hoff; + xlhdr.t_cid = HeapTupleHeaderGetRawCommandId(heaptup->t_data); /* * note we mark xlhdr as belonging to buffer; if XLogInsert decides to @@ -1974,7 +2095,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, * xl_heap_header in the xlog. 
*/ XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags); - XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader); + XLogRegisterBufData(0, (char *) &xlhdr, SizeOfNeonHeapHeader); /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */ XLogRegisterBufData(0, (char *) heaptup->t_data + SizeofHeapTupleHeader, @@ -1983,14 +2104,25 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, /* filtering by origin on a row level is much more efficient */ XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); - recptr = XLogInsert(RM_HEAP_ID, info); + recptr = XLogInsert(RM_NEON_ID, info); PageSetLSN(page, recptr); } END_CRIT_SECTION(); - UnlockReleaseBuffer(buffer); + if (options & HEAP_INSERT_SPECULATIVE) + { + /* + * NEON: speculative token is not stored in WAL, so if the page is evicted + * from the buffer cache, the token will be lost. To prevent that, we keep the + * buffer pinned. It will be unpinned in heapam_tuple_finish/abort_speculative. + */ + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + } + else + UnlockReleaseBuffer(buffer); + if (vmbuffer != InvalidBuffer) ReleaseBuffer(vmbuffer); @@ -2274,8 +2406,8 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, if (needwal) { XLogRecPtr recptr; - xl_heap_multi_insert *xlrec; - uint8 info = XLOG_HEAP2_MULTI_INSERT; + xl_neon_heap_multi_insert *xlrec; + uint8 info = XLOG_NEON_HEAP_MULTI_INSERT; char *tupledata; int totaldatalen; char *scratchptr = scratch.data; @@ -2288,9 +2420,9 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, */ init = starting_with_empty_page; - /* allocate xl_heap_multi_insert struct from the scratch area */ - xlrec = (xl_heap_multi_insert *) scratchptr; - scratchptr += SizeOfHeapMultiInsert; + /* allocate xl_neon_heap_multi_insert struct from the scratch area */ + xlrec = (xl_neon_heap_multi_insert *) scratchptr; + scratchptr += SizeOfNeonHeapMultiInsert; /* * Allocate offsets array. Unless we're reinitializing the page, @@ -2322,17 +2454,25 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, for (i = 0; i < nthispage; i++) { HeapTuple heaptup = heaptuples[ndone + i]; - xl_multi_insert_tuple *tuphdr; + xl_neon_multi_insert_tuple *tuphdr; int datalen; if (!init) xlrec->offsets[i] = ItemPointerGetOffsetNumber(&heaptup->t_self); /* xl_multi_insert_tuple needs two-byte alignment. 
*/ - tuphdr = (xl_multi_insert_tuple *) SHORTALIGN(scratchptr); - scratchptr = ((char *) tuphdr) + SizeOfMultiInsertTuple; + tuphdr = (xl_neon_multi_insert_tuple *) SHORTALIGN(scratchptr); + scratchptr = ((char *) tuphdr) + SizeOfNeonMultiInsertTuple; tuphdr->t_infomask2 = heaptup->t_data->t_infomask2; tuphdr->t_infomask = heaptup->t_data->t_infomask; + if (i == 0) + { + xlrec->t_cid = HeapTupleHeaderGetRawCommandId(heaptup->t_data); + } + else + { + Assert(xlrec->t_cid == HeapTupleHeaderGetRawCommandId(heaptup->t_data)); + } tuphdr->t_hoff = heaptup->t_data->t_hoff; /* write bitmap [+ padding] [+ oid] + data */ @@ -2359,7 +2499,7 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, if (init) { - info |= XLOG_HEAP_INIT_PAGE; + info |= XLOG_NEON_INIT_PAGE; bufflags |= REGBUF_WILL_INIT; } @@ -2379,7 +2519,7 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, /* filtering by origin on a row level is much more efficient */ XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); - recptr = XLogInsert(RM_HEAP2_ID, info); + recptr = XLogInsert(RM_NEON_ID, info); PageSetLSN(page, recptr); } @@ -2826,8 +2966,8 @@ heap_delete(Relation relation, ItemPointer tid, */ if (RelationNeedsWAL(relation)) { - xl_heap_delete xlrec; - xl_heap_header xlhdr; + xl_neon_heap_delete xlrec; + xl_neon_heap_header xlhdr; XLogRecPtr recptr; /* @@ -2846,6 +2986,7 @@ heap_delete(Relation relation, ItemPointer tid, tp.t_data->t_infomask2); xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self); xlrec.xmax = new_xmax; + xlrec.t_cid = HeapTupleHeaderGetRawCommandId(tp.t_data); if (old_key_tuple != NULL) { @@ -2856,7 +2997,7 @@ heap_delete(Relation relation, ItemPointer tid, } XLogBeginInsert(); - XLogRegisterData((char *) &xlrec, SizeOfHeapDelete); + XLogRegisterData((char *) &xlrec, SizeOfNeonHeapDelete); XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); @@ -2867,9 +3008,10 @@ heap_delete(Relation relation, ItemPointer tid, { xlhdr.t_infomask2 = old_key_tuple->t_data->t_infomask2; xlhdr.t_infomask = old_key_tuple->t_data->t_infomask; + xlhdr.t_cid = HeapTupleHeaderGetRawCommandId(old_key_tuple->t_data); xlhdr.t_hoff = old_key_tuple->t_data->t_hoff; - XLogRegisterData((char *) &xlhdr, SizeOfHeapHeader); + XLogRegisterData((char *) &xlhdr, SizeOfNeonHeapHeader); XLogRegisterData((char *) old_key_tuple->t_data + SizeofHeapTupleHeader, old_key_tuple->t_len @@ -2879,7 +3021,7 @@ heap_delete(Relation relation, ItemPointer tid, /* filtering by origin on a row level is much more efficient */ XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); - recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE); + recptr = XLogInsert(RM_NEON_ID, XLOG_NEON_HEAP_DELETE); PageSetLSN(page, recptr); } @@ -3572,7 +3714,7 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, if (RelationNeedsWAL(relation)) { - xl_heap_lock xlrec; + xl_neon_heap_lock xlrec; XLogRecPtr recptr; XLogBeginInsert(); @@ -3584,8 +3726,9 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, oldtup.t_data->t_infomask2); xlrec.flags = cleared_all_frozen ? 
XLH_LOCK_ALL_FROZEN_CLEARED : 0; - XLogRegisterData((char *) &xlrec, SizeOfHeapLock); - recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK); + xlrec.t_cid = HeapTupleHeaderGetRawCommandId(oldtup.t_data); + XLogRegisterData((char *) &xlrec, SizeOfNeonHeapLock); + recptr = XLogInsert(RM_NEON_ID, XLOG_NEON_HEAP_LOCK); PageSetLSN(page, recptr); } @@ -4782,7 +4925,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, */ if (RelationNeedsWAL(relation)) { - xl_heap_lock xlrec; + xl_neon_heap_lock xlrec; XLogRecPtr recptr; XLogBeginInsert(); @@ -4793,11 +4936,12 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, xlrec.infobits_set = compute_infobits(new_infomask, tuple->t_data->t_infomask2); xlrec.flags = cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0; + xlrec.t_cid = HeapTupleHeaderGetRawCommandId(tuple->t_data); XLogRegisterData((char *) &xlrec, SizeOfHeapLock); /* we don't decode row locks atm, so no need to log the origin */ - recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK); + recptr = XLogInsert(RM_NEON_ID, XLOG_NEON_HEAP_LOCK); PageSetLSN(page, recptr); } @@ -5709,6 +5853,7 @@ heap_finish_speculative(Relation relation, ItemPointer tid) END_CRIT_SECTION(); + ReleaseBuffer(buffer); /* NEON: release buffer pinned by heap_insert */ UnlockReleaseBuffer(buffer); } @@ -5781,6 +5926,16 @@ heap_abort_speculative(Relation relation, ItemPointer tid) elog(ERROR, "attempted to kill a non-speculative tuple"); Assert(!HeapTupleHeaderIsHeapOnly(tp.t_data)); + /* + * NEON: release buffer pinned by heap_insert + * + * This function is also used on the toast tuples of an aborted speculative + * insertion. For those, there is no token on the tuple, and we didn' t keep + * the pin. + */ + if (HeapTupleHeaderIsSpeculative(tp.t_data)) + ReleaseBuffer(buffer); + /* * No need to check for serializable conflicts here. There is never a * need for a combo CID, either. 
No need to extract replica identity, or @@ -5830,7 +5985,7 @@ heap_abort_speculative(Relation relation, ItemPointer tid) */ if (RelationNeedsWAL(relation)) { - xl_heap_delete xlrec; + xl_neon_heap_delete xlrec; XLogRecPtr recptr; xlrec.flags = XLH_DELETE_IS_SUPER; @@ -5838,14 +5993,15 @@ heap_abort_speculative(Relation relation, ItemPointer tid) tp.t_data->t_infomask2); xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self); xlrec.xmax = xid; + xlrec.t_cid = HeapTupleHeaderGetRawCommandId(tp.t_data); XLogBeginInsert(); - XLogRegisterData((char *) &xlrec, SizeOfHeapDelete); + XLogRegisterData((char *) &xlrec, SizeOfNeonHeapDelete); XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); /* No replica identity & replication origin logged */ - recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE); + recptr = XLogInsert(RM_NEON_ID, XLOG_NEON_HEAP_DELETE); PageSetLSN(page, recptr); } @@ -8372,9 +8528,9 @@ log_heap_update(Relation reln, Buffer oldbuf, HeapTuple old_key_tuple, bool all_visible_cleared, bool new_all_visible_cleared) { - xl_heap_update xlrec; - xl_heap_header xlhdr; - xl_heap_header xlhdr_idx; + xl_neon_heap_update xlrec; + xl_neon_heap_header xlhdr; + xl_neon_heap_header xlhdr_idx; uint8 info; uint16 prefix_suffix[2]; uint16 prefixlen = 0, @@ -8391,9 +8547,9 @@ log_heap_update(Relation reln, Buffer oldbuf, XLogBeginInsert(); if (HeapTupleIsHeapOnly(newtup)) - info = XLOG_HEAP_HOT_UPDATE; + info = XLOG_NEON_HEAP_HOT_UPDATE; else - info = XLOG_HEAP_UPDATE; + info = XLOG_NEON_HEAP_UPDATE; /* * If the old and new tuple are on the same page, we only need to log the @@ -8473,7 +8629,7 @@ log_heap_update(Relation reln, Buffer oldbuf, if (ItemPointerGetOffsetNumber(&(newtup->t_self)) == FirstOffsetNumber && PageGetMaxOffsetNumber(page) == FirstOffsetNumber) { - info |= XLOG_HEAP_INIT_PAGE; + info |= XLOG_NEON_INIT_PAGE; init = true; } else @@ -8488,6 +8644,7 @@ log_heap_update(Relation reln, Buffer oldbuf, /* Prepare WAL data for the new page */ xlrec.new_offnum = ItemPointerGetOffsetNumber(&newtup->t_self); xlrec.new_xmax = HeapTupleHeaderGetRawXmax(newtup->t_data); + xlrec.t_cid = HeapTupleHeaderGetRawCommandId(newtup->t_data); bufflags = REGBUF_STANDARD; if (init) @@ -8499,7 +8656,7 @@ log_heap_update(Relation reln, Buffer oldbuf, if (oldbuf != newbuf) XLogRegisterBuffer(1, oldbuf, REGBUF_STANDARD); - XLogRegisterData((char *) &xlrec, SizeOfHeapUpdate); + XLogRegisterData((char *) &xlrec, SizeOfNeonHeapUpdate); /* * Prepare WAL data for the new tuple. @@ -8525,6 +8682,7 @@ log_heap_update(Relation reln, Buffer oldbuf, xlhdr.t_infomask2 = newtup->t_data->t_infomask2; xlhdr.t_infomask = newtup->t_data->t_infomask; xlhdr.t_hoff = newtup->t_data->t_hoff; + xlhdr.t_cid = HeapTupleHeaderGetRawCommandId(newtup->t_data); Assert(SizeofHeapTupleHeader + prefixlen + suffixlen <= newtup->t_len); /* @@ -8532,7 +8690,7 @@ log_heap_update(Relation reln, Buffer oldbuf, * * The 'data' doesn't include the common prefix or suffix. 
*/ - XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader); + XLogRegisterBufData(0, (char *) &xlhdr, SizeOfNeonHeapHeader); if (prefixlen == 0) { XLogRegisterBufData(0, @@ -8566,8 +8724,9 @@ log_heap_update(Relation reln, Buffer oldbuf, xlhdr_idx.t_infomask2 = old_key_tuple->t_data->t_infomask2; xlhdr_idx.t_infomask = old_key_tuple->t_data->t_infomask; xlhdr_idx.t_hoff = old_key_tuple->t_data->t_hoff; + xlhdr_idx.t_cid = HeapTupleHeaderGetRawCommandId(old_key_tuple->t_data); - XLogRegisterData((char *) &xlhdr_idx, SizeOfHeapHeader); + XLogRegisterData((char *) &xlhdr_idx, SizeOfNeonHeapHeader); /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */ XLogRegisterData((char *) old_key_tuple->t_data + SizeofHeapTupleHeader, @@ -8577,7 +8736,7 @@ log_heap_update(Relation reln, Buffer oldbuf, /* filtering by origin on a row level is much more efficient */ XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); - recptr = XLogInsert(RM_HEAP_ID, info); + recptr = XLogInsert(RM_NEON_ID, info); return recptr; } @@ -8975,7 +9134,16 @@ heap_xlog_visible(XLogReaderState *record) PageSetAllVisible(page); - if (XLogHintBitIsNeeded()) + /* + * NEON: despite to the comment above we need to update page LSN here. + * See discussion at hackers: https://www.postgresql.org/message-id/flat/039076d4f6cdd871691686361f83cb8a6913a86a.camel%40j-davis.com#101ba42b004f9988e3d54fce26fb3462 + * For Neon this assignment is critical because otherwise last written LSN tracked at compute doesn't + * match with page LSN assignee by WAL-redo and as a result, prefetched page is rejected. + * + * It is fixed in upstream in https://github.com/neondatabase/postgres/commit/7bf713dd2d0739fbcd4103971ed69c17ebe677ea + * but until it is merged we still need to carry a patch here. + */ + if (true || XLogHintBitIsNeeded()) PageSetLSN(page, lsn); MarkBufferDirty(buffer); diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 5a17112c91e..f1fe3b33071 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -637,7 +637,7 @@ heapam_relation_copy_data(Relation rel, const RelFileLocator *newrlocator) { SMgrRelation dstrel; - dstrel = smgropen(*newrlocator, rel->rd_backend); + dstrel = smgropen(*newrlocator, rel->rd_backend, rel->rd_rel->relpersistence); /* * Since we copy the file directly without looking at the shared buffers, diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index 4eb953f9047..05f0d8b23eb 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -52,6 +52,7 @@ #include "commands/vacuum.h" #include "executor/instrument.h" #include "miscadmin.h" +#include "optimizer/cost.h" #include "optimizer/paths.h" #include "pgstat.h" #include "portability/instr_time.h" @@ -64,6 +65,7 @@ #include "utils/memutils.h" #include "utils/pg_rusage.h" #include "utils/timestamp.h" +#include "utils/spccache.h" /* @@ -145,6 +147,9 @@ typedef struct LVRelState Relation *indrels; int nindexes; + /* prefetch */ + int io_concurrency; + /* Buffer access strategy and parallel vacuum state */ BufferAccessStrategy bstrategy; ParallelVacuumState *pvs; @@ -366,6 +371,8 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, /* Set up high level stuff about rel and its indexes */ vacrel->rel = rel; + vacrel->io_concurrency = + get_tablespace_maintenance_io_concurrency(rel->rd_rel->reltablespace); vac_open_indexes(vacrel->rel, RowExclusiveLock, &vacrel->nindexes, &vacrel->indrels); vacrel->bstrategy = 
bstrategy; @@ -827,6 +834,7 @@ lazy_scan_heap(LVRelState *vacrel) BlockNumber rel_pages = vacrel->rel_pages, blkno, next_unskippable_block, + next_prefetch_block, next_fsm_block_to_vacuum = 0; VacDeadItems *dead_items = vacrel->dead_items; Buffer vmbuffer = InvalidBuffer; @@ -849,6 +857,7 @@ lazy_scan_heap(LVRelState *vacrel) next_unskippable_block = lazy_scan_skip(vacrel, &vmbuffer, 0, &next_unskippable_allvis, &skipping_current_range); + next_prefetch_block = 0; for (blkno = 0; blkno < rel_pages; blkno++) { Buffer buf; @@ -941,6 +950,33 @@ lazy_scan_heap(LVRelState *vacrel) PROGRESS_VACUUM_PHASE_SCAN_HEAP); } + if (vacrel->io_concurrency > 0) + { + /* + * Prefetch io_concurrency blocks ahead + */ + uint32 prefetch_budget = vacrel->io_concurrency; + + /* never trail behind the current scan */ + if (next_prefetch_block < blkno) + next_prefetch_block = blkno; + + /* but only up to the end of the relation */ + if (prefetch_budget > rel_pages - next_prefetch_block) + prefetch_budget = rel_pages - next_prefetch_block; + + /* And only up to io_concurrency ahead of the current vacuum scan */ + if (next_prefetch_block + prefetch_budget > blkno + vacrel->io_concurrency) + prefetch_budget = blkno + vacrel->io_concurrency - next_prefetch_block; + + /* And only up to the next unskippable block */ + if (next_prefetch_block + prefetch_budget > next_unskippable_block) + prefetch_budget = next_unskippable_block - next_prefetch_block; + + for (; prefetch_budget-- > 0; next_prefetch_block++) + PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, next_prefetch_block); + } + /* * Pin the visibility map page in case we need to mark the page * all-visible. In most cases this will be very cheap, because we'll @@ -2413,7 +2449,8 @@ lazy_vacuum_all_indexes(LVRelState *vacrel) static void lazy_vacuum_heap_rel(LVRelState *vacrel) { - int index = 0; + int index = 0, + pindex = 0; BlockNumber vacuumed_pages = 0; Buffer vmbuffer = InvalidBuffer; LVSavedErrInfo saved_err_info; @@ -2443,6 +2480,47 @@ lazy_vacuum_heap_rel(LVRelState *vacrel) blkno = ItemPointerGetBlockNumber(&vacrel->dead_items->items[index]); vacrel->blkno = blkno; + if (vacrel->io_concurrency > 0) + { + /* + * If we're just starting out, prefetch N consecutive blocks. + * If not, only the next 1 block + */ + if (pindex == 0) + { + int prefetch_budget = Min(vacrel->dead_items->num_items, + Min(vacrel->rel_pages, + vacrel->io_concurrency)); + BlockNumber prev_prefetch = ItemPointerGetBlockNumber(&vacrel->dead_items->items[pindex]); + PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, prev_prefetch); + + while (++pindex < vacrel->dead_items->num_items && + prefetch_budget > 0) + { + ItemPointer ptr = &vacrel->dead_items->items[pindex]; + if (ItemPointerGetBlockNumber(ptr) != prev_prefetch) + { + prev_prefetch = ItemPointerGetBlockNumber(ptr); + prefetch_budget -= 1; + PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, prev_prefetch); + } + } + } + else if (pindex < vacrel->dead_items->num_items) + { + BlockNumber previous = ItemPointerGetBlockNumber(&vacrel->dead_items->items[pindex]); + while (++pindex < vacrel->dead_items->num_items) + { + BlockNumber toPrefetch = ItemPointerGetBlockNumber(&vacrel->dead_items->items[pindex]); + if (previous != toPrefetch) + { + PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, toPrefetch); + break; + } + } + } + } + /* * Pin the visibility map page in case we need to mark the page * all-visible. 
In most cases this will be very cheap, because we'll diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c index 2e18cd88bcf..b3e2dfe4dfa 100644 --- a/src/backend/access/heap/visibilitymap.c +++ b/src/backend/access/heap/visibilitymap.c @@ -299,7 +299,9 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf, * WAL record inserted above, so it would be incorrect to * update the heap page's LSN. */ - if (XLogHintBitIsNeeded()) + /* NEON: we have to update page LSN even if wal_log_hints=off + if (XLogHintBitIsNeeded()) + */ { Page heapPage = BufferGetPage(heapBuf); diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index 52e646c7f75..bf63b519924 100644 --- a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -1081,3 +1081,47 @@ item is irrelevant, and need not be stored at all. This arrangement corresponds to the fact that an L&Y non-leaf page has one more pointer than key. Suffix truncation's negative infinity attributes behave in the same way. + +Notes About Index Scan Prefetch +------------------------------- + +Prefetch can significantly improve the speed of OLAP queries. +To be able to perform prefetch, we need to know which pages will +be accessed during the scan. This is trivial for heap and bitmap scans, +but requires more effort for index scans: to implement prefetch for +index scans, we need to find out the subsequent leaf pages. + +Postgres links all pages at the same level of the B-Tree in a doubly linked list and uses this list for +forward and backward iteration. This list, however, cannot trivially be used for prefetching, because to locate the next page we first need to load the current page. To prefetch more than only the next page, we can utilize the parent page's downlinks instead, as the parent contains references to most of the target page's sibling pages. + +Because Postgres' nbtree pages have no reference to their parent page, we need to remember the parent page when descending the btree and use it to prefetch subsequent pages. We utilize the parent level's linked list to extend this prefetch scheme past the key range of the parent page. + +We should prefetch not only leaf pages, but also the next parent page. +The trick is to correctly calculate the moment when it will be needed: +we should not issue the prefetch request only after prefetch requests for all children of the current parent page have been issued, but rather when there are only effective_io_concurrency line pointers left to prefetch from the page. + +Currently there are two different prefetch implementations for +index-only scans and index scans. An index-only scan doesn't need to access heap tuples, so it prefetches +only B-Tree leaf pages (and their parents). Prefetch for an index-only scan is performed only +if a parallel plan is not used. A parallel index scan uses a critical section in which a parallel worker obtains the next +page; the leaf page is loaded inside this critical section. +If most of the time is spent loading the page, that effectively eliminates any concurrency +and makes prefetch useless. For relatively small tables Postgres will not choose a parallel plan in +any case, and for large tables a serial plan can be enforced by setting max_parallel_workers_per_gather=0. + +Prefetch for a normal (not index-only) index scan tries to prefetch the heap tuples +referenced from the leaf page. The average number of items per page +is about 100, which is comparable with the default value of effective_io_concurrency.
+So there is not much point in also prefetching the next leaf page. + +Since it is difficult to estimate the number of entries traversed by an index scan, +we prefer not to prefetch a large number of pages from the very beginning. +Such useless prefetch can reduce the performance of point lookups. +Instead, we start with the smallest prefetch distance and increase it +by INCREASE_PREFETCH_DISTANCE_STEP after processing each item, +until it reaches effective_io_concurrency. For an index-only +scan we increase the prefetch distance after processing each leaf page, +and for an index scan after processing each tuple. +The only exception is the case when no key bounds are specified. +In that case we traverse the whole relation, and it makes sense +to start with the largest possible prefetch distance from the very beginning. diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index d33f814a938..358f8cd59d0 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -2165,7 +2165,7 @@ _bt_insert_parent(Relation rel, BlockNumberIsValid(RelationGetTargetBlock(rel)))); /* Find the leftmost page at the next level up */ - pbuf = _bt_get_endpoint(rel, opaque->btpo_level + 1, false, NULL); + pbuf = _bt_get_endpoint(rel, opaque->btpo_level + 1, false, NULL, NULL); /* Set up a phony stack entry pointing there */ stack = &fakestack; stack->bts_blkno = BufferGetBlockNumber(pbuf); diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 6c5b5c69ce5..b939b9b6947 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -37,6 +37,7 @@ #include "utils/builtins.h" #include "utils/index_selfuncs.h" #include "utils/memutils.h" +#include "utils/spccache.h" /* @@ -371,6 +372,7 @@ btbeginscan(Relation rel, int nkeys, int norderbys) so->killedItems = NULL; /* until needed */ so->numKilled = 0; + so->prefetch_maximum = 0; /* disable prefetch */ /* * We don't know yet whether the scan will be index-only, so we do not @@ -912,6 +914,8 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, BTVacState vstate; BlockNumber num_pages; BlockNumber scanblkno; + BlockNumber prefetch_blkno; + int io_concurrency; bool needLock; /* @@ -951,6 +955,9 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, vstate.maxbufsize = 0; vstate.pendingpages = NULL; vstate.npendingpages = 0; + + io_concurrency = get_tablespace_maintenance_io_concurrency(rel->rd_rel->reltablespace); + /* Consider applying _bt_pendingfsm_finalize optimization */ _bt_pendingfsm_init(rel, &vstate, (callback == NULL)); @@ -982,6 +989,8 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, needLock = !RELATION_IS_LOCAL(rel); scanblkno = BTREE_METAPAGE + 1; + prefetch_blkno = scanblkno; + for (;;) { /* Get the current relation length */ @@ -998,9 +1007,19 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, /* Quit if we've scanned the whole relation */ if (scanblkno >= num_pages) break; + + if (prefetch_blkno < scanblkno) + prefetch_blkno = scanblkno; + for (; prefetch_blkno < num_pages && + prefetch_blkno < scanblkno + io_concurrency; prefetch_blkno++) + PrefetchBuffer(rel, MAIN_FORKNUM, prefetch_blkno); + /* Iterate over pages, then loop back to recheck length */ for (; scanblkno < num_pages; scanblkno++) { + if (io_concurrency > 0 && prefetch_blkno < num_pages) + PrefetchBuffer(rel, MAIN_FORKNUM, prefetch_blkno++); + btvacuumpage(&vstate, scanblkno); if
(info->report_progress) pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE, diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 3230b3b8940..cefb363acfe 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -18,11 +18,14 @@ #include "access/nbtree.h" #include "access/relscan.h" #include "access/xact.h" +#include "catalog/catalog.h" #include "miscadmin.h" +#include "optimizer/cost.h" #include "pgstat.h" #include "storage/predicate.h" #include "utils/lsyscache.h" #include "utils/rel.h" +#include "utils/spccache.h" static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp); @@ -47,6 +50,7 @@ static Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot); static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir); static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir); +#define INCREASE_PREFETCH_DISTANCE_STEP 1 /* * _bt_drop_lock_and_maybe_pin() @@ -847,6 +851,70 @@ _bt_compare(Relation rel, return 0; } + +/* + * _bt_read_parent_for_prefetch - read parent page and extract references to children for prefetch. + * This functions returns offset of first item. + */ +static int +_bt_read_parent_for_prefetch(IndexScanDesc scan, BlockNumber parent, ScanDirection dir) +{ + Relation rel = scan->indexRelation; + BTScanOpaque so = (BTScanOpaque) scan->opaque; + Buffer buf; + Page page; + BTPageOpaque opaque; + OffsetNumber offnum; + OffsetNumber n_child; + int next_parent_prefetch_index; + int i, j; + + buf = _bt_getbuf(rel, parent, BT_READ); + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + offnum = P_FIRSTDATAKEY(opaque); + n_child = PageGetMaxOffsetNumber(page) - offnum + 1; + + /* Position where we should insert prefetch of parent page: we intentionally use prefetch_maximum here instead of current_prefetch_distance, + * assuming that it will reach prefetch_maximum before we reach and of the parent page + */ + next_parent_prefetch_index = (n_child > so->prefetch_maximum) + ? n_child - so->prefetch_maximum : 0; + + if (ScanDirectionIsForward(dir)) + { + so->next_parent = opaque->btpo_next; + if (so->next_parent == P_NONE) + next_parent_prefetch_index = -1; + for (i = 0, j = 0; i < n_child; i++) + { + ItemId itemid = PageGetItemId(page, offnum + i); + IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); + if (i == next_parent_prefetch_index) + so->prefetch_blocks[j++] = so->next_parent; /* time to prefetch next parent page */ + so->prefetch_blocks[j++] = BTreeTupleGetDownLink(itup); + } + } + else + { + so->next_parent = opaque->btpo_prev; + if (so->next_parent == P_NONE) + next_parent_prefetch_index = -1; + for (i = 0, j = 0; i < n_child; i++) + { + ItemId itemid = PageGetItemId(page, offnum + n_child - i - 1); + IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); + if (i == next_parent_prefetch_index) + so->prefetch_blocks[j++] = so->next_parent; /* time to prefetch next parent page */ + so->prefetch_blocks[j++] = BTreeTupleGetDownLink(itup); + } + } + so->n_prefetch_blocks = j; + so->last_prefetch_index = 0; + _bt_relbuf(rel, buf); + return offnum; +} + /* * _bt_first() -- Find the first item in a scan. * @@ -1106,6 +1174,37 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) } } + /* Neon: initialize prefetch */ + so->n_prefetch_requests = 0; + so->n_prefetch_blocks = 0; + so->last_prefetch_index = 0; + so->next_parent = P_NONE; + so->prefetch_maximum = IsCatalogRelation(rel) + ? 
effective_io_concurrency + : get_tablespace_io_concurrency(rel->rd_rel->reltablespace); + + if (scan->xs_want_itup) /* index only scan */ + { + if (enable_indexonlyscan_prefetch) + { + /* We disable prefetch for parallel index-only scans. + * Neon prefetch is efficient only if prefetched blocks are accessed by the same worker + * that issued the prefetch request. The way pages are split between parallel workers in + * an index scan doesn't allow this requirement to be satisfied. + * Also, prefetching leaf pages is useless if the expected number of rows fits in one page. + */ + if (scan->parallel_scan) + so->prefetch_maximum = 0; /* disable prefetch */ + } + else + so->prefetch_maximum = 0; /* disable prefetch */ + } + else if (!enable_indexscan_prefetch || !scan->heapRelation) + so->prefetch_maximum = 0; /* disable prefetch */ + + /* If key bounds are not specified, then we will scan the whole relation and it makes sense to start with the largest possible prefetch distance */ + so->current_prefetch_distance = (keysCount == 0) ? so->prefetch_maximum : 0; + /* * If we found no usable boundary keys, we have to start from one end of * the tree. Walk down that edge to the first or last key, and scan from @@ -1376,6 +1475,21 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) */ stack = _bt_search(rel, NULL, &inskey, &buf, BT_READ, scan->xs_snapshot); + /* Start prefetching for index only scan */ + if (so->prefetch_maximum > 0 && stack != NULL && scan->xs_want_itup) /* index only scan */ + { + int first_offset = _bt_read_parent_for_prefetch(scan, stack->bts_blkno, dir); + int skip = ScanDirectionIsForward(dir) + ? stack->bts_offset - first_offset + : first_offset + so->n_prefetch_blocks - 1 - stack->bts_offset; + Assert(so->n_prefetch_blocks >= skip); + so->current_prefetch_distance = INCREASE_PREFETCH_DISTANCE_STEP; + so->n_prefetch_requests = Min(so->current_prefetch_distance, so->n_prefetch_blocks - skip); + so->last_prefetch_index = skip + so->n_prefetch_requests; + for (int j = skip; j < so->last_prefetch_index; j++) + PrefetchBuffer(rel, MAIN_FORKNUM, so->prefetch_blocks[j]); + } + /* don't need to keep the stack around... */ _bt_freestack(stack); @@ -1515,8 +1629,64 @@ _bt_next(IndexScanDesc scan, ScanDirection dir) /* OK, itemIndex says what to return */ currItem = &so->currPos.items[so->currPos.itemIndex]; scan->xs_heaptid = currItem->heapTid; - if (scan->xs_want_itup) - scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset); + + if (scan->xs_want_itup) /* index-only scan */ + { + scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset); + } + else if (so->prefetch_maximum > 0) + { + int prefetchLimit, prefetchDistance; + + /* Neon: prefetch referenced heap pages. + * Since it is difficult to predict how many items an index scan will return, + * we do not want to prefetch many heap pages from the very beginning because + * they may not be needed. So we are going to increase the prefetch distance by INCREASE_PREFETCH_DISTANCE_STEP + * at each index scan iteration until it reaches prefetch_maximum.
+ */ + + /* Advance prefetch distance until it reaches prefetch_maximum */ + if (so->current_prefetch_distance + INCREASE_PREFETCH_DISTANCE_STEP <= so->prefetch_maximum) + so->current_prefetch_distance += INCREASE_PREFETCH_DISTANCE_STEP; + else + so->current_prefetch_distance = so->prefetch_maximum; + + /* How much we can prefetch */ + prefetchLimit = Min(so->current_prefetch_distance, so->currPos.lastItem - so->currPos.firstItem + 1); + + /* Active prefetch requests */ + prefetchDistance = so->n_prefetch_requests; + + /* + * Consume one prefetch request (if any) + */ + if (prefetchDistance != 0) + prefetchDistance -= 1; + + /* Keep the number of active prefetch requests equal to the current prefetch distance. + * When the prefetch distance reaches the prefetch maximum, this loop performs at most one iteration, + * but at the beginning of an index scan it performs up to INCREASE_PREFETCH_DISTANCE_STEP+1 iterations + */ + if (ScanDirectionIsForward(dir)) + { + while (prefetchDistance < prefetchLimit && so->currPos.itemIndex + prefetchDistance <= so->currPos.lastItem) + { + BlockNumber blkno = BlockIdGetBlockNumber(&so->currPos.items[so->currPos.itemIndex + prefetchDistance].heapTid.ip_blkid); + PrefetchBuffer(scan->heapRelation, MAIN_FORKNUM, blkno); + prefetchDistance += 1; + } + } + else + { + while (prefetchDistance < prefetchLimit && so->currPos.itemIndex - prefetchDistance >= so->currPos.firstItem) + { + BlockNumber blkno = BlockIdGetBlockNumber(&so->currPos.items[so->currPos.itemIndex - prefetchDistance].heapTid.ip_blkid); + PrefetchBuffer(scan->heapRelation, MAIN_FORKNUM, blkno); + prefetchDistance += 1; + } + } + so->n_prefetch_requests = prefetchDistance; /* update number of active prefetch requests */ + } return true; } @@ -1924,6 +2094,30 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) so->markItemIndex = -1; } + if (scan->xs_want_itup && so->prefetch_maximum > 0) /* Prefetching of leaf pages for index-only scan */ + { + /* Advance prefetch distance until it reaches prefetch_maximum */ + if (so->current_prefetch_distance + INCREASE_PREFETCH_DISTANCE_STEP <= so->prefetch_maximum) + so->current_prefetch_distance += INCREASE_PREFETCH_DISTANCE_STEP; + + so->n_prefetch_requests -= 1; /* we load the next leaf page, so decrement the number of active prefetch requests */ + + /* Check if there are more children to prefetch at the current parent page */ + if (so->last_prefetch_index == so->n_prefetch_blocks && so->next_parent != P_NONE) + { + /* we have prefetched all items from the current parent page, let's move to the next parent page */ + _bt_read_parent_for_prefetch(scan, so->next_parent, dir); + so->n_prefetch_requests -= 1; /* loading the parent page consumes one more prefetch request */ + } + + /* Try to keep the number of active prefetch requests equal to the current prefetch distance */ + while (so->n_prefetch_requests < so->current_prefetch_distance && so->last_prefetch_index < so->n_prefetch_blocks) + { + so->n_prefetch_requests += 1; + PrefetchBuffer(scan->indexRelation, MAIN_FORKNUM, so->prefetch_blocks[so->last_prefetch_index++]); + } + } + if (ScanDirectionIsForward(dir)) { /* Walk right to the next page with data */ @@ -2328,7 +2522,7 @@ _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot) */ Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost, - Snapshot snapshot) + BlockNumber* parent, Snapshot snapshot) { Buffer buf; Page page; @@ -2336,6 +2530,7 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost, OffsetNumber offnum; BlockNumber blkno; IndexTuple itup; + BlockNumber
parent_blocknum = P_NONE; /* * If we are looking for a leaf page, okay to descend from fast root; @@ -2353,6 +2548,7 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost, page = BufferGetPage(buf); TestForOldSnapshot(snapshot, rel, page); opaque = BTPageGetOpaque(page); + blkno = BufferGetBlockNumber(buf); for (;;) { @@ -2391,6 +2587,7 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost, offnum = P_FIRSTDATAKEY(opaque); itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); + parent_blocknum = blkno; blkno = BTreeTupleGetDownLink(itup); buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ); @@ -2398,6 +2595,9 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost, opaque = BTPageGetOpaque(page); } + if (parent) + *parent = parent_blocknum; + return buf; } @@ -2420,13 +2620,15 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) BTPageOpaque opaque; OffsetNumber start; BTScanPosItem *currItem; + BlockNumber parent; /* * Scan down to the leftmost or rightmost leaf page. This is a simplified * version of _bt_search(). We don't maintain a stack since we know we * won't need it. */ - buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir), scan->xs_snapshot); + buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir), &parent, + scan->xs_snapshot); if (!BufferIsValid(buf)) { @@ -2439,6 +2641,17 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) return false; } + /* Start prefetching for index-only scan */ + if (so->prefetch_maximum > 0 && parent != P_NONE && scan->xs_want_itup) /* index only scan */ + { + _bt_read_parent_for_prefetch(scan, parent, dir); + so->n_prefetch_requests = so->last_prefetch_index = + Min(so->prefetch_maximum, so->n_prefetch_blocks); + + for (int i = 0; i < so->last_prefetch_index; i++) + PrefetchBuffer(rel, MAIN_FORKNUM, so->prefetch_blocks[i]); + } + PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot); page = BufferGetPage(buf); opaque = BTPageGetOpaque(page); diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c index 4443f1918df..7484149cbb7 100644 --- a/src/backend/access/spgist/spginsert.c +++ b/src/backend/access/spgist/spginsert.c @@ -85,6 +85,8 @@ spgbuild(Relation heap, Relation index, IndexInfo *indexInfo) elog(ERROR, "index \"%s\" already contains data", RelationGetRelationName(index)); + smgr_start_unlogged_build(index->rd_smgr); + /* * Initialize the meta page and root pages */ @@ -131,6 +133,8 @@ spgbuild(Relation heap, Relation index, IndexInfo *indexInfo) SpGistUpdateMetaPage(index); + smgr_finish_unlogged_build_phase_1(index->rd_smgr); + /* * We didn't write WAL records as we built the index, so if WAL-logging is * required, write all pages to the WAL now. 
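The nbtree hunks above grow the prefetch distance gradually instead of issuing prefetch_maximum requests up front: the distance rises by INCREASE_PREFETCH_DISTANCE_STEP per returned item, and the scan keeps roughly that many requests in flight. A minimal standalone sketch of the policy, assuming a hypothetical issue_prefetch() callback in place of PrefetchBuffer():

#define INCREASE_PREFETCH_DISTANCE_STEP 1

/* hypothetical stand-in for PrefetchBuffer(); not part of this patch */
typedef void (*prefetch_fn) (unsigned int blkno);

/*
 * Keep the number of in-flight prefetch requests close to a distance that
 * grows by INCREASE_PREFETCH_DISTANCE_STEP per scan step, capped at 'maximum'.
 * Returns the updated number of in-flight requests.
 */
static int
advance_prefetch(int *distance, int maximum, int n_requests,
                 unsigned int next_blkno, unsigned int last_blkno,
                 prefetch_fn issue)
{
    /* grow the distance one step per call, up to the configured maximum */
    if (*distance + INCREASE_PREFETCH_DISTANCE_STEP <= maximum)
        *distance += INCREASE_PREFETCH_DISTANCE_STEP;
    else
        *distance = maximum;

    /* one request was consumed by the block we are about to read */
    if (n_requests > 0)
        n_requests--;

    /* top up in-flight requests until we hit the distance or run out of blocks */
    while (n_requests < *distance && next_blkno + (unsigned int) n_requests <= last_blkno)
    {
        issue(next_blkno + (unsigned int) n_requests);
        n_requests++;
    }
    return n_requests;
}

The same shape appears several times in the patch: against heap blocks in _bt_next(), and against leaf and parent index blocks in _bt_first(), _bt_endpoint() and _bt_steppage().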
@@ -140,8 +144,13 @@ spgbuild(Relation heap, Relation index, IndexInfo *indexInfo) log_newpage_range(index, MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index), true); + SetLastWrittenLSNForBlockRange(XactLastRecEnd, index->rd_smgr->smgr_rlocator.locator, + MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index)); + SetLastWrittenLSNForRelation(XactLastRecEnd, index->rd_smgr->smgr_rlocator.locator, MAIN_FORKNUM); } + smgr_end_unlogged_build(index->rd_smgr); + result = (IndexBuildResult *) palloc0(sizeof(IndexBuildResult)); result->heap_tuples = reltuples; result->index_tuples = buildstate.indtuples; diff --git a/src/backend/access/spgist/spgvacuum.c b/src/backend/access/spgist/spgvacuum.c index 8a5b540c809..71d770f0e44 100644 --- a/src/backend/access/spgist/spgvacuum.c +++ b/src/backend/access/spgist/spgvacuum.c @@ -27,6 +27,7 @@ #include "storage/indexfsm.h" #include "storage/lmgr.h" #include "utils/snapmgr.h" +#include "utils/spccache.h" /* Entry in pending-list of TIDs we need to revisit */ @@ -796,7 +797,14 @@ spgvacuumscan(spgBulkDeleteState *bds) Relation index = bds->info->index; bool needLock; BlockNumber num_pages, - blkno; + blkno, + prefetch_blkno; + int io_concurrency; + + /* initiate concurrency */ + io_concurrency = get_tablespace_maintenance_io_concurrency( + index->rd_rel->reltablespace + ); /* Finish setting up spgBulkDeleteState */ initSpGistState(&bds->spgstate, index); @@ -824,6 +832,8 @@ spgvacuumscan(spgBulkDeleteState *bds) * in btvacuumscan(). */ blkno = SPGIST_METAPAGE_BLKNO + 1; + prefetch_blkno = blkno; + for (;;) { /* Get the current relation length */ @@ -836,9 +846,19 @@ spgvacuumscan(spgBulkDeleteState *bds) /* Quit if we've scanned the whole relation */ if (blkno >= num_pages) break; + + if (prefetch_blkno < blkno) + prefetch_blkno = blkno; + for (; prefetch_blkno < num_pages && + prefetch_blkno < blkno + io_concurrency; prefetch_blkno++) + PrefetchBuffer(index, MAIN_FORKNUM, prefetch_blkno); + /* Iterate over pages, then loop back to recheck length */ for (; blkno < num_pages; blkno++) { + if (io_concurrency > 0 && prefetch_blkno < num_pages) + PrefetchBuffer(index, MAIN_FORKNUM, prefetch_blkno++); + spgvacuumpage(bds, blkno); /* empty the pending-list after each page */ if (bds->pendingList != NULL) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 8b0710abe60..f25fe14118d 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -85,6 +85,7 @@ #include "replication/walreceiver.h" #include "replication/walsender.h" #include "storage/bufmgr.h" +#include "storage/buf_internals.h" #include "storage/fd.h" #include "storage/ipc.h" #include "storage/large_object.h" @@ -138,6 +139,8 @@ int wal_retrieve_retry_interval = 5000; int max_slot_wal_keep_size_mb = -1; int wal_decode_buffer_size = 512 * 1024; bool track_wal_io_timing = false; +uint64 predefined_sysidentifier; +int lastWrittenLsnCacheSize; #ifdef WAL_DEBUG bool XLOG_DEBUG = false; @@ -204,6 +207,25 @@ const struct config_enum_entry archive_mode_options[] = { {NULL, 0, false} }; +typedef struct LastWrittenLsnCacheEntry +{ + BufferTag key; + XLogRecPtr lsn; + /* double linked list for LRU replacement algorithm */ + dlist_node lru_node; +} LastWrittenLsnCacheEntry; + + +/* + * Cache of last written LSN for each relation page. + * Also to provide request LSN for smgrnblocks, smgrexists there is pseudokey=InvalidBlockId which stores LSN of last + * relation metadata update. 
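The spgvacuumscan() hunk above keeps a prefetch cursor up to io_concurrency blocks ahead of the block being vacuumed and tops the window up by one block per page processed. A simplified sketch of that sliding window, with vacuum_page() and prefetch_page() as illustrative stand-ins for spgvacuumpage() and PrefetchBuffer():

typedef unsigned int BlockNumber;

/* illustrative stand-ins for spgvacuumpage() and PrefetchBuffer() */
extern void vacuum_page(BlockNumber blkno);
extern void prefetch_page(BlockNumber blkno);

static void
vacuum_with_prefetch(BlockNumber start, BlockNumber num_pages, int io_concurrency)
{
    BlockNumber blkno = start;
    BlockNumber prefetch_blkno = start;

    /* prime the window: stay at most io_concurrency blocks ahead */
    for (; prefetch_blkno < num_pages &&
           prefetch_blkno < blkno + io_concurrency; prefetch_blkno++)
        prefetch_page(prefetch_blkno);

    for (; blkno < num_pages; blkno++)
    {
        /* keep the window full: issue one new prefetch per page processed */
        if (io_concurrency > 0 && prefetch_blkno < num_pages)
            prefetch_page(prefetch_blkno++);

        vacuum_page(blkno);
    }
}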
+ * Size of the cache is limited by GUC variable lastWrittenLsnCacheSize ("lsn_cache_size"), + * pages are replaced using LRU algorithm, based on L2-list. + * Access to this cache is protected by 'LastWrittenLsnLock'. + */ +static HTAB *lastWrittenLsnCache; + /* * Statistics for current checkpoint are collected in this global struct. * Because only the checkpointer or a stand-alone backend can perform @@ -556,6 +578,26 @@ typedef struct XLogCtlData */ XLogRecPtr lastFpwDisableRecPtr; + /* + * Maximal last written LSN for pages not present in lastWrittenLsnCache + */ + XLogRecPtr maxLastWrittenLsn; + + /* + * Double linked list to implement LRU replacement policy for last written LSN cache. + * Access to this list as well as to last written LSN cache is protected by 'LastWrittenLsnLock'. + */ + dlist_head lastWrittenLsnLRU; + + /* neon: copy of startup's RedoStartLSN for walproposer's use */ + XLogRecPtr RedoStartLSN; + + /* + * size of a timeline in zenith pageserver. + * used to enforce timeline size limit. + */ + uint64 zenithCurrentClusterSize; + slock_t info_lck; /* locks shared variables shown above */ } XLogCtlData; @@ -638,6 +680,15 @@ static bool holdingAllLocks = false; static MemoryContext walDebugCxt = NULL; #endif + +/* + * Variables read from 'zenith.signal' file. + */ +bool ZenithRecoveryRequested = false; +XLogRecPtr zenithLastRec = InvalidXLogRecPtr; +bool zenithWriteOk = false; + + static void CleanupAfterArchiveRecovery(TimeLineID EndOfLogTLI, XLogRecPtr EndOfLog, TimeLineID newTLI); @@ -3281,11 +3332,15 @@ InstallXLogFileSegment(XLogSegNo *segno, char *tmppath, XLogFilePath(path, tli, *segno, wal_segment_size); - LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); - if (!XLogCtl->InstallXLogFileSegmentActive) - { - LWLockRelease(ControlFileLock); - return false; + if (XLogCtl) + { + /* Neon: in case of sync-safekeepers shared memory is not inialized */ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + if (!XLogCtl->InstallXLogFileSegmentActive) + { + LWLockRelease(ControlFileLock); + return false; + } } if (!find_free) @@ -3301,7 +3356,8 @@ InstallXLogFileSegment(XLogSegNo *segno, char *tmppath, if ((*segno) >= max_segno) { /* Failed to find a free slot within specified range */ - LWLockRelease(ControlFileLock); + if (XLogCtl) + LWLockRelease(ControlFileLock); return false; } (*segno)++; @@ -3312,12 +3368,14 @@ InstallXLogFileSegment(XLogSegNo *segno, char *tmppath, Assert(access(path, F_OK) != 0 && errno == ENOENT); if (durable_rename(tmppath, path, LOG) != 0) { - LWLockRelease(ControlFileLock); + if (XLogCtl) + LWLockRelease(ControlFileLock); /* durable_rename already emitted log message */ return false; } - LWLockRelease(ControlFileLock); + if (XLogCtl) + LWLockRelease(ControlFileLock); return true; } @@ -4480,11 +4538,8 @@ GetActiveWalLevelOnStandby(void) return ControlFile->wal_level; } -/* - * Initialization of shared memory for XLOG - */ -Size -XLOGShmemSize(void) +static Size +XLOGCtlShmemSize(void) { Size size; @@ -4533,6 +4588,16 @@ XLOGShmemSize(void) return size; } +/* + * Initialization of shared memory for XLOG + */ +Size +XLOGShmemSize(void) +{ + return XLOGCtlShmemSize() + + hash_estimate_size(lastWrittenLsnCacheSize, sizeof(LastWrittenLsnCacheEntry)); +} + void XLOGShmemInit(void) { @@ -4560,7 +4625,18 @@ XLOGShmemInit(void) XLogCtl = (XLogCtlData *) - ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog); + ShmemInitStruct("XLOG Ctl", XLOGCtlShmemSize(), &foundXLog); + + if (lastWrittenLsnCacheSize > 0) + { + static HASHCTL info; + info.keysize = 
sizeof(BufferTag); + info.entrysize = sizeof(LastWrittenLsnCacheEntry); + lastWrittenLsnCache = ShmemInitHash("last_written_lsn_cache", + lastWrittenLsnCacheSize, lastWrittenLsnCacheSize, + &info, + HASH_ELEM | HASH_BLOBS); + } localControlFile = ControlFile; ControlFile = (ControlFileData *) @@ -4670,10 +4746,17 @@ BootStrapXLOG(void) * determine the initialization time of the installation, which could * perhaps be useful sometimes. */ - gettimeofday(&tv, NULL); - sysidentifier = ((uint64) tv.tv_sec) << 32; - sysidentifier |= ((uint64) tv.tv_usec) << 12; - sysidentifier |= getpid() & 0xFFF; + if (predefined_sysidentifier != 0) + { + sysidentifier = predefined_sysidentifier; + } + else + { + gettimeofday(&tv, NULL); + sysidentifier = ((uint64) tv.tv_sec) << 32; + sysidentifier |= ((uint64) tv.tv_usec) << 12; + sysidentifier |= getpid() & 0xFFF; + } /* page buffer must be aligned suitably for O_DIRECT */ buffer = (char *) palloc(XLOG_BLCKSZ + XLOG_BLCKSZ); @@ -5026,6 +5109,81 @@ CheckRequiredParameterValues(void) } } +static void +readZenithSignalFile(void) +{ + int fd; + + fd = BasicOpenFile(ZENITH_SIGNAL_FILE, O_RDONLY | PG_BINARY); + if (fd >= 0) + { + struct stat statbuf; + char *content; + char prev_lsn_str[20]; + + /* Slurp the file into a string */ + if (stat(ZENITH_SIGNAL_FILE, &statbuf) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", + ZENITH_SIGNAL_FILE))); + content = palloc(statbuf.st_size + 1); + if (read(fd, content, statbuf.st_size) != statbuf.st_size) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + ZENITH_SIGNAL_FILE))); + content[statbuf.st_size] = '\0'; + + /* Parse it */ + if (sscanf(content, "PREV LSN: %19s", prev_lsn_str) != 1) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("invalid data in file \"%s\"", ZENITH_SIGNAL_FILE))); + + if (strcmp(prev_lsn_str, "invalid") == 0) + { + /* No prev LSN. Forbid starting up in read-write mode */ + zenithLastRec = InvalidXLogRecPtr; + zenithWriteOk = false; + } + else if (strcmp(prev_lsn_str, "none") == 0) + { + /* + * The page server had no valid prev LSN, but assured that it's ok + * to start without it. This happens when you start the compute + * node for the first time on a new branch. + */ + zenithLastRec = InvalidXLogRecPtr; + zenithWriteOk = true; + } + else + { + uint32 hi, + lo; + + if (sscanf(prev_lsn_str, "%X/%X", &hi, &lo) != 2) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("invalid data in file \"%s\"", ZENITH_SIGNAL_FILE))); + zenithLastRec = ((uint64) hi) << 32 | lo; + + /* If prev LSN is given, it better be valid */ + if (zenithLastRec == InvalidXLogRecPtr) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("invalid prev-LSN in file \"%s\"", ZENITH_SIGNAL_FILE))); + zenithWriteOk = true; + } + ZenithRecoveryRequested = true; + close(fd); + + elog(LOG, + "[ZENITH] found 'zenith.signal' file. setting prev LSN to %X/%X", + LSN_FORMAT_ARGS(zenithLastRec)); + } +} + /* * This must be called ONCE during postmaster or standalone-backend startup */ @@ -5057,10 +5215,15 @@ StartupXLOG(void) CurrentResourceOwner == AuxProcessResourceOwner); CurrentResourceOwner = AuxProcessResourceOwner; + /* + * Read zenith.signal before anything else. + */ + readZenithSignalFile(); + /* * Check that contents look valid. 
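readZenithSignalFile() above expects zenith.signal to carry a single "PREV LSN: <value>" line, where <value> is either "invalid" (read-only start), "none" (first start on a new branch, writes allowed), or a concrete LSN in %X/%X form. A minimal standalone parser for that format; the helper name and return convention are illustrative, only the accepted strings follow the patch:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef uint64_t XLogRecPtr;
#define InvalidXLogRecPtr ((XLogRecPtr) 0)

/*
 * Parse the content of zenith.signal.  Returns 0 on success, -1 on bad input.
 * *last_rec receives the prev LSN (or invalid); *write_ok says whether the
 * compute node may generate new WAL.
 */
static int
parse_zenith_signal(const char *content, XLogRecPtr *last_rec, int *write_ok)
{
    char        prev_lsn_str[20];
    unsigned int hi, lo;

    if (sscanf(content, "PREV LSN: %19s", prev_lsn_str) != 1)
        return -1;

    if (strcmp(prev_lsn_str, "invalid") == 0)
    {
        *last_rec = InvalidXLogRecPtr;  /* read-only start */
        *write_ok = 0;
    }
    else if (strcmp(prev_lsn_str, "none") == 0)
    {
        *last_rec = InvalidXLogRecPtr;  /* new branch, no prev record yet */
        *write_ok = 1;
    }
    else
    {
        if (sscanf(prev_lsn_str, "%X/%X", &hi, &lo) != 2)
            return -1;
        *last_rec = ((XLogRecPtr) hi) << 32 | lo;
        *write_ok = 1;
    }
    return 0;
}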
*/ - if (!XRecOffIsValid(ControlFile->checkPoint)) + if (!XRecOffIsValid(ControlFile->checkPoint) && !ZenithRecoveryRequested) ereport(FATAL, (errmsg("control file contains invalid checkpoint location"))); @@ -5289,6 +5452,14 @@ StartupXLOG(void) RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo; doPageWrites = lastFullPageWrites; + /* + * Setup last written lsn cache, max written LSN. + * Starting from here, we could be modifying pages through REDO, which requires + * the existance of maxLwLsn + LwLsn LRU. + */ + XLogCtl->maxLastWrittenLsn = RedoRecPtr; + dlist_init(&XLogCtl->lastWrittenLsnLRU); + /* REDO */ if (InRecovery) { @@ -6081,6 +6252,186 @@ GetInsertRecPtr(void) return recptr; } +/* + * GetLastWrittenLSN -- Returns maximal LSN of written page. + * It returns an upper bound for the last written LSN of a given page, + * either from a cached last written LSN or a global maximum last written LSN. + * If rnode is InvalidOid then we calculate maximum among all cached LSN and maxLastWrittenLsn. + * If cache is large enough, iterating through all hash items may be rather expensive. + * But GetLastWrittenLSN(InvalidOid) is used only by zenith_dbsize which is not performance critical. + */ +XLogRecPtr +GetLastWrittenLSN(RelFileLocator rlocator, ForkNumber forknum, BlockNumber blkno) +{ + XLogRecPtr lsn; + LastWrittenLsnCacheEntry* entry; + + Assert(lastWrittenLsnCacheSize != 0); + + LWLockAcquire(LastWrittenLsnLock, LW_SHARED); + + /* Maximal last written LSN among all non-cached pages */ + lsn = XLogCtl->maxLastWrittenLsn; + + if (rlocator.relNumber != InvalidOid) + { + BufferTag key; + key.spcOid = rlocator.spcOid; + key.dbOid = rlocator.dbOid; + key.relNumber = rlocator.relNumber; + key.forkNum = forknum; + key.blockNum = blkno; + entry = hash_search(lastWrittenLsnCache, &key, HASH_FIND, NULL); + if (entry != NULL) + lsn = entry->lsn; + } + else + { + HASH_SEQ_STATUS seq; + /* Find maximum of all cached LSNs */ + hash_seq_init(&seq, lastWrittenLsnCache); + while ((entry = (LastWrittenLsnCacheEntry *) hash_seq_search(&seq)) != NULL) + { + if (entry->lsn > lsn) + lsn = entry->lsn; + } + } + LWLockRelease(LastWrittenLsnLock); + + return lsn; +} + +/* + * SetLastWrittenLSNForBlockRange -- Set maximal LSN of written page range. + * We maintain cache of last written LSNs with limited size and LRU replacement + * policy. Keeping last written LSN for each page allows to use old LSN when + * requesting pages of unchanged or appended relations. Also it is critical for + * efficient work of prefetch in case massive update operations (like vacuum or remove). + * + * rlocator.relNumber can be InvalidOid, in this case maxLastWrittenLsn is updated. + * SetLastWrittenLsn with dummy rlocator is used by createdb and dbase_redo functions. 
+ */ +void +SetLastWrittenLSNForBlockRange(XLogRecPtr lsn, RelFileLocator rlocator, ForkNumber forknum, BlockNumber from, BlockNumber n_blocks) +{ + if (lsn == InvalidXLogRecPtr || n_blocks == 0 || lastWrittenLsnCacheSize == 0) + return; + + LWLockAcquire(LastWrittenLsnLock, LW_EXCLUSIVE); + if (rlocator.relNumber == InvalidOid) + { + if (lsn > XLogCtl->maxLastWrittenLsn) + XLogCtl->maxLastWrittenLsn = lsn; + } + else + { + LastWrittenLsnCacheEntry* entry; + BufferTag key; + bool found; + BlockNumber i; + + key.spcOid = rlocator.spcOid; + key.dbOid = rlocator.dbOid; + key.relNumber = rlocator.relNumber; + key.forkNum = forknum; + for (i = 0; i < n_blocks; i++) + { + key.blockNum = from + i; + entry = hash_search(lastWrittenLsnCache, &key, HASH_ENTER, &found); + if (found) + { + if (lsn > entry->lsn) + entry->lsn = lsn; + /* Unlink from LRU list */ + dlist_delete(&entry->lru_node); + } + else + { + entry->lsn = lsn; + if (hash_get_num_entries(lastWrittenLsnCache) > lastWrittenLsnCacheSize) + { + /* Replace least recently used entry */ + LastWrittenLsnCacheEntry* victim = dlist_container(LastWrittenLsnCacheEntry, lru_node, dlist_pop_head_node(&XLogCtl->lastWrittenLsnLRU)); + /* Adjust max LSN for not cached relations/chunks if needed */ + if (victim->lsn > XLogCtl->maxLastWrittenLsn) + XLogCtl->maxLastWrittenLsn = victim->lsn; + + hash_search(lastWrittenLsnCache, victim, HASH_REMOVE, NULL); + } + } + /* Link to the end of LRU list */ + dlist_push_tail(&XLogCtl->lastWrittenLsnLRU, &entry->lru_node); + } + } + LWLockRelease(LastWrittenLsnLock); +} + +/* + * SetLastWrittenLSNForBlock -- Set maximal LSN for block + */ +void +SetLastWrittenLSNForBlock(XLogRecPtr lsn, RelFileLocator rlocator, ForkNumber forknum, BlockNumber blkno) +{ + SetLastWrittenLSNForBlockRange(lsn, rlocator, forknum, blkno, 1); +} + +/* + * SetLastWrittenLSNForRelation -- Set maximal LSN for relation metadata + */ +void +SetLastWrittenLSNForRelation(XLogRecPtr lsn, RelFileLocator rlocator, ForkNumber forknum) +{ + SetLastWrittenLSNForBlock(lsn, rlocator, forknum, REL_METADATA_PSEUDO_BLOCKNO); +} + +/* + * SetLastWrittenLSNForDatabase -- Set maximal LSN for the whole database + */ +void +SetLastWrittenLSNForDatabase(XLogRecPtr lsn) +{ + RelFileLocator dummyNode = {InvalidOid, InvalidOid, InvalidOid}; + SetLastWrittenLSNForBlock(lsn, dummyNode, MAIN_FORKNUM, 0); +} + +void +SetRedoStartLsn(XLogRecPtr RedoStartLSN) +{ + XLogCtl->RedoStartLSN = RedoStartLSN; +} + +/* + * RedoStartLsn is set only once by startup process, locking is not required + * after its exit. + */ +XLogRecPtr +GetRedoStartLsn(void) +{ + return XLogCtl->RedoStartLSN; +} + + +uint64 +GetZenithCurrentClusterSize(void) +{ + uint64 size; + SpinLockAcquire(&XLogCtl->info_lck); + size = XLogCtl->zenithCurrentClusterSize; + SpinLockRelease(&XLogCtl->info_lck); + + return size; +} + + +void +SetZenithCurrentClusterSize(uint64 size) +{ + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->zenithCurrentClusterSize = size; + SpinLockRelease(&XLogCtl->info_lck); +} + + /* * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL * position known to be fsync'd to disk. 
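SetLastWrittenLSNForBlockRange() above pairs a hash table with an intrusive doubly linked list: a cache hit is unlinked and re-appended at the tail, and on overflow the head (least recently used) entry is evicted, with its LSN folded into maxLastWrittenLsn so the cache never under-reports. A condensed standalone sketch of that update step; the cache_*/lru_* helpers are illustrative stand-ins for PostgreSQL's HTAB and dlist primitives:

#include <stdbool.h>
#include <stdint.h>

typedef uint64_t XLogRecPtr;

typedef struct CacheEntry
{
    XLogRecPtr  lsn;
    struct CacheEntry *lru_prev;    /* intrusive LRU list links */
    struct CacheEntry *lru_next;
} CacheEntry;

/* illustrative stand-ins for the HTAB/dlist operations used by the patch */
extern CacheEntry *cache_lookup_or_insert(uint64_t key, bool *found);
extern void cache_remove(CacheEntry *entry);
extern long cache_size(void);
extern void lru_unlink(CacheEntry *entry);
extern void lru_push_tail(CacheEntry *entry);
extern CacheEntry *lru_pop_head(void);

static XLogRecPtr max_uncached_lsn;     /* plays the role of maxLastWrittenLsn */

static void
set_last_written_lsn(uint64_t key, XLogRecPtr lsn, long capacity)
{
    bool        found;
    CacheEntry *entry = cache_lookup_or_insert(key, &found);

    if (found)
    {
        if (lsn > entry->lsn)
            entry->lsn = lsn;
        lru_unlink(entry);              /* re-appended at the tail below */
    }
    else
    {
        entry->lsn = lsn;
        if (cache_size() > capacity)
        {
            /* evict the LRU victim; keep its LSN as an upper bound */
            CacheEntry *victim = lru_pop_head();

            if (victim->lsn > max_uncached_lsn)
                max_uncached_lsn = victim->lsn;
            cache_remove(victim);
        }
    }
    lru_push_tail(entry);               /* mark most recently used */
}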
This should only be used on a @@ -6089,8 +6440,6 @@ GetInsertRecPtr(void) XLogRecPtr GetFlushRecPtr(TimeLineID *insertTLI) { - Assert(XLogCtl->SharedRecoveryState == RECOVERY_STATE_DONE); - SpinLockAcquire(&XLogCtl->info_lck); LogwrtResult = XLogCtl->LogwrtResult; SpinLockRelease(&XLogCtl->info_lck); @@ -7961,6 +8310,7 @@ xlog_redo(XLogReaderState *record) for (uint8 block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++) { Buffer buffer; + XLogRedoAction result; if (!XLogRecHasBlockImage(record, block_id)) { @@ -7969,9 +8319,23 @@ xlog_redo(XLogReaderState *record) continue; } - if (XLogReadBufferForRedo(record, block_id, &buffer) != BLK_RESTORED) + result = XLogReadBufferForRedo(record, block_id, &buffer); + + if (result == BLK_DONE && (!IsUnderPostmaster || StandbyMode)) + { + /* + * NEON: In the special WAL redo process, blocks that are being + * ignored return BLK_DONE. Accept that. + * Additionally, in standby mode, blocks that are not present + * in shared buffers are ignored during replay, so we also + * ignore those blocks. + */ + } + else if (result != BLK_RESTORED) elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block"); - UnlockReleaseBuffer(buffer); + + if (buffer != InvalidBuffer) + UnlockReleaseBuffer(buffer); } } else if (info == XLOG_BACKUP_END) diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index 258cbd70355..a8901e1402f 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -37,9 +37,11 @@ #include "miscadmin.h" #include "pg_trace.h" #include "replication/origin.h" +#include "replication/walsender.h" #include "storage/bufmgr.h" #include "storage/proc.h" #include "utils/memutils.h" +#include "utils/wait_event.h" /* * Guess the maximum buffer size required to store a compressed version of @@ -87,6 +89,11 @@ typedef struct char compressed_page[COMPRESS_BUFSIZE]; } registered_buffer; +/* GUCs */ +int max_replication_apply_lag; +int max_replication_flush_lag; +int max_replication_write_lag; + static registered_buffer *registered_buffers; static int max_registered_buffers; /* allocated size */ static int max_registered_block_id = 0; /* highest block_id + 1 currently @@ -488,6 +495,11 @@ XLogInsert(RmgrId rmid, uint8 info) return EndPos; } + if (delay_backend_us != NULL && delay_backend_us() > 0) + { + InterruptPending = true; + } + do { XLogRecPtr RedoRecPtr; diff --git a/src/backend/access/transam/xlogprefetcher.c b/src/backend/access/transam/xlogprefetcher.c index 539928cb854..1712dc6c7c1 100644 --- a/src/backend/access/transam/xlogprefetcher.c +++ b/src/backend/access/transam/xlogprefetcher.c @@ -721,8 +721,10 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn) * We could try to have a fast path for repeated references to the * same relation (with some scheme to handle invalidations * safely), but for now we'll call smgropen() every time. + * + * Only permanent relations are WAL-logged, so RELPERSISTENCE_PERMANENT. 
*/ - reln = smgropen(block->rlocator, InvalidBackendId); + reln = smgropen(block->rlocator, InvalidBackendId, RELPERSISTENCE_PERMANENT); /* * If the relation file doesn't exist on disk, for example because diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c index a1363e3b8f3..5f46b007acc 100644 --- a/src/backend/access/transam/xlogreader.c +++ b/src/backend/access/transam/xlogreader.c @@ -232,7 +232,7 @@ WALOpenSegmentInit(WALOpenSegment *seg, WALSegmentContext *segcxt, void XLogBeginRead(XLogReaderState *state, XLogRecPtr RecPtr) { - Assert(!XLogRecPtrIsInvalid(RecPtr)); + Assert(!XLogRecPtrIsInvalid(RecPtr) || state->skip_lsn_checks); ResetDecoder(state); @@ -256,6 +256,14 @@ XLogReleasePreviousRecord(XLogReaderState *state) if (!state->record) return InvalidXLogRecPtr; +#define SKIP_INVALID_RECORD(rec_ptr) do { \ + rec_ptr = MAXALIGN(rec_ptr + 1); \ + if (rec_ptr % XLOG_BLCKSZ <= MAXALIGN(1)) \ + goto restart; \ + else \ + goto skip_invalid; \ + } while (0); + /* * Remove it from the decoded record queue. It must be the oldest item * decoded, decode_queue_head. @@ -436,7 +444,7 @@ XLogReadRecord(XLogReaderState *state, char **errormsg) * Return NULL if there is no space in the decode buffer and allow_oversized * is false, or if memory allocation fails for an oversized buffer. */ -static DecodedXLogRecord * +DecodedXLogRecord * XLogReadRecordAlloc(XLogReaderState *state, size_t xl_tot_len, bool allow_oversized) { size_t required_space = DecodeXLogRecordRequiredSpace(xl_tot_len); @@ -551,7 +559,7 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) * valid record starting position or alternatively to the beginning of * a page. See the header comments for XLogBeginRead. */ - Assert(RecPtr % XLOG_BLCKSZ == 0 || XRecOffIsValid(RecPtr)); + Assert(RecPtr % XLOG_BLCKSZ == 0 || XRecOffIsValid(RecPtr) || state->skip_lsn_checks); randAccess = true; } @@ -590,18 +598,24 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) } else if (targetRecOff < pageHeaderSize) { - report_invalid_record(state, "invalid record offset at %X/%X: expected at least %u, got %u", - LSN_FORMAT_ARGS(RecPtr), - pageHeaderSize, targetRecOff); - goto err; + if(!state->skip_page_validation) + { + report_invalid_record(state, "invalid record offset at %X/%X: expected at least %u, got %u", + LSN_FORMAT_ARGS(RecPtr), + pageHeaderSize, targetRecOff); + goto err; + } } if ((((XLogPageHeader) state->readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD) && targetRecOff == pageHeaderSize) { - report_invalid_record(state, "contrecord is requested by %X/%X", - LSN_FORMAT_ARGS(RecPtr)); - goto err; + if(!state->skip_page_validation) + { + report_invalid_record(state, "contrecord is requested by %X/%X", + LSN_FORMAT_ARGS(RecPtr)); + goto err; + } } /* ReadPageInternal has verified the page header */ @@ -616,6 +630,7 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) * cannot access any other fields until we've verified that we got the * whole header. 
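The SKIP_INVALID_RECORD() macro introduced above implements the skip_invalid_records mode: when record-header or CRC validation fails, the reader advances to the next MAXALIGN boundary and retries, restarting from the top whenever the new position lands just past a page header so the page itself is re-validated first. A standalone sketch of the same pointer arithmetic, with the macro rewritten as a function (constants mirror xlogreader's; the restart/retry split is expressed as a return value):

#include <stdbool.h>
#include <stdint.h>

#define XLOG_BLCKSZ     8192
#define MAXIMUM_ALIGNOF 8
#define MAXALIGN(x)     (((uint64_t) (x) + (MAXIMUM_ALIGNOF - 1)) & ~((uint64_t) (MAXIMUM_ALIGNOF - 1)))

typedef uint64_t XLogRecPtr;

/*
 * Advance rec_ptr past an invalid record candidate.  Returns true when the
 * caller should restart from the page header (a page boundary was crossed),
 * false when it should retry record-header validation at the new position.
 */
static bool
skip_invalid_record(XLogRecPtr *rec_ptr)
{
    *rec_ptr = MAXALIGN(*rec_ptr + 1);
    return (*rec_ptr % XLOG_BLCKSZ) <= MAXALIGN(1);
}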
*/ +skip_invalid: record = (XLogRecord *) (state->readBuf + RecPtr % XLOG_BLCKSZ); total_len = record->xl_tot_len; @@ -631,7 +646,13 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) { if (!ValidXLogRecordHeader(state, RecPtr, state->DecodeRecPtr, record, randAccess)) - goto err; + { + if(!state->skip_invalid_records) + goto err; + + SKIP_INVALID_RECORD(RecPtr); + } + gotheader = true; } else @@ -639,11 +660,16 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) /* There may be no next page if it's too small. */ if (total_len < SizeOfXLogRecord) { - report_invalid_record(state, - "invalid record length at %X/%X: expected at least %u, got %u", - LSN_FORMAT_ARGS(RecPtr), - (uint32) SizeOfXLogRecord, total_len); - goto err; + if(!state->skip_invalid_records) + { + report_invalid_record(state, + "invalid record length at %X/%X: expected at least %u, got %u", + LSN_FORMAT_ARGS(RecPtr), + (uint32) SizeOfXLogRecord, total_len); + goto err; + } + + SKIP_INVALID_RECORD(RecPtr); } /* We'll validate the header once we have the next page. */ gotheader = false; @@ -728,10 +754,15 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) /* Check that the continuation on next page looks valid */ if (!(pageHeader->xlp_info & XLP_FIRST_IS_CONTRECORD)) { - report_invalid_record(state, - "there is no contrecord flag at %X/%X", - LSN_FORMAT_ARGS(RecPtr)); - goto err; + if(!state->skip_invalid_records) + { + report_invalid_record(state, + "there is no contrecord flag at %X/%X", + LSN_FORMAT_ARGS(RecPtr)); + goto err; + } + + SKIP_INVALID_RECORD(RecPtr); } /* @@ -741,12 +772,17 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) if (pageHeader->xlp_rem_len == 0 || total_len != (pageHeader->xlp_rem_len + gotlen)) { - report_invalid_record(state, - "invalid contrecord length %u (expected %lld) at %X/%X", - pageHeader->xlp_rem_len, - ((long long) total_len) - gotlen, - LSN_FORMAT_ARGS(RecPtr)); - goto err; + if(!state->skip_invalid_records) + { + report_invalid_record(state, + "invalid contrecord length %u (expected %lld) at %X/%X", + pageHeader->xlp_rem_len, + ((long long) total_len) - gotlen, + LSN_FORMAT_ARGS(RecPtr)); + goto err; + } + + SKIP_INVALID_RECORD(RecPtr); } /* Append the continuation from this page to the buffer */ @@ -777,7 +813,12 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) record = (XLogRecord *) state->readRecordBuf; if (!ValidXLogRecordHeader(state, RecPtr, state->DecodeRecPtr, record, randAccess)) - goto err; + { + if(!state->skip_invalid_records) + goto err; + + SKIP_INVALID_RECORD(RecPtr); + } gotheader = true; } @@ -807,7 +848,12 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) record = (XLogRecord *) state->readRecordBuf; if (!ValidXLogRecord(state, record, RecPtr)) - goto err; + { + if(!state->skip_invalid_records) + goto err; + + SKIP_INVALID_RECORD(RecPtr); + } pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) state->readBuf); state->DecodeRecPtr = RecPtr; @@ -826,7 +872,12 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) /* Record does not cross a page boundary */ if (!ValidXLogRecord(state, record, RecPtr)) - goto err; + { + if(!state->skip_invalid_records) + goto err; + + SKIP_INVALID_RECORD(RecPtr); + } state->NextRecPtr = RecPtr + MAXALIGN(total_len); @@ -1023,7 +1074,8 @@ ReadPageInternal(XLogReaderState *state, XLogRecPtr pageptr, int reqLen) Assert(readLen == XLOG_BLCKSZ); if (!XLogReaderValidatePageHeader(state, targetSegmentPtr, - state->readBuf)) + state->readBuf) 
&& + !state->skip_page_validation) goto err; } @@ -1064,7 +1116,8 @@ ReadPageInternal(XLogReaderState *state, XLogRecPtr pageptr, int reqLen) /* * Now that we know we have the full header, validate it. */ - if (!XLogReaderValidatePageHeader(state, pageptr, (char *) hdr)) + if (!XLogReaderValidatePageHeader(state, pageptr, (char *) hdr) && + !state->skip_page_validation) goto err; /* update read state information */ @@ -1123,7 +1176,7 @@ ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr, * We can't exactly verify the prev-link, but surely it should be less * than the record's own address. */ - if (!(record->xl_prev < RecPtr)) + if (!(record->xl_prev < RecPtr) && !state->skip_lsn_checks) { report_invalid_record(state, "record with incorrect prev-link %X/%X at %X/%X", @@ -1139,7 +1192,7 @@ ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr, * check guards against torn WAL pages where a stale but valid-looking * WAL record starts on a sector boundary. */ - if (record->xl_prev != PrevRecPtr) + if (record->xl_prev != PrevRecPtr && !state->skip_lsn_checks) { report_invalid_record(state, "record with incorrect prev-link %X/%X at %X/%X", @@ -1284,7 +1337,7 @@ XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr, * check typically fails when an old WAL segment is recycled, and hasn't * yet been overwritten with new data yet. */ - if (hdr->xlp_pageaddr != recptr) + if (hdr->xlp_pageaddr != recptr && !state->skip_lsn_checks) { char fname[MAXFNAMELEN]; diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index becc2bda62e..e555f1c2ac3 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -338,6 +338,7 @@ typedef struct XLogRecoveryCtlData XLogRecPtr lastReplayedReadRecPtr; /* start position */ XLogRecPtr lastReplayedEndRecPtr; /* end+1 position */ TimeLineID lastReplayedTLI; /* timeline */ + ConditionVariable replayProgressCV; /* CV for waiters */ /* * When we're currently replaying a record, ie. in a redo function, @@ -467,6 +468,7 @@ XLogRecoveryShmemInit(void) SpinLockInit(&XLogRecoveryCtl->info_lck); InitSharedLatch(&XLogRecoveryCtl->recoveryWakeupLatch); + ConditionVariableInit(&XLogRecoveryCtl->replayProgressCV); ConditionVariableInit(&XLogRecoveryCtl->recoveryNotPausedCV); } @@ -488,6 +490,64 @@ EnableStandbyMode(void) disable_startup_progress_timeout(); } +/* + * Wait for recovery to complete replaying all WAL up to and including + * redoEndRecPtr. + * + * This gets woken up for every WAL record replayed, so make sure you're not + * trying to wait an LSN that is too far in the future. + */ +void +XLogWaitForReplayOf(XLogRecPtr redoEndRecPtr) +{ + static XLogRecPtr replayRecPtr = 0; + + if (!RecoveryInProgress()) + return; + + /* + * Check the backend-local variable first, we may be able to skip accessing + * shared memory (which requires locking) + */ + if (redoEndRecPtr <= replayRecPtr) + return; + + replayRecPtr = GetXLogReplayRecPtr(NULL); + + /* + * Check again if we're going to need to wait, now that we've updated + * the local cached variable. + */ + if (redoEndRecPtr <= replayRecPtr) + return; + + /* + * We need to wait for the variable, so prepare for that. + * + * Note: This wakes up every time a WAL record is replayed, so this can + * be expensive. 
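XLogWaitForReplayOf() above is a prepare/sleep/cancel loop on a ConditionVariable that ApplyWalRecord() (further down) broadcasts after every replayed record. A reduced sketch of the wait side, assuming the backend environment; the ConditionVariable calls are PostgreSQL's existing API, while get_replay_ptr() is an illustrative stand-in for GetXLogReplayRecPtr(NULL):

#include "postgres.h"
#include "access/xlogdefs.h"
#include "storage/condition_variable.h"
#include "utils/wait_event.h"

extern XLogRecPtr get_replay_ptr(void);    /* stand-in for GetXLogReplayRecPtr */

static void
wait_for_replay(ConditionVariable *cv, XLogRecPtr target)
{
    XLogRecPtr  replayed = get_replay_ptr();

    if (target <= replayed)
        return;                 /* already replayed, no need to sleep */

    ConditionVariablePrepareToSleep(cv);
    while (target > replayed)
    {
        /* woken by ConditionVariableBroadcast() in the replay loop */
        ConditionVariableSleep(cv, WAIT_EVENT_RECOVERY_WAL_STREAM);
        replayed = get_replay_ptr();
    }
    ConditionVariableCancelSleep();
}

The patch's version additionally uses ConditionVariableTimedSleep() so it can log a progress message when replay has not caught up after a while.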
+ */ + ConditionVariablePrepareToSleep(&XLogRecoveryCtl->replayProgressCV); + + while (redoEndRecPtr > replayRecPtr) + { + bool timeout; + timeout = ConditionVariableTimedSleep(&XLogRecoveryCtl->replayProgressCV, + 10000000, /* 10 seconds */ + WAIT_EVENT_RECOVERY_WAL_STREAM); + + replayRecPtr = GetXLogReplayRecPtr(NULL); + + if (timeout) + ereport(LOG, + (errmsg("Waiting for recovery to catch up to %X/%X (currently %X/%X)", + LSN_FORMAT_ARGS(redoEndRecPtr), + LSN_FORMAT_ARGS(replayRecPtr)))); + } + + ConditionVariableCancelSleep(); +} + /* * Prepare the system for WAL recovery, if needed. * @@ -564,6 +624,9 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE) ereport(LOG, (errmsg("starting point-in-time recovery to earliest consistent point"))); + else if (ZenithRecoveryRequested) + ereport(LOG, + (errmsg("starting zenith recovery"))); else ereport(LOG, (errmsg("starting archive recovery"))); @@ -704,6 +767,33 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, /* tell the caller to delete it later */ haveBackupLabel = true; } + else if (ZenithRecoveryRequested) + { + /* + * Zenith hacks to spawn compute node without WAL. Pretend that we + * just finished reading the record that started at 'zenithLastRec' + * and ended at checkpoint.redo + */ + elog(LOG, "starting with zenith basebackup at LSN %X/%X, prev %X/%X", + LSN_FORMAT_ARGS(ControlFile->checkPointCopy.redo), + LSN_FORMAT_ARGS(zenithLastRec)); + + CheckPointLoc = zenithLastRec; + CheckPointTLI = ControlFile->checkPointCopy.ThisTimeLineID; + RedoStartLSN = ControlFile->checkPointCopy.redo; + // FIXME needs review. rebase of ff41b709abea6a9c42100a4fcb0ff434b2c846c9 + // Is it still relevant? + /* make basebackup LSN available for walproposer */ + SetRedoStartLsn(RedoStartLSN); + //EndRecPtr = ControlFile->checkPointCopy.redo; + + memcpy(&checkPoint, &ControlFile->checkPointCopy, sizeof(CheckPoint)); + wasShutdown = true; + + /* Initialize expectedTLEs, like ReadRecord() does */ + expectedTLEs = readTimeLineHistory(checkPoint.ThisTimeLineID); + XLogPrefetcherBeginRead(xlogprefetcher, ControlFile->checkPointCopy.redo); + } else { /* @@ -765,6 +855,7 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, CheckPointLoc = ControlFile->checkPoint; CheckPointTLI = ControlFile->checkPointCopy.ThisTimeLineID; RedoStartLSN = ControlFile->checkPointCopy.redo; + SetRedoStartLsn(RedoStartLSN); RedoStartTLI = ControlFile->checkPointCopy.ThisTimeLineID; record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc, CheckPointTLI); @@ -854,7 +945,7 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, (errmsg("invalid next transaction ID"))); /* sanity check */ - if (checkPoint.redo > CheckPointLoc) + if (checkPoint.redo > CheckPointLoc && !ZenithRecoveryRequested) ereport(PANIC, (errmsg("invalid redo in checkpoint record"))); @@ -1452,8 +1543,13 @@ FinishWalRecovery(void) lastRec = XLogRecoveryCtl->lastReplayedReadRecPtr; lastRecTLI = XLogRecoveryCtl->lastReplayedTLI; } - XLogPrefetcherBeginRead(xlogprefetcher, lastRec); - (void) ReadRecord(xlogprefetcher, PANIC, false, lastRecTLI); + + if (!ZenithRecoveryRequested) + { + XLogPrefetcherBeginRead(xlogprefetcher, lastRec); + (void) ReadRecord(xlogprefetcher, PANIC, false, lastRecTLI); + } + endOfLog = xlogreader->EndRecPtr; /* @@ -1487,11 +1583,55 @@ FinishWalRecovery(void) } } + /* + * When starting from a zenith base backup, we don't have WAL. 
Initialize + * the WAL page where we will start writing new records from scratch, + * instead. + */ + if (ZenithRecoveryRequested) + { + if (!zenithWriteOk) + { + /* + * We cannot start generating new WAL if we don't have a valid prev-LSN + * to use for the first new WAL record. (Shouldn't happen.) + */ + ereport(ERROR, + (errmsg("cannot start in read-write mode from this base backup"))); + } + else + { + int offs = endOfLog % XLOG_BLCKSZ; + char *page = palloc0(offs); + XLogRecPtr pageBeginPtr = endOfLog - offs; + int lastPageSize = ((pageBeginPtr % wal_segment_size) == 0) ? SizeOfXLogLongPHD : SizeOfXLogShortPHD; + + XLogPageHeader xlogPageHdr = (XLogPageHeader) (page); + + xlogPageHdr->xlp_pageaddr = pageBeginPtr; + xlogPageHdr->xlp_magic = XLOG_PAGE_MAGIC; + xlogPageHdr->xlp_tli = recoveryTargetTLI; + /* + * If we start writing with offset from page beginning, pretend in + * page header there is a record ending where actual data will + * start. + */ + xlogPageHdr->xlp_rem_len = offs - lastPageSize; + xlogPageHdr->xlp_info = (xlogPageHdr->xlp_rem_len > 0) ? XLP_FIRST_IS_CONTRECORD : 0; + readOff = XLogSegmentOffset(pageBeginPtr, wal_segment_size); + + result->lastPageBeginPtr = pageBeginPtr; + result->lastPage = page; + elog(LOG, "Continue writing WAL at %X/%X", LSN_FORMAT_ARGS(xlogreader->EndRecPtr)); + + // FIXME: should we unlink zenith.signal? + } + } /* * Copy the last partial block to the caller, for initializing the WAL * buffer for appending new WAL. */ - if (endOfLog % XLOG_BLCKSZ != 0) + else if (endOfLog % XLOG_BLCKSZ != 0) { char *page; int len; @@ -1543,7 +1683,10 @@ ShutdownWalRecovery(void) char recoveryPath[MAXPGPATH]; /* Final update of pg_stat_recovery_prefetch. */ - XLogPrefetcherComputeStats(xlogprefetcher); + if (!ZenithRecoveryRequested) + { + XLogPrefetcherComputeStats(xlogprefetcher); + } /* Shut down xlogreader */ if (readFile >= 0) @@ -1552,7 +1695,11 @@ ShutdownWalRecovery(void) readFile = -1; } XLogReaderFree(xlogreader); - XLogPrefetcherFree(xlogprefetcher); + + if (!ZenithRecoveryRequested) + { + XLogPrefetcherFree(xlogprefetcher); + } if (ArchiveRecoveryRequested) { @@ -1642,7 +1789,10 @@ PerformWalRecovery(void) else { /* just have to read next record after CheckPoint */ - Assert(xlogreader->ReadRecPtr == CheckPointLoc); + if (ZenithRecoveryRequested) + xlogreader->ReadRecPtr = CheckPointLoc; + else + Assert(xlogreader->ReadRecPtr == CheckPointLoc); replayTLI = CheckPointTLI; record = ReadRecord(xlogprefetcher, LOG, false, replayTLI); } @@ -1986,6 +2136,8 @@ ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *repl /* Reset the prefetcher. 
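The zenith branch of FinishWalRecovery() above fabricates the last WAL page instead of re-reading the final record: the page header pretends a record ends exactly where new WAL will start (xlp_rem_len plus XLP_FIRST_IS_CONTRECORD), so insertion can resume at endOfLog without any on-disk WAL. A condensed sketch of that header setup, assuming the backend headers that define XLogPageHeaderData and the related constants:

/* assumes access/xlog_internal.h for XLogPageHeader, SizeOfXLog*PHD, etc. */
static void
init_last_wal_page(char *page, XLogRecPtr end_of_log, TimeLineID tli,
                   int wal_segment_size)
{
    int         offs = end_of_log % XLOG_BLCKSZ;
    XLogRecPtr  page_begin = end_of_log - offs;
    int         phd_size = (page_begin % wal_segment_size == 0)
                           ? SizeOfXLogLongPHD : SizeOfXLogShortPHD;
    XLogPageHeader hdr = (XLogPageHeader) page;

    hdr->xlp_magic = XLOG_PAGE_MAGIC;
    hdr->xlp_tli = tli;
    hdr->xlp_pageaddr = page_begin;
    /* pretend a record ends right where the next insertion will begin */
    hdr->xlp_rem_len = offs - phd_size;
    hdr->xlp_info = (hdr->xlp_rem_len > 0) ? XLP_FIRST_IS_CONTRECORD : 0;
}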
*/ XLogPrefetchReconfigure(); } + + ConditionVariableBroadcast(&XLogRecoveryCtl->replayProgressCV); } /* diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index 43f7b31205d..68df6a3b56c 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -33,6 +33,8 @@ #include "utils/rel.h" +bool (*redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id); + /* GUC variable */ bool ignore_invalid_pages = false; @@ -373,6 +375,21 @@ XLogReadBufferForRedoExtended(XLogReaderState *record, block_id); } + if (redo_read_buffer_filter && redo_read_buffer_filter(record, block_id)) + { + if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) + { + *buf = ReadBufferWithoutRelcache(rlocator, forknum, + blkno, mode, NULL, true); + return BLK_DONE; + } + else + { + *buf = InvalidBuffer; + return BLK_DONE; + } + } + /* * Make sure that if the block is marked with WILL_INIT, the caller is * going to initialize it. And vice versa. @@ -491,7 +508,7 @@ XLogReadBufferExtended(RelFileLocator rlocator, ForkNumber forknum, } /* Open the relation at smgr level */ - smgr = smgropen(rlocator, InvalidBackendId); + smgr = smgropen(rlocator, InvalidBackendId, RELPERSISTENCE_PERMANENT); /* * Create the target file if it doesn't already exist. This lets us cope diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c index 49e956b2c57..2ef26efc816 100644 --- a/src/backend/bootstrap/bootstrap.c +++ b/src/backend/bootstrap/bootstrap.c @@ -47,6 +47,7 @@ uint32 bootstrap_data_checksum_version = 0; /* No checksum */ +extern uint64 predefined_sysidentifier; static void CheckerModeMain(void); static void bootstrap_signals(void); @@ -221,13 +222,23 @@ BootstrapModeMain(int argc, char *argv[], bool check_only) argv++; argc--; - while ((flag = getopt(argc, argv, "B:c:d:D:Fkr:X:-:")) != -1) + while ((flag = getopt(argc, argv, "B:c:d:D:Fkr:s:X:-:")) != -1) { switch (flag) { case 'B': SetConfigOption("shared_buffers", optarg, PGC_POSTMASTER, PGC_S_ARGV); break; + case 's': + { + char* endptr; +#ifdef HAVE_STRTOULL + predefined_sysidentifier = strtoull(optarg, &endptr, 10); +#else + predefined_sysidentifier = strtoul(optarg, &endptr, 10); +#endif + break; + } case 'c': case '-': { diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c index 2add0534891..c6afda4e64d 100644 --- a/src/backend/catalog/storage.c +++ b/src/backend/catalog/storage.c @@ -145,7 +145,7 @@ RelationCreateStorage(RelFileLocator rlocator, char relpersistence, return NULL; /* placate compiler */ } - srel = smgropen(rlocator, backend); + srel = smgropen(rlocator, backend, relpersistence); smgrcreate(srel, MAIN_FORKNUM, false); if (needs_wal) @@ -185,6 +185,7 @@ void log_smgrcreate(const RelFileLocator *rlocator, ForkNumber forkNum) { xl_smgr_create xlrec; + XLogRecPtr lsn; /* * Make an XLOG entry reporting the file creation. 
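A pattern repeated throughout the rest of this patch (log_smgrcreate(), the dbcommands.c hunks, the index-build paths) is to capture the LSN returned by XLogInsert() and immediately record it in the last-written-LSN tracking, so that later page-server requests for the affected relation use a sufficiently recent LSN. Schematically, with a hypothetical helper name and an arbitrary record as the example:

/* assumes backend headers: access/xloginsert.h plus access/xlog.h as patched */
static void
wal_log_and_track(RelFileLocator rlocator, ForkNumber forknum,
                  RmgrId rmid, uint8 info)
{
    XLogRecPtr  lsn;

    /* the record data is assumed to have been registered by the caller */
    lsn = XLogInsert(rmid, info);

    /* remember the LSN for this relation's metadata */
    SetLastWrittenLSNForRelation(lsn, rlocator, forknum);
}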
@@ -194,7 +195,8 @@ log_smgrcreate(const RelFileLocator *rlocator, ForkNumber forkNum) XLogBeginInsert(); XLogRegisterData((char *) &xlrec, sizeof(xlrec)); - XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE | XLR_SPECIAL_REL_UPDATE); + lsn = XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE | XLR_SPECIAL_REL_UPDATE); + SetLastWrittenLSNForRelation(lsn, *rlocator, forkNum); } /* @@ -678,7 +680,7 @@ smgrDoPendingDeletes(bool isCommit) { SMgrRelation srel; - srel = smgropen(pending->rlocator, pending->backend); + srel = smgropen(pending->rlocator, pending->backend, 0); /* allocate the initial array, or extend it, if needed */ if (maxrels == 0) @@ -759,7 +761,7 @@ smgrDoPendingSyncs(bool isCommit, bool isParallelWorker) BlockNumber total_blocks = 0; SMgrRelation srel; - srel = smgropen(pendingsync->rlocator, InvalidBackendId); + srel = smgropen(pendingsync->rlocator, InvalidBackendId, 0); /* * We emit newpage WAL records for smaller relations. @@ -968,7 +970,7 @@ smgr_redo(XLogReaderState *record) xl_smgr_create *xlrec = (xl_smgr_create *) XLogRecGetData(record); SMgrRelation reln; - reln = smgropen(xlrec->rlocator, InvalidBackendId); + reln = smgropen(xlrec->rlocator, InvalidBackendId, RELPERSISTENCE_PERMANENT); smgrcreate(reln, xlrec->forkNum, true); } else if (info == XLOG_SMGR_TRUNCATE) @@ -981,7 +983,7 @@ smgr_redo(XLogReaderState *record) int nforks = 0; bool need_fsm_vacuum = false; - reln = smgropen(xlrec->rlocator, InvalidBackendId); + reln = smgropen(xlrec->rlocator, InvalidBackendId, RELPERSISTENCE_PERMANENT); /* * Forcibly create relation if it doesn't exist (which suggests that diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 307729ab7ef..89f9a7a6250 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -276,7 +276,7 @@ ScanSourceDatabasePgClass(Oid tbid, Oid dbid, char *srcpath) rlocator.dbOid = dbid; rlocator.relNumber = relfilenumber; - smgr = smgropen(rlocator, InvalidBackendId); + smgr = smgropen(rlocator, InvalidBackendId, RELPERSISTENCE_PERMANENT); nblocks = smgrnblocks(smgr, MAIN_FORKNUM); smgrclose(smgr); @@ -487,6 +487,8 @@ CreateDirAndVersionFile(char *dbpath, Oid dbid, Oid tsid, bool isRedo) lsn = XLogInsert(RM_DBASE_ID, XLOG_DBASE_CREATE_WAL_LOG); + SetLastWrittenLSNForDatabase(lsn); + /* As always, WAL must hit the disk before the data update does. 
*/ XLogFlush(lsn); } @@ -614,6 +616,7 @@ CreateDatabaseUsingFileCopy(Oid src_dboid, Oid dst_dboid, Oid src_tsid, /* Record the filesystem change in XLOG */ { xl_dbase_create_file_copy_rec xlrec; + XLogRecPtr lsn; xlrec.db_id = dst_dboid; xlrec.tablespace_id = dsttablespace; @@ -624,8 +627,10 @@ CreateDatabaseUsingFileCopy(Oid src_dboid, Oid dst_dboid, Oid src_tsid, XLogRegisterData((char *) &xlrec, sizeof(xl_dbase_create_file_copy_rec)); - (void) XLogInsert(RM_DBASE_ID, - XLOG_DBASE_CREATE_FILE_COPY | XLR_SPECIAL_REL_UPDATE); + lsn = XLogInsert(RM_DBASE_ID, + XLOG_DBASE_CREATE_FILE_COPY | XLR_SPECIAL_REL_UPDATE); + + SetLastWrittenLSNForDatabase(lsn); } pfree(srcpath); pfree(dstpath); @@ -2102,6 +2107,7 @@ movedb(const char *dbname, const char *tblspcname) */ { xl_dbase_create_file_copy_rec xlrec; + XLogRecPtr lsn; xlrec.db_id = db_id; xlrec.tablespace_id = dst_tblspcoid; @@ -2112,8 +2118,10 @@ movedb(const char *dbname, const char *tblspcname) XLogRegisterData((char *) &xlrec, sizeof(xl_dbase_create_file_copy_rec)); - (void) XLogInsert(RM_DBASE_ID, - XLOG_DBASE_CREATE_FILE_COPY | XLR_SPECIAL_REL_UPDATE); + lsn = XLogInsert(RM_DBASE_ID, + XLOG_DBASE_CREATE_FILE_COPY | XLR_SPECIAL_REL_UPDATE); + // TODO: Do we really need to set the LSN here? + SetLastWrittenLSNForDatabase(lsn); } /* @@ -3253,6 +3261,15 @@ dbase_redo(XLogReaderState *record) */ copydir(src_path, dst_path, false); + /* + * Make sure any future requests to the page server see the new + * database. + */ + { + XLogRecPtr lsn = record->EndRecPtr; + SetLastWrittenLSNForDatabase(lsn); + } + pfree(src_path); pfree(dst_path); } @@ -3273,6 +3290,16 @@ dbase_redo(XLogReaderState *record) /* Create the database directory with the version file. */ CreateDirAndVersionFile(dbpath, xlrec->db_id, xlrec->tablespace_id, true); + + /* + * Make sure any future requests to the page server see the new + * database. 
+ */ + { + XLogRecPtr lsn = record->EndRecPtr; + SetLastWrittenLSNForDatabase(lsn); + } + pfree(dbpath); } else if (info == XLOG_DBASE_DROP) diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 8570b14f621..8a4f6886b91 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -47,7 +47,6 @@ ExplainOneQuery_hook_type ExplainOneQuery_hook = NULL; /* Hook for plugins to get control in explain_get_index_name() */ explain_get_index_name_hook_type explain_get_index_name_hook = NULL; - /* OR-able flags for ExplainXMLTag() */ #define X_OPENING 0 #define X_CLOSING 1 @@ -121,6 +120,7 @@ static void show_eval_params(Bitmapset *bms_params, ExplainState *es); static const char *explain_get_index_name(Oid indexId); static void show_buffer_usage(ExplainState *es, const BufferUsage *usage, bool planning); +static void show_prefetch_info(ExplainState *es, const PrefetchInfo* prefetch_info); static void show_wal_usage(ExplainState *es, const WalUsage *usage); static void ExplainIndexScanDetails(Oid indexid, ScanDirection indexorderdir, ExplainState *es); @@ -186,6 +186,8 @@ ExplainQuery(ParseState *pstate, ExplainStmt *stmt, es->costs = defGetBoolean(opt); else if (strcmp(opt->defname, "buffers") == 0) es->buffers = defGetBoolean(opt); + else if (strcmp(opt->defname, "prefetch") == 0) + es->prefetch = defGetBoolean(opt); else if (strcmp(opt->defname, "wal") == 0) es->wal = defGetBoolean(opt); else if (strcmp(opt->defname, "settings") == 0) @@ -543,7 +545,7 @@ ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es, else if (es->analyze) instrument_option |= INSTRUMENT_ROWS; - if (es->buffers) + if (es->buffers || es->prefetch) instrument_option |= INSTRUMENT_BUFFERS; if (es->wal) instrument_option |= INSTRUMENT_WAL; @@ -2102,6 +2104,10 @@ ExplainNode(PlanState *planstate, List *ancestors, if (es->wal && planstate->instrument) show_wal_usage(es, &planstate->instrument->walusage); + /* Show prefetch usage */ + if (es->prefetch && planstate->instrument) + show_prefetch_info(es, &planstate->instrument->bufusage.prefetch); + /* Prepare per-worker buffer/WAL usage */ if (es->workers_state && (es->buffers || es->wal) && es->verbose) { @@ -3536,6 +3542,34 @@ explain_get_index_name(Oid indexId) return result; } +/* + * Show prefetch statistics + */ +static void +show_prefetch_info(ExplainState *es, const PrefetchInfo* prefetch_info) +{ + if (es->format == EXPLAIN_FORMAT_TEXT) + { + ExplainIndentText(es); + appendStringInfo(es->str, "Prefetch: hits=%lld misses=%lld expired=%lld duplicates=%lld\n", + (long long) prefetch_info->hits, + (long long) prefetch_info->misses, + (long long) prefetch_info->expired, + (long long) prefetch_info->duplicates); + } + else + { + ExplainPropertyInteger("Prefetch Hits", NULL, + prefetch_info->hits, es); + ExplainPropertyInteger("Prefetch Misses", NULL, + prefetch_info->misses, es); + ExplainPropertyInteger("Prefetch Expired Requests", NULL, + prefetch_info->expired, es); + ExplainPropertyInteger("Prefetch Duplicated Requests", NULL, + prefetch_info->duplicates, es); + } +} + /* * Show buffer usage details. 
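EXPLAIN gains a prefetch option that reports the four counters accumulated in BufferUsage: hits, misses, expired and duplicated prefetch requests. For reference, a tiny standalone formatter producing the same one-line text layout as show_prefetch_info(); the struct mirrors the PrefetchInfo this patch adds to instrument.h, with the field types assumed here to be 64-bit counters:

#include <stdint.h>
#include <stdio.h>

typedef struct PrefetchInfo
{
    int64_t     hits;
    int64_t     misses;
    int64_t     expired;
    int64_t     duplicates;
} PrefetchInfo;

static void
print_prefetch_line(const PrefetchInfo *pi)
{
    printf("Prefetch: hits=%lld misses=%lld expired=%lld duplicates=%lld\n",
           (long long) pi->hits, (long long) pi->misses,
           (long long) pi->expired, (long long) pi->duplicates);
}

Since the option piggybacks on INSTRUMENT_BUFFERS, EXPLAIN (ANALYZE, PREFETCH) should show this line per plan node even when BUFFERS is not requested.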
*/ diff --git a/src/backend/commands/extension.c b/src/backend/commands/extension.c index 2ff0d691d86..dec294de630 100644 --- a/src/backend/commands/extension.c +++ b/src/backend/commands/extension.c @@ -403,6 +403,7 @@ get_extension_script_directory(ExtensionControlFile *control) { char sharepath[MAXPGPATH]; char *result; + struct stat fst; /* * The directory parameter can be omitted, absolute, or relative to the @@ -418,6 +419,16 @@ get_extension_script_directory(ExtensionControlFile *control) result = (char *) palloc(MAXPGPATH); snprintf(result, MAXPGPATH, "%s/%s", sharepath, control->directory); + // If directory does not exist, check remote extension storage + if (stat(result, &fst) < 0) + { + // request download of extension files from for control->directory + if (download_extension_file_hook != NULL) + { + download_extension_file_hook(control->directory, false); + } + } + return result; } @@ -1508,6 +1519,13 @@ CreateExtensionInternal(char *extensionName, * will get us there. */ filename = get_extension_script_filename(pcontrol, NULL, versionName); + + // request download of extension files from compute_ctl + if (download_extension_file_hook != NULL) + { + download_extension_file_hook(extensionName, false); + } + if (stat(filename, &fst) == 0) { /* Easy, no extra scripts */ diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c index c7e262c0fcc..f14811e63bd 100644 --- a/src/backend/commands/sequence.c +++ b/src/backend/commands/sequence.c @@ -54,7 +54,9 @@ * so we pre-log a few fetches in advance. In the event of * crash we can lose (skip over) as many values as we pre-logged. */ -#define SEQ_LOG_VALS 32 +/* NEON XXX: to ensure sequence order of sequence in Zenith we need to WAL log each sequence update. */ +/* #define SEQ_LOG_VALS 32 */ +#define SEQ_LOG_VALS 0 /* * The "special area" of a sequence's buffer page looks like this. 
@@ -355,7 +357,7 @@ fill_seq_with_data(Relation rel, HeapTuple tuple) { SMgrRelation srel; - srel = smgropen(rel->rd_locator, InvalidBackendId); + srel = smgropen(rel->rd_locator, InvalidBackendId, rel->rd_rel->relpersistence); smgrcreate(srel, INIT_FORKNUM, false); log_smgrcreate(&rel->rd_locator, INIT_FORKNUM); fill_seq_fork_with_data(rel, tuple, INIT_FORKNUM); diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index b089441ff0d..70be9bcb481 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -14787,7 +14787,7 @@ index_copy_data(Relation rel, RelFileLocator newrlocator) { SMgrRelation dstrel; - dstrel = smgropen(newrlocator, rel->rd_backend); + dstrel = smgropen(newrlocator, rel->rd_backend, rel->rd_rel->relpersistence); /* * Since we copy the file directly without looking at the shared buffers, diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index ee78a5749d2..01978c79f5c 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -235,6 +235,10 @@ BufferUsageAdd(BufferUsage *dst, const BufferUsage *add) dst->local_blks_written += add->local_blks_written; dst->temp_blks_read += add->temp_blks_read; dst->temp_blks_written += add->temp_blks_written; + dst->prefetch.hits += add->prefetch.hits; + dst->prefetch.misses += add->prefetch.misses; + dst->prefetch.expired += add->prefetch.expired; + dst->prefetch.duplicates += add->prefetch.duplicates; INSTR_TIME_ADD(dst->blk_read_time, add->blk_read_time); INSTR_TIME_ADD(dst->blk_write_time, add->blk_write_time); INSTR_TIME_ADD(dst->temp_blk_read_time, add->temp_blk_read_time); @@ -257,6 +261,10 @@ BufferUsageAccumDiff(BufferUsage *dst, dst->local_blks_written += add->local_blks_written - sub->local_blks_written; dst->temp_blks_read += add->temp_blks_read - sub->temp_blks_read; dst->temp_blks_written += add->temp_blks_written - sub->temp_blks_written; + dst->prefetch.hits += add->prefetch.hits - sub->prefetch.hits; + dst->prefetch.misses += add->prefetch.misses - sub->prefetch.misses; + dst->prefetch.expired += add->prefetch.expired - sub->prefetch.expired; + dst->prefetch.duplicates += add->prefetch.duplicates - sub->prefetch.duplicates; INSTR_TIME_ACCUM_DIFF(dst->blk_read_time, add->blk_read_time, sub->blk_read_time); INSTR_TIME_ACCUM_DIFF(dst->blk_write_time, diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c index f35df0b8bfb..05c7244292f 100644 --- a/src/backend/executor/nodeBitmapHeapscan.c +++ b/src/backend/executor/nodeBitmapHeapscan.c @@ -149,37 +149,21 @@ BitmapHeapNext(BitmapHeapScanState *node) * multiple processes to iterate jointly. */ pstate->tbmiterator = tbm_prepare_shared_iterate(tbm); -#ifdef USE_PREFETCH - if (node->prefetch_maximum > 0) - { - pstate->prefetch_iterator = - tbm_prepare_shared_iterate(tbm); - - /* - * We don't need the mutex here as we haven't yet woke up - * others. - */ - pstate->prefetch_pages = 0; - pstate->prefetch_target = -1; - } -#endif /* We have initialized the shared state so wake up others. 
*/ BitmapDoneInitializingSharedState(pstate); } +#ifdef USE_PREFETCH + node->prefetch_head = 0; + node->prefetch_pages = 0; + node->prefetch_target = -1; +#endif /* Allocate a private iterator and attach the shared state to it */ node->shared_tbmiterator = shared_tbmiterator = tbm_attach_shared_iterate(dsa, pstate->tbmiterator); node->tbmres = tbmres = NULL; -#ifdef USE_PREFETCH - if (node->prefetch_maximum > 0) - { - node->shared_prefetch_iterator = - tbm_attach_shared_iterate(dsa, pstate->prefetch_iterator); - } -#endif /* USE_PREFETCH */ } node->initialized = true; } @@ -196,9 +180,25 @@ BitmapHeapNext(BitmapHeapScanState *node) if (tbmres == NULL) { if (!pstate) - node->tbmres = tbmres = tbm_iterate(tbmiterator); + tbmres = tbm_iterate(tbmiterator); else - node->tbmres = tbmres = tbm_shared_iterate(shared_tbmiterator); + { + if (node->prefetch_pages != 0) + { + tbmres = (TBMIterateResult *)&node->prefetch_requests[node->prefetch_head]; + node->prefetch_pages -= 1; + node->prefetch_head = (node->prefetch_head + 1) % MAX_IO_CONCURRENCY; + } + else + tbmres = tbm_shared_iterate(shared_tbmiterator); + if (tbmres) + { + /* Need to copy the result because the iterator can be used for prefetch and the vacant position in the prefetch ring buffer can also be reused */ + memcpy(&node->tbmres_copy, tbmres, offsetof(TBMIterateResult, offsets) + sizeof(OffsetNumber)*Max(tbmres->ntuples, 0)); + tbmres = (TBMIterateResult *)&node->tbmres_copy; + } + } + node->tbmres = tbmres; if (tbmres == NULL) { /* no more entries in the bitmap */ @@ -237,7 +237,6 @@ BitmapHeapNext(BitmapHeapScanState *node) /* AM doesn't think this block is valid, skip */ continue; } - if (tbmres->ntuples >= 0) node->exact_pages++; else @@ -258,19 +257,8 @@ BitmapHeapNext(BitmapHeapScanState *node) * Try to prefetch at least a few pages even before we get to the * second page if we don't stop reading after the first tuple.
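The parallel path above replaces the shared prefetch iterator with a small per-backend ring buffer (prefetch_requests, prefetch_head, prefetch_pages, all bounded by MAX_IO_CONCURRENCY). A stand-alone sketch of the same bookkeeping, with hypothetical names: head is the next entry to consume, pages is the number of queued entries, and producers append at (head + pages) % size. The real array holds copied TBMIterateResults; plain block numbers keep the sketch short.

#include "postgres.h"
#include "storage/block.h"

#define RING_SIZE 32				/* stands in for MAX_IO_CONCURRENCY */

typedef struct PrefetchRing
{
	int			head;				/* next slot to consume */
	int			pages;				/* number of queued entries */
	BlockNumber	blocks[RING_SIZE];
} PrefetchRing;

static void
ring_push(PrefetchRing *ring, BlockNumber blkno)
{
	Assert(ring->pages < RING_SIZE);
	ring->blocks[(ring->head + ring->pages) % RING_SIZE] = blkno;
	ring->pages++;
}

static BlockNumber
ring_pop(PrefetchRing *ring)
{
	BlockNumber	blkno;

	Assert(ring->pages > 0);
	blkno = ring->blocks[ring->head];
	ring->pages--;
	ring->head = (ring->head + 1) % RING_SIZE;
	return blkno;
}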
*/ - if (!pstate) - { - if (node->prefetch_target < node->prefetch_maximum) - node->prefetch_target++; - } - else if (pstate->prefetch_target < node->prefetch_maximum) - { - /* take spinlock while updating shared state */ - SpinLockAcquire(&pstate->mutex); - if (pstate->prefetch_target < node->prefetch_maximum) - pstate->prefetch_target++; - SpinLockRelease(&pstate->mutex); - } + if (node->prefetch_target < node->prefetch_maximum) + node->prefetch_target++; #endif /* USE_PREFETCH */ } @@ -361,54 +349,24 @@ BitmapAdjustPrefetchIterator(BitmapHeapScanState *node, TBMIterateResult *tbmres) { #ifdef USE_PREFETCH - ParallelBitmapHeapState *pstate = node->pstate; + TBMIterator *prefetch_iterator = node->prefetch_iterator; - if (pstate == NULL) - { - TBMIterator *prefetch_iterator = node->prefetch_iterator; - - if (node->prefetch_pages > 0) - { - /* The main iterator has closed the distance by one page */ - node->prefetch_pages--; - } - else if (prefetch_iterator) - { - /* Do not let the prefetch iterator get behind the main one */ - TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator); - - if (tbmpre == NULL || tbmpre->blockno != tbmres->blockno) - elog(ERROR, "prefetch and main iterators are out of sync"); - } + /* NEON: we are not using prefetch iterator for parallel plan so no need to adjust it */ + if (node->pstate != NULL) return; - } - if (node->prefetch_maximum > 0) + if (node->prefetch_pages > 0) { - TBMSharedIterator *prefetch_iterator = node->shared_prefetch_iterator; - - SpinLockAcquire(&pstate->mutex); - if (pstate->prefetch_pages > 0) - { - pstate->prefetch_pages--; - SpinLockRelease(&pstate->mutex); - } - else - { - /* Release the mutex before iterating */ - SpinLockRelease(&pstate->mutex); + /* The main iterator has closed the distance by one page */ + node->prefetch_pages--; + } + else if (prefetch_iterator) + { + /* Do not let the prefetch iterator get behind the main one */ + TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator); - /* - * In case of shared mode, we can not ensure that the current - * blockno of the main iterator and that of the prefetch iterator - * are same. It's possible that whatever blockno we are - * prefetching will be processed by another process. Therefore, - * we don't validate the blockno here as we do in non-parallel - * case. - */ - if (prefetch_iterator) - tbm_shared_iterate(prefetch_iterator); - } + if (tbmpre == NULL || tbmpre->blockno != tbmres->blockno) + elog(ERROR, "prefetch and main iterators are out of sync"); } #endif /* USE_PREFETCH */ } @@ -425,35 +383,14 @@ static inline void BitmapAdjustPrefetchTarget(BitmapHeapScanState *node) { #ifdef USE_PREFETCH - ParallelBitmapHeapState *pstate = node->pstate; - - if (pstate == NULL) - { - if (node->prefetch_target >= node->prefetch_maximum) - /* don't increase any further */ ; - else if (node->prefetch_target >= node->prefetch_maximum / 2) - node->prefetch_target = node->prefetch_maximum; - else if (node->prefetch_target > 0) - node->prefetch_target *= 2; - else - node->prefetch_target++; - return; - } - - /* Do an unlocked check first to save spinlock acquisitions. 
*/ - if (pstate->prefetch_target < node->prefetch_maximum) - { - SpinLockAcquire(&pstate->mutex); - if (pstate->prefetch_target >= node->prefetch_maximum) - /* don't increase any further */ ; - else if (pstate->prefetch_target >= node->prefetch_maximum / 2) - pstate->prefetch_target = node->prefetch_maximum; - else if (pstate->prefetch_target > 0) - pstate->prefetch_target *= 2; - else - pstate->prefetch_target++; - SpinLockRelease(&pstate->mutex); - } + if (node->prefetch_target >= node->prefetch_maximum) + /* don't increase any further */ ; + else if (node->prefetch_target >= node->prefetch_maximum / 2) + node->prefetch_target = node->prefetch_maximum; + else if (node->prefetch_target > 0) + node->prefetch_target *= 2; + else + node->prefetch_target++; #endif /* USE_PREFETCH */ } @@ -507,56 +444,47 @@ BitmapPrefetch(BitmapHeapScanState *node, TableScanDesc scan) PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno); } } - - return; } - - if (pstate->prefetch_pages < pstate->prefetch_target) + else { - TBMSharedIterator *prefetch_iterator = node->shared_prefetch_iterator; - - if (prefetch_iterator) + while (1) { - while (1) - { - TBMIterateResult *tbmpre; - bool do_prefetch = false; - bool skip_fetch; + TBMIterateResult *tbmpre; + bool do_prefetch = false; + bool skip_fetch; - /* - * Recheck under the mutex. If some other process has already - * done enough prefetching then we need not to do anything. - */ - SpinLockAcquire(&pstate->mutex); - if (pstate->prefetch_pages < pstate->prefetch_target) - { - pstate->prefetch_pages++; - do_prefetch = true; - } - SpinLockRelease(&pstate->mutex); + if (node->prefetch_pages < node->prefetch_target) + { + Assert(node->prefetch_pages < MAX_IO_CONCURRENCY); + do_prefetch = true; + } - if (!do_prefetch) - return; + if (!do_prefetch) + return; - tbmpre = tbm_shared_iterate(prefetch_iterator); - if (tbmpre == NULL) - { - /* No more pages to prefetch */ - tbm_end_shared_iterate(prefetch_iterator); - node->shared_prefetch_iterator = NULL; - break; - } + tbmpre = tbm_shared_iterate(node->shared_tbmiterator); + if (tbmpre != NULL) + { + memcpy(&node->prefetch_requests[(node->prefetch_head + node->prefetch_pages) % MAX_IO_CONCURRENCY], + tbmpre, + offsetof(TBMIterateResult, offsets) + sizeof(OffsetNumber)*Max(tbmpre->ntuples, 0)); + node->prefetch_pages += 1; + } + else + { + /* No more pages to prefetch */ + break; + } - /* As above, skip prefetch if we expect not to need page */ - skip_fetch = (node->can_skip_fetch && - (node->tbmres ? 
!node->tbmres->recheck : false) && - VM_ALL_VISIBLE(node->ss.ss_currentRelation, - tbmpre->blockno, - &node->pvmbuffer)); + /* As above, skip prefetch if we expect not to need page */ + skip_fetch = (node->can_skip_fetch && + !tbmpre->recheck && + VM_ALL_VISIBLE(node->ss.ss_currentRelation, + tbmpre->blockno, + &node->pvmbuffer)); - if (!skip_fetch) - PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno); - } + if (!skip_fetch) + PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno); } } #endif /* USE_PREFETCH */ @@ -613,8 +541,6 @@ ExecReScanBitmapHeapScan(BitmapHeapScanState *node) tbm_end_iterate(node->prefetch_iterator); if (node->shared_tbmiterator) tbm_end_shared_iterate(node->shared_tbmiterator); - if (node->shared_prefetch_iterator) - tbm_end_shared_iterate(node->shared_prefetch_iterator); if (node->tbm) tbm_free(node->tbm); if (node->vmbuffer != InvalidBuffer) @@ -627,7 +553,6 @@ ExecReScanBitmapHeapScan(BitmapHeapScanState *node) node->prefetch_iterator = NULL; node->initialized = false; node->shared_tbmiterator = NULL; - node->shared_prefetch_iterator = NULL; node->vmbuffer = InvalidBuffer; node->pvmbuffer = InvalidBuffer; @@ -683,8 +608,6 @@ ExecEndBitmapHeapScan(BitmapHeapScanState *node) tbm_free(node->tbm); if (node->shared_tbmiterator) tbm_end_shared_iterate(node->shared_tbmiterator); - if (node->shared_prefetch_iterator) - tbm_end_shared_iterate(node->shared_prefetch_iterator); if (node->vmbuffer != InvalidBuffer) ReleaseBuffer(node->vmbuffer); if (node->pvmbuffer != InvalidBuffer) @@ -739,7 +662,6 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags) scanstate->pscan_len = 0; scanstate->initialized = false; scanstate->shared_tbmiterator = NULL; - scanstate->shared_prefetch_iterator = NULL; scanstate->pstate = NULL; /* @@ -794,8 +716,7 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags) * Maximum number of prefetches for the tablespace if configured, * otherwise the current value of the effective_io_concurrency GUC. */ - scanstate->prefetch_maximum = - get_tablespace_io_concurrency(currentRelation->rd_rel->reltablespace); + scanstate->prefetch_maximum = get_tablespace_io_concurrency(currentRelation->rd_rel->reltablespace); scanstate->ss.ss_currentRelation = currentRelation; diff --git a/src/backend/main/main.c b/src/backend/main/main.c index ed11e8be7fa..ebef569d1d8 100644 --- a/src/backend/main/main.c +++ b/src/backend/main/main.c @@ -32,6 +32,7 @@ #include "bootstrap/bootstrap.h" #include "common/username.h" +#include "miscadmin.h" #include "port/atomics.h" #include "postmaster/postmaster.h" #include "storage/spin.h" @@ -51,6 +52,41 @@ static void init_locale(const char *categoryname, int category, const char *loca static void help(const char *progname); static void check_root(const char *progname); +typedef int (*MainFunc) (int argc, char *argv[]); + +static int +CallExtMain(char *library_name, char *main_func_name, int argc, char *argv[], bool load_config) +{ + MainFunc main_func; + + /* + * Perform just enough initialization that we can load external libraries + */ + InitStandaloneProcess(argv[0]); + + SetProcessingMode(InitProcessing); + + /* + * Set default values for command-line options. + */ + InitializeGUCOptions(); + + /* Acquire configuration parameters */ + if (load_config && !SelectConfigFiles(NULL, progname)) + exit(1); + + /* + * Imitate we are early in bootstrap loading shared_preload_libraries; + * neon extension sets PGC_POSTMASTER gucs requiring this. 
+ */ + process_shared_preload_libraries_in_progress = true; + + main_func = load_external_function(library_name, main_func_name, true, NULL); + + process_shared_preload_libraries_in_progress = false; + + return main_func(argc, argv); +} /* * Any Postgres server process begins execution here. @@ -194,6 +230,10 @@ main(int argc, char *argv[]) else if (argc > 1 && strcmp(argv[1], "--single") == 0) PostgresSingleUserMain(argc, argv, strdup(get_user_name_or_exit(progname))); + else if (argc > 1 && strcmp(argv[1], "--wal-redo") == 0) + CallExtMain("neon_walredo", "WalRedoMain", argc, argv, false); + else if (argc > 1 && strcmp(argv[1], "--sync-safekeepers") == 0) + CallExtMain("neon", "WalProposerSync", argc, argv, true); else PostmasterMain(argc, argv); /* the functions above should not return */ diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index ef475d95a18..964c05be9dc 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -153,6 +153,9 @@ bool enable_parallel_hash = true; bool enable_partition_pruning = true; bool enable_presorted_aggregate = true; bool enable_async_append = true; +bool enable_seqscan_prefetch = true; +bool enable_indexscan_prefetch = true; +bool enable_indexonlyscan_prefetch = true; typedef struct { diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c index feff7094351..63cd3d44d77 100644 --- a/src/backend/replication/walreceiver.c +++ b/src/backend/replication/walreceiver.c @@ -523,6 +523,13 @@ WalReceiverMain(void) if (endofwal) break; + /* + * Update WAL statistics, which are produced inside + * issue_xlog_fsync function. This is useful for counting + * WAL flushes, by querying pg_stat_wal. + */ + pgstat_report_wal(true); + /* Find the soonest wakeup time, to limit our nap. */ nextWakeup = TIMESTAMP_INFINITY; for (int i = 0; i < NUM_WALRCV_WAKEUPS; ++i) diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 4c53de08b9b..4ecfbd4a002 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -54,6 +54,7 @@ #include "access/transam.h" #include "access/xact.h" #include "access/xlog_internal.h" +#include "access/xloginsert.h" #include "access/xlogreader.h" #include "access/xlogrecovery.h" #include "access/xlogutils.h" @@ -130,6 +131,11 @@ bool log_replication_commands = false; */ bool wake_wal_senders = false; +/* + * Backpressure hook, detecting how much we should delay. + */ +uint64 (*delay_backend_us)(void) = NULL; + /* * xlogreader used for replication. Note that a WAL sender doing physical * replication does not need xlogreader to read WAL, but it needs one to @@ -253,8 +259,6 @@ static void WalSndWriteData(LogicalDecodingContext *ctx, XLogRecPtr lsn, Transac static void WalSndUpdateProgress(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid, bool skipped_xact); static XLogRecPtr WalSndWaitForWal(XLogRecPtr loc); -static void LagTrackerWrite(XLogRecPtr lsn, TimestampTz local_flush_time); -static TimeOffset LagTrackerRead(int head, XLogRecPtr lsn, TimestampTz now); static bool TransactionIdInRecentPast(TransactionId xid, uint32 epoch); static void WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo, @@ -2031,7 +2035,7 @@ ProcessStandbyMessage(void) /* * Remember that a walreceiver just confirmed receipt of lsn `lsn`. 
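delay_backend_us above is only a function-pointer slot; nothing in these hunks sets or calls it. A sketch of how the neon extension side might plug in a policy, assuming the pointer is exported through replication/walsender.h (which bufmgr.c starts including in this patchset); all names below are hypothetical:

#include "postgres.h"
#include "replication/walsender.h"

/*
 * Return how long (in microseconds) the calling backend should be throttled,
 * e.g. by comparing the current insert LSN against the write/flush/apply
 * positions reported back by the replicas; 0 means no backpressure.
 */
static uint64
backpressure_lag_us(void)
{
	return 0;					/* stub: never throttle */
}

/* called once from the extension's _PG_init() */
static void
install_backpressure_hook(void)
{
	delay_backend_us = backpressure_lag_us;
}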
*/ -static void +void PhysicalConfirmReceivedLocation(XLogRecPtr lsn) { bool changed = false; @@ -2070,21 +2074,41 @@ ProcessStandbyReplyMessage(void) flushPtr, applyPtr; bool replyRequested; - TimeOffset writeLag, - flushLag, - applyLag; - bool clearLagTimes; - TimestampTz now; TimestampTz replyTime; - static bool fullyAppliedLastTime = false; - /* the caller already consumed the msgtype byte */ writePtr = pq_getmsgint64(&reply_message); flushPtr = pq_getmsgint64(&reply_message); applyPtr = pq_getmsgint64(&reply_message); replyTime = pq_getmsgint64(&reply_message); replyRequested = pq_getmsgbyte(&reply_message); + ProcessStandbyReply(writePtr, + flushPtr, + applyPtr, + replyTime, + replyRequested); + + elog(LOG, "ProcessStandbyReplyMessage: writelsn %X/%X", + LSN_FORMAT_ARGS(writePtr)); + elog(LOG, "ProcessStandbyReplyMessage: flushlsn %X/%X", + LSN_FORMAT_ARGS(flushPtr)); + elog(LOG, "ProcessStandbyReplyMessage: applylsn %X/%X", + LSN_FORMAT_ARGS(applyPtr)); +} + +void +ProcessStandbyReply(XLogRecPtr writePtr, + XLogRecPtr flushPtr, + XLogRecPtr applyPtr, + TimestampTz replyTime, + bool replyRequested) +{ + TimeOffset writeLag, + flushLag, + applyLag; + bool clearLagTimes; + TimestampTz now; + static bool fullyAppliedLastTime = false; if (message_level_is_interesting(DEBUG2)) { @@ -2267,7 +2291,16 @@ ProcessStandbyHSFeedbackMessage(void) feedbackEpoch = pq_getmsgint(&reply_message, 4); feedbackCatalogXmin = pq_getmsgint(&reply_message, 4); feedbackCatalogEpoch = pq_getmsgint(&reply_message, 4); + ProcessStandbyHSFeedback(replyTime, feedbackXmin, feedbackEpoch, feedbackCatalogXmin, feedbackCatalogEpoch); +} +void +ProcessStandbyHSFeedback(TimestampTz replyTime, + TransactionId feedbackXmin, + uint32 feedbackEpoch, + TransactionId feedbackCatalogXmin, + uint32 feedbackCatalogEpoch) +{ if (message_level_is_interesting(DEBUG2)) { char *replyTimeStr; @@ -2952,9 +2985,12 @@ XLogSendPhysical(void) /* * OK to read and send the slice. */ - resetStringInfo(&output_message); - pq_sendbyte(&output_message, 'w'); + if (output_message.data) + resetStringInfo(&output_message); + else + initStringInfo(&output_message); + pq_sendbyte(&output_message, 'w'); pq_sendint64(&output_message, startptr); /* dataStart */ pq_sendint64(&output_message, SendRqstPtr); /* walEnd */ pq_sendint64(&output_message, 0); /* sendtime, filled in last */ @@ -3140,8 +3176,8 @@ WalSndDone(WalSndSendDataCallback send_data) * flush location if valid, write otherwise. Tools like pg_receivewal will * usually (unless in synchronous mode) return an invalid flush location. */ - replicatedPtr = XLogRecPtrIsInvalid(MyWalSnd->flush) ? - MyWalSnd->write : MyWalSnd->flush; + // XXX Zenith uses flush_lsn to pass extra payload, so use write_lsn here + replicatedPtr = MyWalSnd->write; if (WalSndCaughtUp && sentPtr == replicatedPtr && !pq_is_send_pending()) @@ -3753,7 +3789,7 @@ WalSndKeepaliveIfNecessary(void) * eventually reported to have been written, flushed and applied by the * standby in a reply message. */ -static void +void LagTrackerWrite(XLogRecPtr lsn, TimestampTz local_flush_time) { bool buffer_full; @@ -3818,7 +3854,7 @@ LagTrackerWrite(XLogRecPtr lsn, TimestampTz local_flush_time) * Return -1 if no new sample data is available, and otherwise the elapsed * time in microseconds. 
*/ -static TimeOffset +TimeOffset LagTrackerRead(int head, XLogRecPtr lsn, TimestampTz now) { TimestampTz time = 0; diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c index 0057443f0c6..f2b4c0e4945 100644 --- a/src/backend/storage/buffer/buf_init.c +++ b/src/backend/storage/buffer/buf_init.c @@ -24,6 +24,14 @@ ConditionVariableMinimallyPadded *BufferIOCVArray; WritebackContext BackendWritebackContext; CkptSortItem *CkptBufferIds; +/* + * Buffer with target WAL redo page. + * We must not evict this page from the buffer pool, but we cannot just keep it pinned because + * some WAL redo functions expect the page to not be pinned. So we have a special check in + * localbuf.c to prevent this buffer from being evicted. + */ +Buffer wal_redo_buffer; +bool am_wal_redo_postgres = false; /* * Data Structures: diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 4343178ff96..999333fb5e8 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -52,6 +52,7 @@ #include "storage/proc.h" #include "storage/smgr.h" #include "storage/standby.h" +#include "replication/walsender.h" #include "utils/memdebug.h" #include "utils/ps_status.h" #include "utils/rel.h" @@ -160,6 +161,10 @@ int checkpoint_flush_after = DEFAULT_CHECKPOINT_FLUSH_AFTER; int bgwriter_flush_after = DEFAULT_BGWRITER_FLUSH_AFTER; int backend_flush_after = DEFAULT_BACKEND_FLUSH_AFTER; +/* Evict unpinned pages (for better test coverage) */ +bool zenith_test_evict = false; + + /* local state for LockBufferForCleanup */ static BufferDesc *PinCountWaitBuf = NULL; @@ -798,7 +803,8 @@ ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forkNum, { bool hit; - SMgrRelation smgr = smgropen(rlocator, InvalidBackendId); + SMgrRelation smgr = smgropen(rlocator, InvalidBackendId, + RELPERSISTENCE_PERMANENT); return ReadBuffer_common(smgr, permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED, forkNum, blockNum, @@ -998,7 +1004,11 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, bool found; IOContext io_context; IOObject io_object; - bool isLocalBuf = SmgrIsTemp(smgr); + /* + * wal_redo postgres is working in single user mode, we do not need to + * synchronize access to shared buffer, so let's use local buffers instead. + */ + bool isLocalBuf = SmgrIsTemp(smgr) || am_wal_redo_postgres; *hit = false; @@ -1993,6 +2003,10 @@ ExtendBufferedRelShared(BufferManagerRelation bmr, buffers[i] = BufferDescriptorGetBuffer(existing_hdr); buf_block = BufHdrGetBlock(existing_hdr); + /* + * NEON: In earlier Neon PostgreSQL versions, we zeroed the pages + * and DEBUG1-logged this instead. 
+ */ if (valid && !PageIsNew((Page) buf_block)) ereport(ERROR, (errmsg("unexpected data beyond EOF in block %u of relation %s", @@ -2455,6 +2469,32 @@ UnpinBuffer(BufferDesc *buf) UnlockBufHdr(buf, buf_state); } ForgetPrivateRefCountEntry(ref); + + if (zenith_test_evict && !InRecovery) + { + buf_state = LockBufHdr(buf); + if (BUF_STATE_GET_REFCOUNT(buf_state) == 0) + { + if (buf_state & BM_DIRTY) + { + ReservePrivateRefCountEntry(); + PinBuffer_Locked(buf); + if (LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf), + LW_SHARED)) + { + FlushOneBuffer(b); + LWLockRelease(BufferDescriptorGetContentLock(buf)); + } + UnpinBuffer(buf); + } + else + { + InvalidateBuffer(buf); + } + } + else + UnlockBufHdr(buf, buf_state); + } } } @@ -3373,7 +3413,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, /* Find smgr relation for buffer */ if (reln == NULL) - reln = smgropen(BufTagGetRelFileLocator(&buf->tag), InvalidBackendId); + reln = smgropen(BufTagGetRelFileLocator(&buf->tag), InvalidBackendId, 0); TRACE_POSTGRESQL_BUFFER_FLUSH_START(BufTagGetForkNum(&buf->tag), buf->tag.blockNum, @@ -4277,7 +4317,8 @@ RelationCopyStorageUsingBuffer(RelFileLocator srclocator, use_wal = XLogIsNeeded() && (permanent || forkNum == INIT_FORKNUM); /* Get number of blocks in the source relation. */ - nblocks = smgrnblocks(smgropen(srclocator, InvalidBackendId), + nblocks = smgrnblocks(smgropen(srclocator, InvalidBackendId, + permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED), forkNum); /* Nothing to copy; just return. */ @@ -4289,8 +4330,9 @@ RelationCopyStorageUsingBuffer(RelFileLocator srclocator, * relation before starting to copy block by block. */ memset(buf.data, 0, BLCKSZ); - smgrextend(smgropen(dstlocator, InvalidBackendId), forkNum, nblocks - 1, - buf.data, true); + smgrextend(smgropen(dstlocator, InvalidBackendId, + permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED), + forkNum, nblocks - 1, buf.data, true); /* This is a bulk operation, so use buffer access strategies. 
*/ bstrategy_src = GetAccessStrategy(BAS_BULKREAD); @@ -4371,9 +4413,9 @@ CreateAndCopyRelationData(RelFileLocator src_rlocator, for (ForkNumber forkNum = MAIN_FORKNUM + 1; forkNum <= MAX_FORKNUM; forkNum++) { - if (smgrexists(smgropen(src_rlocator, InvalidBackendId), forkNum)) + if (smgrexists(smgropen(src_rlocator, InvalidBackendId, relpersistence), forkNum)) { - smgrcreate(smgropen(dst_rlocator, InvalidBackendId), forkNum, false); + smgrcreate(smgropen(dst_rlocator, InvalidBackendId, relpersistence), forkNum, false); /* * WAL log creation if the relation is persistent, or this is the @@ -5562,7 +5604,7 @@ IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context) i += ahead; /* and finally tell the kernel to write the data to storage */ - reln = smgropen(currlocator, InvalidBackendId); + reln = smgropen(currlocator, InvalidBackendId, 0); smgrwriteback(reln, BufTagGetForkNum(&tag), tag.blockNum, nblocks); } diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index 567b8d15ef0..e7e684c3613 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -18,6 +18,7 @@ #include "access/parallel.h" #include "catalog/catalog.h" #include "executor/instrument.h" +#include "miscadmin.h" #include "pgstat.h" #include "storage/buf_internals.h" #include "storage/bufmgr.h" @@ -25,6 +26,8 @@ #include "utils/memutils.h" #include "utils/resowner_private.h" +/* NEON: prevent eviction of the buffer of target page */ +extern Buffer wal_redo_buffer; /*#define LBDEBUG*/ @@ -198,6 +201,12 @@ GetLocalVictimBuffer(void) if (LocalRefCount[victim_bufid] == 0) { + if (-victim_bufid - 1 == wal_redo_buffer) + { + /* ZENITH: Prevent eviction of the buffer with target wal redo page */ + continue; + } + buf_state = pg_atomic_read_u32(&bufHdr->state); if (BUF_STATE_GET_USAGECOUNT(buf_state) > 0) @@ -239,7 +248,10 @@ GetLocalVictimBuffer(void) Page localpage = (char *) LocalBufHdrGetBlock(bufHdr); /* Find smgr relation for buffer */ - oreln = smgropen(BufTagGetRelFileLocator(&bufHdr->tag), MyBackendId); + if (am_wal_redo_postgres && MyBackendId == InvalidBackendId) + oreln = smgropen(BufTagGetRelFileLocator(&bufHdr->tag), MyBackendId, RELPERSISTENCE_PERMANENT); + else + oreln = smgropen(BufTagGetRelFileLocator(&bufHdr->tag), MyBackendId, RELPERSISTENCE_TEMP); PageSetChecksumInplace(localpage, bufHdr->tag.blockNum); diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt index 6c7cf6c2956..b4652c33ff6 100644 --- a/src/backend/storage/lmgr/lwlocknames.txt +++ b/src/backend/storage/lmgr/lwlocknames.txt @@ -53,3 +53,4 @@ XactTruncationLock 44 # 45 was XactTruncationLock until removal of BackendRandomLock WrapLimitsVacuumLock 46 NotifyQueueTailLock 47 +LastWrittenLsnLock 48 diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index fdecbad1709..cad698ec65d 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -1257,7 +1257,7 @@ DropRelationFiles(RelFileLocator *delrels, int ndelrels, bool isRedo) srels = palloc(sizeof(SMgrRelation) * ndelrels); for (i = 0; i < ndelrels; i++) { - SMgrRelation srel = smgropen(delrels[i], InvalidBackendId); + SMgrRelation srel = smgropen(delrels[i], InvalidBackendId, 0); if (isRedo) { @@ -1541,7 +1541,7 @@ _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) int mdsyncfiletag(const FileTag *ftag, char *path) { - SMgrRelation reln = smgropen(ftag->rlocator, InvalidBackendId); + SMgrRelation reln = 
smgropen(ftag->rlocator, InvalidBackendId, 0); File file; instr_time io_start; bool need_to_close; diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index 5d0f3d515c3..94a3e7b03b4 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -18,6 +18,7 @@ #include "postgres.h" #include "access/xlogutils.h" +#include "catalog/pg_tablespace.h" #include "lib/ilist.h" #include "storage/bufmgr.h" #include "storage/fd.h" @@ -28,69 +29,26 @@ #include "utils/inval.h" -/* - * This struct of function pointers defines the API between smgr.c and - * any individual storage manager module. Note that smgr subfunctions are - * generally expected to report problems via elog(ERROR). An exception is - * that smgr_unlink should use elog(WARNING), rather than erroring out, - * because we normally unlink relations during post-commit/abort cleanup, - * and so it's too late to raise an error. Also, various conditions that - * would normally be errors should be allowed during bootstrap and/or WAL - * recovery --- see comments in md.c for details. - */ -typedef struct f_smgr -{ - void (*smgr_init) (void); /* may be NULL */ - void (*smgr_shutdown) (void); /* may be NULL */ - void (*smgr_open) (SMgrRelation reln); - void (*smgr_close) (SMgrRelation reln, ForkNumber forknum); - void (*smgr_create) (SMgrRelation reln, ForkNumber forknum, - bool isRedo); - bool (*smgr_exists) (SMgrRelation reln, ForkNumber forknum); - void (*smgr_unlink) (RelFileLocatorBackend rlocator, ForkNumber forknum, - bool isRedo); - void (*smgr_extend) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, const void *buffer, bool skipFsync); - void (*smgr_zeroextend) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, int nblocks, bool skipFsync); - bool (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum); - void (*smgr_read) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, void *buffer); - void (*smgr_write) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, const void *buffer, bool skipFsync); - void (*smgr_writeback) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, BlockNumber nblocks); - BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum); - void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum, - BlockNumber nblocks); - void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum); -} f_smgr; - -static const f_smgr smgrsw[] = { +static const f_smgr smgr_md = { /* magnetic disk */ - { - .smgr_init = mdinit, - .smgr_shutdown = NULL, - .smgr_open = mdopen, - .smgr_close = mdclose, - .smgr_create = mdcreate, - .smgr_exists = mdexists, - .smgr_unlink = mdunlink, - .smgr_extend = mdextend, - .smgr_zeroextend = mdzeroextend, - .smgr_prefetch = mdprefetch, - .smgr_read = mdread, - .smgr_write = mdwrite, - .smgr_writeback = mdwriteback, - .smgr_nblocks = mdnblocks, - .smgr_truncate = mdtruncate, - .smgr_immedsync = mdimmedsync, - } + .smgr_init = mdinit, + .smgr_shutdown = NULL, + .smgr_open = mdopen, + .smgr_close = mdclose, + .smgr_create = mdcreate, + .smgr_exists = mdexists, + .smgr_unlink = mdunlink, + .smgr_extend = mdextend, + .smgr_zeroextend = mdzeroextend, + .smgr_prefetch = mdprefetch, + .smgr_read = mdread, + .smgr_write = mdwrite, + .smgr_writeback = mdwriteback, + .smgr_nblocks = mdnblocks, + .smgr_truncate = mdtruncate, + .smgr_immedsync = mdimmedsync, }; -static const int NSmgr = lengthof(smgrsw); - /* * Each backend has a hashtable that stores all extant 
SMgrRelation objects. * In addition, "unowned" SMgrRelation objects are chained together in a list. @@ -100,7 +58,7 @@ static HTAB *SMgrRelationHash = NULL; static dlist_head unowned_relns; /* local function prototypes */ -static void smgrshutdown(int code, Datum arg); +/* static void smgrshutdown(int code, Datum arg); */ /* @@ -114,40 +72,73 @@ static void smgrshutdown(int code, Datum arg); void smgrinit(void) { - int i; + (*smgr_init_hook)(); +} - for (i = 0; i < NSmgr; i++) - { - if (smgrsw[i].smgr_init) - smgrsw[i].smgr_init(); - } +/* Hook for plugins to get control in smgr */ +smgr_hook_type smgr_hook = NULL; +smgr_init_hook_type smgr_init_hook = smgr_init_standard; +smgr_shutdown_hook_type smgr_shutdown_hook = NULL; - /* register the shutdown proc */ - on_proc_exit(smgrshutdown, 0); +const f_smgr * +smgr_standard(BackendId backend, RelFileLocator rlocator) +{ + return &smgr_md; } -/* - * on_proc_exit hook for smgr cleanup during backend shutdown - */ -static void -smgrshutdown(int code, Datum arg) +///* +// * TODO: NEON v15- REMOVED this? +// * on_proc_exit hook for smgr cleanup during backend shutdown +// */ +//static void +//smgrshutdown(int code, Datum arg) +//{ +// int i; +// +// for (i = 0; i < NSmgr; i++) +// { +// if (smgrsw[i].smgr_shutdown) +// smgrsw[i].smgr_shutdown(); +// } +//} + +void +smgr_init_standard(void) { - int i; + mdinit(); +} - for (i = 0; i < NSmgr; i++) - { - if (smgrsw[i].smgr_shutdown) - smgrsw[i].smgr_shutdown(); - } +void +smgr_shutdown_standard(void) +{ + /* no-op */ +} + +const f_smgr * +smgr(BackendId backend, RelFileLocator rlocator) +{ + const f_smgr *result; + + if (smgr_hook) + result = (*smgr_hook)(backend, rlocator); + else + result = smgr_standard(backend, rlocator); + + return result; } /* * smgropen() -- Return an SMgrRelation object, creating it if need be. * * This does not attempt to actually open the underlying file. + * + * The caller should pass the value of pg_class.relpersistence, if they know + * it, or 0 if unknown. Some operations, like smgrwrite() and smgrunlink() + * are allowed when relpersistence is not known, but others like smgrread() + * require it. */ SMgrRelation -smgropen(RelFileLocator rlocator, BackendId backend) +smgropen(RelFileLocator rlocator, BackendId backend, char relpersistence) { RelFileLocatorBackend brlocator; SMgrRelation reln; @@ -178,16 +169,34 @@ smgropen(RelFileLocator rlocator, BackendId backend) /* hash_search already filled in the lookup key */ reln->smgr_owner = NULL; reln->smgr_targblock = InvalidBlockNumber; + reln->smgr_relpersistence = relpersistence; + for (int i = 0; i <= MAX_FORKNUM; ++i) reln->smgr_cached_nblocks[i] = InvalidBlockNumber; - reln->smgr_which = 0; /* we only have md.c at present */ + + reln->smgr = smgr(backend, rlocator); /* implementation-specific initialization */ - smgrsw[reln->smgr_which].smgr_open(reln); + (*reln->smgr).smgr_open(reln); /* it has no owner yet */ dlist_push_tail(&unowned_relns, &reln->node); } + else + { + /* + * If the caller passed a valid 'relpersistence', and it was unknown + * before, update it. 
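With smgr() as the dispatcher, the f_smgr table used for a relation is chosen through smgr_hook, falling back to md.c via smgr_standard(). A minimal sketch of how a storage-manager extension could interpose, assuming the f_smgr struct and the hook variables are exported through storage/smgr.h as this patchset appears to do; remote_smgr_impl and smgr_selector are hypothetical names:

#include "postgres.h"
#include "storage/smgr.h"

/* callback table implemented elsewhere in the extension */
extern const f_smgr remote_smgr_impl;

static const f_smgr *
smgr_selector(BackendId backend, RelFileLocator rlocator)
{
	/* keep md.c for backend-local (temporary) relations */
	if (backend != InvalidBackendId)
		return smgr_standard(backend, rlocator);
	return &remote_smgr_impl;
}

/* called once from the extension's _PG_init() */
static void
install_smgr_hook(void)
{
	smgr_hook = smgr_selector;
}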
+ */ + if (reln->smgr_relpersistence == 0) + reln->smgr_relpersistence = relpersistence; + else + { + if (!(relpersistence == 0 || reln->smgr_relpersistence == relpersistence)) + elog(ERROR, "relpersistence mismatch: smgropen %c vs SmgrRelation %c", + relpersistence, reln->smgr_relpersistence); + } + } return reln; } @@ -250,7 +259,7 @@ smgrclearowner(SMgrRelation *owner, SMgrRelation reln) bool smgrexists(SMgrRelation reln, ForkNumber forknum) { - return smgrsw[reln->smgr_which].smgr_exists(reln, forknum); + return (*reln->smgr).smgr_exists(reln, forknum); } /* @@ -263,7 +272,7 @@ smgrclose(SMgrRelation reln) ForkNumber forknum; for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) - smgrsw[reln->smgr_which].smgr_close(reln, forknum); + (*reln->smgr).smgr_close(reln, forknum); owner = reln->smgr_owner; @@ -293,7 +302,7 @@ smgrrelease(SMgrRelation reln) { for (ForkNumber forknum = 0; forknum <= MAX_FORKNUM; forknum++) { - smgrsw[reln->smgr_which].smgr_close(reln, forknum); + (*reln->smgr).smgr_close(reln, forknum); reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber; } reln->smgr_targblock = InvalidBlockNumber; @@ -373,7 +382,7 @@ smgrcloserellocator(RelFileLocatorBackend rlocator) void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo) { - smgrsw[reln->smgr_which].smgr_create(reln, forknum, isRedo); + (*reln->smgr).smgr_create(reln, forknum, isRedo); } /* @@ -401,12 +410,10 @@ smgrdosyncall(SMgrRelation *rels, int nrels) */ for (i = 0; i < nrels; i++) { - int which = rels[i]->smgr_which; - for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) { - if (smgrsw[which].smgr_exists(rels[i], forknum)) - smgrsw[which].smgr_immedsync(rels[i], forknum); + if ((*rels[i]->smgr).smgr_exists(rels[i], forknum)) + (*rels[i]->smgr).smgr_immedsync(rels[i], forknum); } } } @@ -445,13 +452,12 @@ smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo) for (i = 0; i < nrels; i++) { RelFileLocatorBackend rlocator = rels[i]->smgr_rlocator; - int which = rels[i]->smgr_which; rlocators[i] = rlocator; /* Close the forks at smgr level */ for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) - smgrsw[which].smgr_close(rels[i], forknum); + (*rels[i]->smgr).smgr_close(rels[i], forknum); } /* @@ -475,10 +481,8 @@ smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo) for (i = 0; i < nrels; i++) { - int which = rels[i]->smgr_which; - for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) - smgrsw[which].smgr_unlink(rlocators[i], forknum, isRedo); + (*rels[i]->smgr).smgr_unlink(rlocators[i], forknum, isRedo); } pfree(rlocators); @@ -498,8 +502,7 @@ void smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync) { - smgrsw[reln->smgr_which].smgr_extend(reln, forknum, blocknum, - buffer, skipFsync); + (*reln->smgr).smgr_extend(reln, forknum, blocknum, buffer, skipFsync); /* * Normally we expect this to increase nblocks by one, but if the cached @@ -523,8 +526,7 @@ void smgrzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks, bool skipFsync) { - smgrsw[reln->smgr_which].smgr_zeroextend(reln, forknum, blocknum, - nblocks, skipFsync); + (*reln->smgr).smgr_zeroextend(reln, forknum, blocknum, nblocks, skipFsync); /* * Normally we expect this to increase the fork size by nblocks, but if @@ -547,7 +549,7 @@ smgrzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, bool smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) { - return smgrsw[reln->smgr_which].smgr_prefetch(reln, forknum, 
blocknum); + return (*reln->smgr).smgr_prefetch(reln, forknum, blocknum); } /* @@ -562,7 +564,7 @@ void smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void *buffer) { - smgrsw[reln->smgr_which].smgr_read(reln, forknum, blocknum, buffer); + (*reln->smgr).smgr_read(reln, forknum, blocknum, buffer); } /* @@ -584,8 +586,8 @@ void smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync) { - smgrsw[reln->smgr_which].smgr_write(reln, forknum, blocknum, - buffer, skipFsync); + (*reln->smgr).smgr_write(reln, forknum, blocknum, + buffer, skipFsync); } @@ -597,8 +599,7 @@ void smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks) { - smgrsw[reln->smgr_which].smgr_writeback(reln, forknum, blocknum, - nblocks); + (*reln->smgr).smgr_writeback(reln, forknum, blocknum, nblocks); } /* @@ -615,7 +616,7 @@ smgrnblocks(SMgrRelation reln, ForkNumber forknum) if (result != InvalidBlockNumber) return result; - result = smgrsw[reln->smgr_which].smgr_nblocks(reln, forknum); + result = (*reln->smgr).smgr_nblocks(reln, forknum); reln->smgr_cached_nblocks[forknum] = result; @@ -681,7 +682,7 @@ smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nb /* Make the cached size is invalid if we encounter an error. */ reln->smgr_cached_nblocks[forknum[i]] = InvalidBlockNumber; - smgrsw[reln->smgr_which].smgr_truncate(reln, forknum[i], nblocks[i]); + (*reln->smgr).smgr_truncate(reln, forknum[i], nblocks[i]); /* * We might as well update the local smgr_cached_nblocks values. The @@ -720,9 +721,34 @@ smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nb void smgrimmedsync(SMgrRelation reln, ForkNumber forknum) { - smgrsw[reln->smgr_which].smgr_immedsync(reln, forknum); + (*reln->smgr).smgr_immedsync(reln, forknum); } +/* + * Neon-added functions to mark the phases of an unlogged index build. 
+ */ +void +smgr_start_unlogged_build(SMgrRelation reln) +{ + if ((*reln->smgr).smgr_start_unlogged_build) + (*reln->smgr).smgr_start_unlogged_build(reln); +} + +void +smgr_finish_unlogged_build_phase_1(SMgrRelation reln) +{ + if ((*reln->smgr).smgr_finish_unlogged_build_phase_1) + (*reln->smgr).smgr_finish_unlogged_build_phase_1(reln); +} + +void +smgr_end_unlogged_build(SMgrRelation reln) +{ + if ((*reln->smgr).smgr_end_unlogged_build) + (*reln->smgr).smgr_end_unlogged_build(reln); +} + + /* * AtEOXact_SMgr * diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 36cc99ec9cf..a0c3474f291 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -169,6 +169,8 @@ static ProcSignalReason RecoveryConflictReason; static MemoryContext row_description_context = NULL; static StringInfoData row_description_buf; +process_interrupts_callback_t ProcessInterruptsCallback; + /* ---------------------------------------------------------------- * decls for routines only used in this file * ---------------------------------------------------------------- @@ -3200,6 +3202,7 @@ ProcessInterrupts(void) return; InterruptPending = false; +retry: if (ProcDiePending) { ProcDiePending = false; @@ -3447,6 +3450,13 @@ ProcessInterrupts(void) if (ParallelApplyMessagePending) HandleParallelApplyMessages(); + + /* Call registered callback if any */ + if (ProcessInterruptsCallback) + { + if (ProcessInterruptsCallback()) + goto retry; + } } /* diff --git a/src/backend/utils/activity/wait_event.c b/src/backend/utils/activity/wait_event.c index 7940d646392..de9e71e09e0 100644 --- a/src/backend/utils/activity/wait_event.c +++ b/src/backend/utils/activity/wait_event.c @@ -515,6 +515,9 @@ pgstat_get_wait_timeout(WaitEventTimeout w) case WAIT_EVENT_VACUUM_TRUNCATE: event_name = "VacuumTruncate"; break; + case WAIT_EVENT_BACK_PRESSURE: + event_name = "BackPressure"; + break; /* no default case, so that compiler will warn */ } diff --git a/src/backend/utils/adt/dbsize.c b/src/backend/utils/adt/dbsize.c index e5c0f1c45b6..39f7107ed9f 100644 --- a/src/backend/utils/adt/dbsize.c +++ b/src/backend/utils/adt/dbsize.c @@ -24,6 +24,7 @@ #include "commands/tablespace.h" #include "miscadmin.h" #include "storage/fd.h" +#include "storage/smgr.h" #include "utils/acl.h" #include "utils/builtins.h" #include "utils/numeric.h" @@ -112,6 +113,8 @@ db_dir_size(const char *path) return dirsize; } +dbsize_hook_type dbsize_hook = NULL; + /* * calculate size of database in all tablespaces */ @@ -141,6 +144,13 @@ calculate_database_size(Oid dbOid) /* Include pg_default storage */ snprintf(pathname, sizeof(pathname), "base/%u", dbOid); + + if (dbsize_hook) + { + totalsize = (*dbsize_hook)(dbOid); + return totalsize; + } + totalsize = db_dir_size(pathname); /* Scan the non-default tablespaces */ @@ -306,41 +316,18 @@ pg_tablespace_size_name(PG_FUNCTION_ARGS) * is no check here or at the call sites for that. 
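The retry label and ProcessInterruptsCallback added to ProcessInterrupts() above give an extension a place to park a backend until an external condition clears (in this patchset it pairs with the WAIT_EVENT_BACK_PRESSURE wait event). A sketch under the assumption that process_interrupts_callback_t is an argument-less callback returning bool, as the call site implies, and that the variable is exposed through miscadmin.h, which this patchset extends; the names are hypothetical:

#include "postgres.h"
#include "miscadmin.h"

/* hypothetical policy function, e.g. the one installed as delay_backend_us */
extern uint64 backpressure_lag_us(void);

/*
 * Returning true makes ProcessInterrupts() jump back to retry, so pending
 * die/cancel interrupts are re-checked after every wait.
 */
static bool
throttle_on_backpressure(void)
{
	uint64		lag = backpressure_lag_us();

	if (lag == 0)
		return false;
	pg_usleep((long) lag);
	return true;
}

/* called once from the extension's _PG_init() */
static void
install_interrupts_callback(void)
{
	ProcessInterruptsCallback = throttle_on_backpressure;
}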
*/ static int64 -calculate_relation_size(RelFileLocator *rfn, BackendId backend, ForkNumber forknum) +calculate_relation_size(RelFileLocator *rfn, BackendId backend, + ForkNumber forknum, char relpersistence) { - int64 totalsize = 0; - char *relationpath; - char pathname[MAXPGPATH]; - unsigned int segcount = 0; + SMgrRelation srel = smgropen(*rfn, backend, relpersistence); - relationpath = relpathbackend(*rfn, backend, forknum); - - for (segcount = 0;; segcount++) + if (smgrexists(srel, forknum)) { - struct stat fst; - - CHECK_FOR_INTERRUPTS(); - - if (segcount == 0) - snprintf(pathname, MAXPGPATH, "%s", - relationpath); - else - snprintf(pathname, MAXPGPATH, "%s.%u", - relationpath, segcount); - - if (stat(pathname, &fst) < 0) - { - if (errno == ENOENT) - break; - else - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not stat file \"%s\": %m", pathname))); - } - totalsize += fst.st_size; + BlockNumber n = smgrnblocks(srel, forknum); + return (int64) n * BLCKSZ; } - return totalsize; + return 0; } Datum @@ -364,7 +351,8 @@ pg_relation_size(PG_FUNCTION_ARGS) PG_RETURN_NULL(); size = calculate_relation_size(&(rel->rd_locator), rel->rd_backend, - forkname_to_number(text_to_cstring(forkName))); + forkname_to_number(text_to_cstring(forkName)), + rel->rd_rel->relpersistence); relation_close(rel, AccessShareLock); @@ -389,7 +377,8 @@ calculate_toast_table_size(Oid toastrelid) /* toast heap size, including FSM and VM size */ for (forkNum = 0; forkNum <= MAX_FORKNUM; forkNum++) size += calculate_relation_size(&(toastRel->rd_locator), - toastRel->rd_backend, forkNum); + toastRel->rd_backend, forkNum, + toastRel->rd_rel->relpersistence); /* toast index size, including FSM and VM size */ indexlist = RelationGetIndexList(toastRel); @@ -403,7 +392,8 @@ calculate_toast_table_size(Oid toastrelid) AccessShareLock); for (forkNum = 0; forkNum <= MAX_FORKNUM; forkNum++) size += calculate_relation_size(&(toastIdxRel->rd_locator), - toastIdxRel->rd_backend, forkNum); + toastIdxRel->rd_backend, forkNum, + toastIdxRel->rd_rel->relpersistence); relation_close(toastIdxRel, AccessShareLock); } @@ -432,7 +422,7 @@ calculate_table_size(Relation rel) */ for (forkNum = 0; forkNum <= MAX_FORKNUM; forkNum++) size += calculate_relation_size(&(rel->rd_locator), rel->rd_backend, - forkNum); + forkNum, rel->rd_rel->relpersistence); /* * Size of toast relation @@ -472,7 +462,8 @@ calculate_indexes_size(Relation rel) for (forkNum = 0; forkNum <= MAX_FORKNUM; forkNum++) size += calculate_relation_size(&(idxRel->rd_locator), idxRel->rd_backend, - forkNum); + forkNum, + idxRel->rd_rel->relpersistence); relation_close(idxRel, AccessShareLock); } diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index 8e08ca1c680..6e49533885b 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -3783,7 +3783,7 @@ RelationSetNewRelfilenumber(Relation relation, char persistence) * fails at this stage, the new cluster will need to be recreated * anyway. 
*/ - srel = smgropen(relation->rd_locator, relation->rd_backend); + srel = smgropen(relation->rd_locator, relation->rd_backend, persistence); smgrdounlinkall(&srel, 1, false); smgrclose(srel); } diff --git a/src/backend/utils/fmgr/dfmgr.c b/src/backend/utils/fmgr/dfmgr.c index b85d52c913c..2fdfa90e28d 100644 --- a/src/backend/utils/fmgr/dfmgr.c +++ b/src/backend/utils/fmgr/dfmgr.c @@ -36,6 +36,7 @@ #include "storage/shmem.h" #include "utils/hsearch.h" +download_extension_file_hook_type download_extension_file_hook = NULL; /* signature for PostgreSQL-specific library init function */ typedef void (*PG_init_t) (void); @@ -79,11 +80,13 @@ static void *internal_load_library(const char *libname); static void incompatible_module_error(const char *libname, const Pg_magic_struct *module_magic_data) pg_attribute_noreturn(); static bool file_exists(const char *name); -static char *expand_dynamic_library_name(const char *name); +static char *expand_dynamic_library_name(const char *name, bool *is_found); static void check_restricted_library_name(const char *name); static char *substitute_libpath_macro(const char *name); static char *find_in_dynamic_libpath(const char *basename); +static void neon_try_load(const char *name); + /* Magic structure that module needs to match to be accepted */ static const Pg_magic_struct magic_data = PG_MODULE_MAGIC_DATA; @@ -108,9 +111,20 @@ load_external_function(const char *filename, const char *funcname, char *fullname; void *lib_handle; void *retval; + bool is_found = true; /* Expand the possibly-abbreviated filename to an exact path name */ - fullname = expand_dynamic_library_name(filename); + fullname = expand_dynamic_library_name(filename, &is_found); + + // if file is not found, try to download it from compute_ctl + if (!is_found && download_extension_file_hook != NULL) + { + // try to download the file + elog(DEBUG3, "load_external_function: try to download file: %s", fullname); + neon_try_load(fullname); + // try to find file locally once again + fullname = expand_dynamic_library_name(filename, &is_found); + } /* Load the shared library, unless we already did */ lib_handle = internal_load_library(fullname); @@ -132,6 +146,47 @@ load_external_function(const char *filename, const char *funcname, return retval; } +void +neon_try_load(const char *name) +{ + bool have_slash; + char *request_name; + + // add .so suffix if it is not present + if (strstr(name, DLSUFFIX) == NULL) + { + request_name = psprintf("%s%s", name, DLSUFFIX); + elog(DEBUG3, "neon_try_load: add DLSUFFIX: %s", request_name); + } + else + { + request_name = pstrdup(name); + elog(DEBUG3, "neon_try_load: DLSUFFIX already present: %s", request_name); + } + + have_slash = (first_dir_separator(request_name) != NULL); + + if (strncmp(request_name, "$libdir/", strlen("$libdir/")) == 0) + { + char *new_request_name = psprintf("%s", request_name + strlen("$libdir/")); + pfree(request_name); + request_name = new_request_name; + + elog(DEBUG3, "neon_try_load: omit $libdir/: %s", request_name); + } + else if (have_slash) + { + ereport(ERROR, + (errcode(ERRCODE_INVALID_NAME), + errmsg("unexpected path in dynamic library name: %s", + name))); + } + + elog(DEBUG3, "neon_try_load: final request_name: %s", request_name); + + download_extension_file_hook(request_name, true); +} + /* * This function loads a shlib file without looking up any particular * function in it. 
If the same shlib has previously been loaded, @@ -144,13 +199,24 @@ void load_file(const char *filename, bool restricted) { char *fullname; + bool is_found = true; /* Apply security restriction if requested */ if (restricted) check_restricted_library_name(filename); /* Expand the possibly-abbreviated filename to an exact path name */ - fullname = expand_dynamic_library_name(filename); + fullname = expand_dynamic_library_name(filename, &is_found); + + // if file is not found, try to download it from compute_ctl + if (!is_found && download_extension_file_hook != NULL) + { + // try to download the file + elog(DEBUG3, "load_file: try to download file: %s", fullname); + neon_try_load(fullname); + // try to find file locally once again + fullname = expand_dynamic_library_name(filename, &is_found); + } /* Load the shared library */ (void) internal_load_library(fullname); @@ -168,7 +234,6 @@ lookup_external_function(void *filehandle, const char *funcname) return dlsym(filehandle, funcname); } - /* * Load the specified dynamic-link library file, unless it already is * loaded. Return the pg_dl* handle for the file. @@ -209,6 +274,7 @@ internal_load_library(const char *libname) errmsg("could not access file \"%s\": %m", libname))); + for (file_scanner = file_list; file_scanner != NULL && !SAME_INODE(stat_buf, *file_scanner); @@ -428,7 +494,7 @@ file_exists(const char *name) * The result will always be freshly palloc'd. */ static char * -expand_dynamic_library_name(const char *name) +expand_dynamic_library_name(const char *name, bool *is_found) { bool have_slash; char *new; @@ -474,9 +540,11 @@ expand_dynamic_library_name(const char *name) * If we can't find the file, just return the string as-is. The ensuing * load attempt will fail and report a suitable message. */ + *is_found = false; return pstrdup(name); } + /* * Check a restricted library name. 
It must begin with "$libdir/plugins/" * and there must not be any directory separators after that (this is diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 0f4ad24d8ec..0a366a9958c 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -31,6 +31,7 @@ #include "access/toast_compression.h" #include "access/twophase.h" #include "access/xlog_internal.h" +#include "access/xloginsert.h" #include "access/xlogprefetcher.h" #include "access/xlogrecovery.h" #include "archive/archive_module.h" @@ -69,6 +70,7 @@ #include "storage/large_object.h" #include "storage/pg_shmem.h" #include "storage/predicate.h" +#include "storage/smgr.h" #include "storage/standby.h" #include "tcop/tcopprot.h" #include "tsearch/ts_cache.h" @@ -800,6 +802,36 @@ StaticAssertDecl(lengthof(config_type_names) == (PGC_ENUM + 1), struct config_bool ConfigureNamesBool[] = { + { + {"enable_seqscan_prefetch", PGC_USERSET, RESOURCES_ASYNCHRONOUS, + gettext_noop("Enables prefetching of next pages in sequential scans."), + NULL, + GUC_EXPLAIN + }, + &enable_seqscan_prefetch, + true, + NULL, NULL, NULL + }, + { + {"enable_indexscan_prefetch", PGC_USERSET, RESOURCES_ASYNCHRONOUS, + gettext_noop("Enables prefetching of heap pages in index scans."), + NULL, + GUC_EXPLAIN + }, + &enable_indexscan_prefetch, + true, + NULL, NULL, NULL + }, + { + {"enable_indexonlyscan_prefetch", PGC_USERSET, RESOURCES_ASYNCHRONOUS, + gettext_noop("Enables prefetching of leaf pages in index-only scans."), + NULL, + GUC_EXPLAIN + }, + &enable_indexonlyscan_prefetch, + true, + NULL, NULL, NULL + }, { {"enable_seqscan", PGC_USERSET, QUERY_TUNING_METHOD, gettext_noop("Enables the planner's use of sequential-scan plans."), @@ -1997,6 +2029,16 @@ struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, + { + {"neon_test_evict", PGC_POSTMASTER, UNGROUPED, + gettext_noop("Evict unpinned pages (for better test coverage)"), + }, + &zenith_test_evict, + false, + NULL, NULL, NULL + }, + + /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL, NULL @@ -2263,6 +2305,16 @@ struct config_int ConfigureNamesInt[] = NULL, NULL, NULL }, + { + {"lsn_cache_size", PGC_POSTMASTER, UNGROUPED, + gettext_noop("Size of last written LSN cache used by Neon."), + NULL + }, + &lastWrittenLsnCacheSize, + 128*1024, 1024, INT_MAX, + NULL, NULL, NULL + }, + { {"temp_buffers", PGC_USERSET, RESOURCES_MEM, gettext_noop("Sets the maximum number of temporary buffers used by each session."), @@ -2811,6 +2863,42 @@ struct config_int ConfigureNamesInt[] = NULL, NULL, NULL }, + { + {"max_replication_apply_lag", PGC_POSTMASTER, REPLICATION_SENDING, + gettext_noop("Maximal apply lag between master and replicas."), + gettext_noop("When lag between minimal apply position of replica and current LSN exceeds this value," + "backends are blocked."), + GUC_UNIT_MB, + }, + &max_replication_apply_lag, + -1, -1, INT_MAX, /* it should not be smaller than maximal size of WAL record */ + NULL, NULL, NULL + }, + + { + {"max_replication_flush_lag", PGC_POSTMASTER, REPLICATION_SENDING, + gettext_noop("Maximal flush lag between master and replicas."), + gettext_noop("When lag between minimal flush position of replica and current LSN exceeds this value," + "backends are blocked"), + GUC_UNIT_MB, + }, + &max_replication_flush_lag, + -1, -1, INT_MAX, /* it should not be smaller than maximal size of WAL record */ + NULL, NULL, NULL + }, + + { + {"max_replication_write_lag", PGC_POSTMASTER, REPLICATION_SENDING, +
gettext_noop("Maximal write lag between master and replicas."), + gettext_noop("When lag between minimal write position of replica and current LSN exceeds this value," + "backends are blocked"), + GUC_UNIT_MB, + }, + &max_replication_write_lag, + -1, -1, INT_MAX, /* it should not be smaller than maximal size of WAL record */ + NULL, NULL, NULL + }, + { {"max_slot_wal_keep_size", PGC_SIGHUP, REPLICATION_SENDING, gettext_noop("Sets the maximum WAL size that can be reserved by replication slots."), diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index 8b84e230f1c..764e1e2e0ed 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -3100,6 +3100,7 @@ main(int argc, char *argv[]) {"locale-provider", required_argument, NULL, 15}, {"icu-locale", required_argument, NULL, 16}, {"icu-rules", required_argument, NULL, 17}, + {"sysid", required_argument, NULL, 18}, {NULL, 0, NULL, 0} }; @@ -3279,6 +3280,9 @@ main(int argc, char *argv[]) case 17: icu_rules = pg_strdup(optarg); break; + case 18: + boot_options = psprintf("%s -s %s", boot_options, optarg); + break; default: /* getopt_long already emitted a complaint */ pg_log_error_hint("Try \"%s --help\" for more information.", progname); diff --git a/src/bin/pg_waldump/pg_waldump.c b/src/bin/pg_waldump/pg_waldump.c index 96845e1a1ae..221aec84a21 100644 --- a/src/bin/pg_waldump/pg_waldump.c +++ b/src/bin/pg_waldump/pg_waldump.c @@ -29,9 +29,12 @@ #include "common/logging.h" #include "common/relpath.h" #include "getopt_long.h" +#include "port/pg_bitutils.h" #include "rmgrdesc.h" #include "storage/bufpage.h" +#define OFFSET_INVALID ((size_t)-1) + /* * NOTE: For any code change or issue fix here, it is highly recommended to * give a thought about doing the same in pg_walinspect contrib module as well. 
@@ -50,8 +53,10 @@ typedef struct XLogDumpPrivate XLogRecPtr startptr; XLogRecPtr endptr; bool endptr_reached; + char* input_filename; } XLogDumpPrivate; + typedef struct XLogDumpConfig { /* display options */ @@ -63,6 +68,8 @@ typedef struct XLogDumpConfig bool stats; bool stats_per_record; + bool ignore_format_errors; + /* filter options */ bool filter_by_rmgr[RM_MAX_ID + 1]; bool filter_by_rmgr_enabled; @@ -94,6 +101,34 @@ sigint_handler(SIGNAL_ARGS) } #endif +/* calculate ceil(log base 2) of num */ +static int +my_log2(long num) +{ + /* + * guard against too-large input, which would be invalid for + * pg_ceil_log2_*() + */ + if (num > LONG_MAX / 2) + num = LONG_MAX / 2; + +#if SIZEOF_LONG < 8 + return pg_ceil_log2_32(num); +#else + return pg_ceil_log2_64(num); +#endif +} + +/* calculate first power of 2 >= num, bounded to what will fit in an int */ +static int +next_pow2_int(long num) +{ + if (num > INT_MAX / 2) + num = INT_MAX / 2; + return 1 << my_log2(num); +} + + static void print_rmgr_list(void) { @@ -337,6 +372,18 @@ WALDumpOpenSegment(XLogReaderState *state, XLogSegNo nextSegNo, TimeLineID tli = *tli_p; char fname[MAXPGPATH]; int tries; + XLogDumpPrivate *private = state->private_data; + + if(private->input_filename) + { + Assert(nextSegNo == 0); + + state->seg.ws_file = open_file_in_directory(state->segcxt.ws_dir, private->input_filename); + if (state->seg.ws_file >= 0) + return; + + pg_fatal("could not open file \"%s\": %m", private->input_filename); + } XLogFileName(fname, tli, nextSegNo, state->segcxt.ws_segsize); @@ -407,6 +454,7 @@ WALDumpReadPage(XLogReaderState *state, XLogRecPtr targetPagePtr, int reqLen, { WALOpenSegment *seg = &errinfo.wre_seg; char fname[MAXPGPATH]; + char *actual_fname = private->input_filename ? private->input_filename : fname; XLogFileName(fname, seg->ws_tli, seg->ws_segno, state->segcxt.ws_segsize); @@ -415,11 +463,11 @@ WALDumpReadPage(XLogReaderState *state, XLogRecPtr targetPagePtr, int reqLen, { errno = errinfo.wre_errno; pg_fatal("could not read from file %s, offset %d: %m", - fname, errinfo.wre_off); + actual_fname, errinfo.wre_off); } else pg_fatal("could not read from file %s, offset %d: read %d of %d", - fname, errinfo.wre_off, errinfo.wre_read, + actual_fname, errinfo.wre_off, errinfo.wre_read, errinfo.wre_req); } @@ -495,7 +543,7 @@ XLogRecordSaveFPWs(XLogReaderState *record, const char *savepath) char forkname[FORKNAMECHARS + 2]; /* _ + terminating zero */ FILE *file; BlockNumber blk; - RelFileLocator rnode; + RelFileLocator rlocator; ForkNumber fork; if (!XLogRecHasBlockRef(record, block_id)) @@ -511,7 +559,7 @@ XLogRecordSaveFPWs(XLogReaderState *record, const char *savepath) pg_fatal("%s", record->errormsg_buf); (void) XLogRecGetBlockTagExtended(record, block_id, - &rnode, &fork, &blk, NULL); + &rlocator, &fork, &blk, NULL); if (fork >= 0 && fork <= MAX_FORKNUM) sprintf(forkname, "_%s", forkNames[fork]); @@ -521,7 +569,7 @@ XLogRecordSaveFPWs(XLogReaderState *record, const char *savepath) snprintf(filename, MAXPGPATH, "%s/%08X-%08X-%08X.%u.%u.%u.%u%s", savepath, record->seg.ws_tli, LSN_FORMAT_ARGS(record->ReadRecPtr), - rnode.spcOid, rnode.dbOid, rnode.relNumber, blk, forkname); + rlocator.spcOid, rlocator.dbOid, rlocator.relNumber, blk, forkname); file = fopen(filename, PG_BINARY_W); if (!file) @@ -547,16 +595,26 @@ XLogDumpDisplayRecord(XLogDumpConfig *config, XLogReaderState *record) uint32 fpi_len; uint8 info = XLogRecGetInfo(record); XLogRecPtr xl_prev = XLogRecGetPrev(record); + XLogDumpPrivate *private = 
record->private_data; StringInfoData s; XLogRecGetLen(record, &rec_len, &fpi_len); - printf("rmgr: %-11s len (rec/tot): %6u/%6u, tx: %10u, lsn: %X/%08X, prev %X/%08X, ", - desc->rm_name, - rec_len, XLogRecGetTotalLen(record), - XLogRecGetXid(record), - LSN_FORMAT_ARGS(record->ReadRecPtr), - LSN_FORMAT_ARGS(xl_prev)); + if(private->input_filename) + printf("rmgr: %-11s len (rec/tot): %6u/%6u, tx: %10u, offset: 0x%lX, prev %X/%08X, ", + desc->rm_name, + rec_len, XLogRecGetTotalLen(record), + XLogRecGetXid(record), + record->ReadRecPtr, + LSN_FORMAT_ARGS(xl_prev)); + else + printf("rmgr: %-11s len (rec/tot): %6u/%6u, tx: %10u, lsn: %X/%08X, prev %X/%08X, ", + desc->rm_name, + rec_len, XLogRecGetTotalLen(record), + XLogRecGetXid(record), + LSN_FORMAT_ARGS(record->ReadRecPtr), + LSN_FORMAT_ARGS(xl_prev)); + id = desc->rm_identify(info); if (id == NULL) @@ -762,7 +820,10 @@ usage(void) printf(_(" -f, --follow keep retrying after reaching end of WAL\n")); printf(_(" -F, --fork=FORK only show records that modify blocks in fork FORK;\n" " valid names are main, fsm, vm, init\n")); + printf(_(" -i, --ignore ignore format errors, skip invalid structures\n")); + printf(_(" -N, --file=FNAME dump log records from a single file\n")); printf(_(" -n, --limit=N number of records to display\n")); + printf(_(" -o, --offset=OFFSET offset of the first record to in a file to dump\n")); printf(_(" -p, --path=PATH directory in which to find WAL segment files or a\n" " directory with a ./pg_wal that contains such files\n" " (default: current directory, ./pg_wal, $PGDATA/pg_wal)\n")); @@ -797,6 +858,9 @@ main(int argc, char **argv) XLogRecPtr first_record; char *waldir = NULL; char *errormsg; + char *fName = NULL; + bool single_file = false; + size_t start_offset = OFFSET_INVALID; static struct option long_options[] = { {"bkp-details", no_argument, NULL, 'b'}, @@ -804,6 +868,9 @@ main(int argc, char **argv) {"end", required_argument, NULL, 'e'}, {"follow", no_argument, NULL, 'f'}, {"fork", required_argument, NULL, 'F'}, + {"file", required_argument, NULL, 'N'}, + {"ignore", no_argument, NULL, 'i'}, + {"offset", required_argument, NULL, 'o'}, {"fullpage", no_argument, NULL, 'w'}, {"help", no_argument, NULL, '?'}, {"limit", required_argument, NULL, 'n'}, @@ -853,6 +920,7 @@ main(int argc, char **argv) private.startptr = InvalidXLogRecPtr; private.endptr = InvalidXLogRecPtr; private.endptr_reached = false; + private.input_filename = NULL; config.quiet = false; config.bkp_details = false; @@ -871,6 +939,7 @@ main(int argc, char **argv) config.save_fullpage_path = NULL; config.stats = false; config.stats_per_record = false; + config.ignore_format_errors = false; stats.startptr = InvalidXLogRecPtr; stats.endptr = InvalidXLogRecPtr; @@ -881,7 +950,7 @@ main(int argc, char **argv) goto bad_argument; } - while ((option = getopt_long(argc, argv, "bB:e:fF:n:p:qr:R:s:t:wx:z", + while ((option = getopt_long(argc, argv, "bB:e:fF:in:N:o:p:qr:R:s:t:wx:z", long_options, &optindex)) != -1) { switch (option) @@ -920,6 +989,13 @@ main(int argc, char **argv) } config.filter_by_extended = true; break; + case 'N': + fName = pg_strdup(optarg); + single_file = true; + break; + case 'i': + config.ignore_format_errors = true; + break; case 'n': if (sscanf(optarg, "%d", &config.stop_after_records) != 1) { @@ -927,6 +1003,13 @@ main(int argc, char **argv) goto bad_argument; } break; + case 'o': + if (sscanf(optarg, "%zu", &start_offset) != 1) + { + pg_log_error("could not parse offset \"%s\"", optarg); + goto bad_argument; + } + break; case 'p': 
waldir = pg_strdup(optarg); break; @@ -1092,6 +1175,73 @@ main(int argc, char **argv) goto bad_argument; } + if (start_offset != OFFSET_INVALID) + { + if(!XLogRecPtrIsInvalid(private.startptr) || !XLogRecPtrIsInvalid(private.endptr)) + { + pg_log_error("either a file offset or start/end pointers should be specified"); + goto bad_argument; + } + + if(!single_file) + { + pg_log_error("offset option can only be used with the filename option"); + goto bad_argument; + } + + /* Log records are maxaligned, start at the closest next position */ + private.startptr = MAXALIGN(start_offset); + } + + if(single_file) + { + char *directory = NULL; + int fd; + struct stat stat; + + if(config.follow) + { + pg_log_error("--follow cannot be used in file dump mode"); + goto bad_argument; + } + + if (waldir != NULL) + { + pg_log_error("either a single file or a WAL directory should be specified"); + goto bad_argument; + } + + split_path(fName, &directory, &private.input_filename); + waldir = directory; + + if(waldir == NULL) + { + char *cwd = malloc(MAXPGPATH); + + if (!getcwd(cwd, MAXPGPATH)) + pg_fatal("could not identify current directory: %m"); + + waldir = cwd; + } + + if (!verify_directory(waldir)) + pg_fatal("could not open directory \"%s\": %m", waldir); + + fd = open_file_in_directory(waldir, private.input_filename); + if (fd < 0) + pg_fatal("could not open file \"%s\"", private.input_filename); + + if(fstat(fd, &stat) != 0) + pg_fatal("could not stat file \"%s\"", private.input_filename); + + private.endptr = stat.st_size; + + /* Round up segment size to next power of 2 or 1MB */ + WalSegSz = Max(next_pow2_int(private.endptr), 1024 * 1024); + + close(fd); + } + if (waldir != NULL) { /* validate path points to directory */ @@ -1113,6 +1263,12 @@ main(int argc, char **argv) int fd; XLogSegNo segno; + if(single_file) + { + pg_log_error("either a single file or start/end boundaries should be specified"); + goto bad_argument; + } + split_path(argv[optind], &directory, &fname); if (waldir == NULL && directory != NULL) @@ -1185,10 +1341,11 @@ main(int argc, char **argv) } } else - waldir = identify_target_directory(waldir, NULL); + if (!single_file) + waldir = identify_target_directory(waldir, NULL); /* we don't know what to print */ - if (XLogRecPtrIsInvalid(private.startptr)) + if (XLogRecPtrIsInvalid(private.startptr) && !single_file) { pg_log_error("no start WAL location given"); goto bad_argument; @@ -1206,12 +1363,27 @@ main(int argc, char **argv) if (!xlogreader_state) pg_fatal("out of memory while allocating a WAL reading processor"); - /* first find a valid recptr to start from */ - first_record = XLogFindNextRecord(xlogreader_state, private.startptr); + if(single_file) + { + if(config.ignore_format_errors) + { + xlogreader_state->skip_page_validation = true; + xlogreader_state->skip_invalid_records = true; + } + + xlogreader_state->skip_lsn_checks = true; + first_record = private.startptr; + XLogBeginRead(xlogreader_state, first_record); + } + else + { + /* first find a valid recptr to start from */ + first_record = XLogFindNextRecord(xlogreader_state, private.startptr); - if (first_record == InvalidXLogRecPtr) - pg_fatal("could not find a valid record after %X/%X", - LSN_FORMAT_ARGS(private.startptr)); + if (first_record == InvalidXLogRecPtr) + pg_fatal("could not find a valid record after %X/%X", + LSN_FORMAT_ARGS(private.startptr)); + } /* * Display a message that we're skipping data if `from` wasn't a pointer diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index
faf50265191..bab88d9c01e 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -72,6 +72,10 @@ typedef struct HeapScanDescData */ ParallelBlockTableScanWorkerData *rs_parallelworkerdata; + /* prefetch info */ + int rs_prefetch_maximum; /* io_concurrency of tablespace */ + int rs_prefetch_target; /* current readahead target */ + /* these fields only used in page-at-a-time mode and for bitmap scans */ int rs_cindex; /* current tuple's index in vistuples */ int rs_ntuples; /* number of visible tuples on page */ diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 9020abebc92..6a50f7a03ba 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -1076,6 +1076,22 @@ typedef struct BTScanOpaqueData /* keep these last in struct for efficiency */ BTScanPosData currPos; /* current position data */ BTScanPosData markPos; /* marked position, if any */ + + /* Neon: prefetch state */ + int prefetch_maximum; /* maximum number of prefetch requests */ + + /* Prefetch of referenced heap pages for index scan */ + /* To minimize wasted prefetch requests, we start with prefetch distance 0 + * and increase it until it reaches prefetch_maximum + */ + int current_prefetch_distance; + + /* Prefetch of leaf pages of B-Tree for index-only scan */ + int n_prefetch_requests; /* number of active prefetch requests */ + int n_prefetch_blocks; /* number of elements in prefetch_blocks */ + int last_prefetch_index; /* current position in prefetch_blocks (prefetch_blocks[0..last_prefetch_index] are already requested) */ + BlockNumber next_parent; /* pointer to next parent page */ + BlockNumber prefetch_blocks[MaxTIDsPerBTreePage + 1]; /* leaves + parent page */ } BTScanOpaqueData; typedef BTScanOpaqueData *BTScanOpaque; @@ -1242,6 +1258,7 @@ extern int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber extern bool _bt_first(IndexScanDesc scan, ScanDirection dir); extern bool _bt_next(IndexScanDesc scan, ScanDirection dir); extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost, + BlockNumber* parent, Snapshot snapshot); /* diff --git a/src/include/access/neon_xlog.h b/src/include/access/neon_xlog.h new file mode 100644 index 00000000000..465e1752b95 --- /dev/null +++ b/src/include/access/neon_xlog.h @@ -0,0 +1,132 @@ +#ifndef NEON_XLOG_H +#define NEON_XLOG_H + +/* + * The RMGR id of the Neon RMGR + * + * Reserved at https://wiki.postgresql.org/wiki/CustomWALResourceManagers + */ +#define RM_NEON_ID 134 + +#define XLOG_NEON_INIT_PAGE 0x80 +#define XLOG_NEON_OPMASK ((~XLOG_NEON_INIT_PAGE) & XLR_RMGR_INFO_MASK) + +/* from XLOG_HEAP_* */ +#define XLOG_NEON_HEAP_INSERT 0x00 +#define XLOG_NEON_HEAP_DELETE 0x10 +#define XLOG_NEON_HEAP_UPDATE 0x20 +#define XLOG_NEON_HEAP_HOT_UPDATE 0x30 +#define XLOG_NEON_HEAP_LOCK 0x40 +/* from XLOG_HEAP2_* */ +#define XLOG_NEON_HEAP_MULTI_INSERT 0x50 +/* 2 variants available */ + +/* */ +typedef struct xl_neon_heap_header { + uint16 t_infomask2; + uint16 t_infomask; + uint32 t_cid; + uint8 t_hoff; +} xl_neon_heap_header; + +#define SizeOfNeonHeapHeader (offsetof(xl_neon_heap_header, t_hoff) + sizeof(uint8)) + +/* This is what we need to know about insert */ +typedef struct xl_neon_heap_insert +{ + OffsetNumber offnum; /* inserted tuple's offset */ + uint8 flags; + + /* xl_neon_heap_header & TUPLE DATA in backup block 0 */ +} xl_neon_heap_insert; + +#define SizeOfNeonHeapInsert (offsetof(xl_neon_heap_insert, flags) + sizeof(uint8)) + +/* This is what we need to know about delete */ +typedef
struct xl_neon_heap_delete +{ + TransactionId xmax; /* xmax of the deleted tuple */ + OffsetNumber offnum; /* deleted tuple's offset */ + uint8 infobits_set; /* infomask bits */ + uint8 flags; + uint32 t_cid; +} xl_neon_heap_delete; + +#define SizeOfNeonHeapDelete (offsetof(xl_neon_heap_delete, t_cid) + sizeof(uint32)) + +/* + * This is what we need to know about update|hot_update + * + * Backup blk 0: new page + * + * If XLH_UPDATE_PREFIX_FROM_OLD or XLH_UPDATE_SUFFIX_FROM_OLD flags are set, + * the prefix and/or suffix come first, as one or two uint16s. + * + * After that, xl_neon_heap_header and new tuple data follow. The new tuple + * data doesn't include the prefix and suffix, which are copied from the + * old tuple on replay. + * + * If XLH_UPDATE_CONTAINS_NEW_TUPLE flag is given, the tuple data is + * included even if a full-page image was taken. + * + * Backup blk 1: old page, if different. (no data, just a reference to the blk) + */ +typedef struct xl_neon_heap_update +{ + TransactionId old_xmax; /* xmax of the old tuple */ + OffsetNumber old_offnum; /* old tuple's offset */ + uint8 old_infobits_set; /* infomask bits to set on old tuple */ + uint8 flags; + uint32 t_cid; + TransactionId new_xmax; /* xmax of the new tuple */ + OffsetNumber new_offnum; /* new tuple's offset */ + + /* + * If XLH_UPDATE_CONTAINS_OLD_TUPLE or XLH_UPDATE_CONTAINS_OLD_KEY flags + * are set, xl_neon_heap_header and tuple data for the old tuple follow. + */ +} xl_neon_heap_update; +#define SizeOfNeonHeapUpdate (offsetof(xl_neon_heap_update, new_offnum) + sizeof(OffsetNumber)) + +typedef struct xl_neon_heap_lock +{ + TransactionId xmax; /* might be a MultiXactId */ + uint32 t_cid; + OffsetNumber offnum; /* locked tuple's offset on page */ + uint8 infobits_set; /* infomask and infomask2 bits to set */ + uint8 flags; /* XLH_LOCK_* flag bits */ +} xl_neon_heap_lock; +#define SizeOfNeonHeapLock (offsetof(xl_neon_heap_lock, flags) + sizeof(uint8)) + +/* + * This is what we need to know about a multi-insert. + * + * The main data of the record consists of this xl_neon_heap_multi_insert header. + * 'offsets' array is omitted if the whole page is reinitialized + * (XLOG_HEAP_INIT_PAGE). + * + * In block 0's data portion, there is an xl_neon_multi_insert_tuple struct, + * followed by the tuple data for each tuple. There is padding to align + * each xl_neon_multi_insert_tuple struct. 
+ */ +typedef struct xl_neon_heap_multi_insert +{ + uint8 flags; + uint16 ntuples; + uint32 t_cid; + OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER]; +} xl_neon_heap_multi_insert; + +#define SizeOfNeonHeapMultiInsert offsetof(xl_neon_heap_multi_insert, offsets) + +typedef struct xl_neon_multi_insert_tuple +{ + uint16 datalen; /* size of tuple data that follows */ + uint16 t_infomask2; + uint16 t_infomask; + uint8 t_hoff; + /* TUPLE DATA FOLLOWS AT END OF STRUCT */ +} xl_neon_multi_insert_tuple; +#define SizeOfNeonMultiInsertTuple (offsetof(xl_neon_multi_insert_tuple, t_hoff) + sizeof(uint8)) + +#endif //NEON_XLOG_H diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 48ca8523810..7643bb0c85f 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -16,6 +16,8 @@ #include "datatype/timestamp.h" #include "lib/stringinfo.h" #include "nodes/pg_list.h" +#include "storage/block.h" +#include "storage/relfilelocator.h" /* Sync methods */ @@ -30,6 +32,15 @@ extern PGDLLIMPORT XLogRecPtr ProcLastRecPtr; extern PGDLLIMPORT XLogRecPtr XactLastRecEnd; extern PGDLLIMPORT XLogRecPtr XactLastCommitEnd; +/* + * Pseudo block number used to associate LSN with relation metadata (relation size) + */ +#define REL_METADATA_PSEUDO_BLOCKNO InvalidBlockNumber + +extern bool ZenithRecoveryRequested; +extern XLogRecPtr zenithLastRec; +extern bool zenithWriteOk; + /* these variables are GUC parameters related to XLOG */ extern PGDLLIMPORT int wal_segment_size; extern PGDLLIMPORT int min_wal_size_mb; @@ -53,6 +64,8 @@ extern PGDLLIMPORT bool track_wal_io_timing; extern PGDLLIMPORT int wal_decode_buffer_size; extern PGDLLIMPORT int CheckPointSegments; +extern int lastWrittenLsnCacheSize; + /* Archive modes */ typedef enum ArchiveMode @@ -246,6 +259,21 @@ extern XLogRecPtr GetFlushRecPtr(TimeLineID *insertTLI); extern TimeLineID GetWALInsertionTimeLine(void); extern XLogRecPtr GetLastImportantRecPtr(void); +/* neon specifics */ + +extern void SetLastWrittenLSNForBlock(XLogRecPtr lsn, RelFileLocator relfilenode, ForkNumber forknum, BlockNumber blkno); +extern void SetLastWrittenLSNForBlockRange(XLogRecPtr lsn, RelFileLocator relfilenode, ForkNumber forknum, BlockNumber from, BlockNumber n_blocks); +extern void SetLastWrittenLSNForDatabase(XLogRecPtr lsn); +extern void SetLastWrittenLSNForRelation(XLogRecPtr lsn, RelFileLocator relfilenode, ForkNumber forknum); +extern XLogRecPtr GetLastWrittenLSN(RelFileLocator relfilenode, ForkNumber forknum, BlockNumber blkno); + +extern void SetRedoStartLsn(XLogRecPtr RedoStartLSN); +extern XLogRecPtr GetRedoStartLsn(void); + +extern void SetZenithCurrentClusterSize(uint64 size); +extern uint64 GetZenithCurrentClusterSize(void); + + extern void SetWalWriterSleeping(bool sleeping); /* @@ -296,6 +324,8 @@ extern SessionBackupState get_backup_status(void); #define TABLESPACE_MAP "tablespace_map" #define TABLESPACE_MAP_OLD "tablespace_map.old" +#define ZENITH_SIGNAL_FILE "zenith.signal" + /* files to signal promotion to primary */ #define PROMOTE_SIGNAL_FILE "promote" diff --git a/src/include/access/xloginsert.h b/src/include/access/xloginsert.h index 31785dc578f..e29c27345ce 100644 --- a/src/include/access/xloginsert.h +++ b/src/include/access/xloginsert.h @@ -38,6 +38,10 @@ #define REGBUF_KEEP_DATA 0x10 /* include data even if a full-page image * is taken */ +extern int max_replication_apply_lag; +extern int max_replication_flush_lag; +extern int max_replication_write_lag; + /* prototypes for public functions in xloginsert.c: */ extern void 
XLogBeginInsert(void); extern void XLogSetRecordFlags(uint8 flags); diff --git a/src/include/access/xlogreader.h b/src/include/access/xlogreader.h index da32c7db772..35c25054f17 100644 --- a/src/include/access/xlogreader.h +++ b/src/include/access/xlogreader.h @@ -216,6 +216,10 @@ struct XLogReaderState /* Set when XLP_FIRST_IS_OVERWRITE_CONTRECORD is found */ XLogRecPtr overwrittenRecPtr; + /* Disable validation to allow dumping corrupt WAL */ + bool skip_page_validation; + bool skip_invalid_records; + bool skip_lsn_checks; /* ---------------------------------------- * Decoded representation of current record @@ -441,4 +445,7 @@ extern bool XLogRecGetBlockTagExtended(XLogReaderState *record, uint8 block_id, BlockNumber *blknum, Buffer *prefetch_buffer); +extern DecodedXLogRecord * +XLogReadRecordAlloc(XLogReaderState *state, size_t xl_tot_len, bool allow_oversized); + #endif /* XLOGREADER_H */ diff --git a/src/include/access/xlogrecovery.h b/src/include/access/xlogrecovery.h index 47c29350f5d..23080f8e087 100644 --- a/src/include/access/xlogrecovery.h +++ b/src/include/access/xlogrecovery.h @@ -136,6 +136,7 @@ extern void ShutdownWalRecovery(void); extern void RemovePromoteSignalFiles(void); extern bool HotStandbyActive(void); +extern void XLogWaitForReplayOf(XLogRecPtr redoEndRecPtr); extern XLogRecPtr GetXLogReplayRecPtr(TimeLineID *replayTLI); extern RecoveryPauseState GetRecoveryPauseState(void); extern void SetRecoveryPause(bool recoveryPause); diff --git a/src/include/access/xlogutils.h b/src/include/access/xlogutils.h index 5b77b11f508..153ab3e1398 100644 --- a/src/include/access/xlogutils.h +++ b/src/include/access/xlogutils.h @@ -81,6 +81,12 @@ typedef struct ReadLocalXLogPageNoWaitPrivate bool end_of_wal; /* true, when end of WAL is reached */ } ReadLocalXLogPageNoWaitPrivate; +/* + * Returns true if we shouldn't do REDO on that block in record indicated by + * block_id; false otherwise. 
+ */ +extern bool (*redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id); + extern XLogRedoAction XLogReadBufferForRedo(XLogReaderState *record, uint8 block_id, Buffer *buf); extern Buffer XLogInitBufferForRedo(XLogReaderState *record, uint8 block_id); diff --git a/src/include/commands/explain.h b/src/include/commands/explain.h index 3d3e632a0cc..e17e10b1ea4 100644 --- a/src/include/commands/explain.h +++ b/src/include/commands/explain.h @@ -46,6 +46,7 @@ typedef struct ExplainState bool timing; /* print detailed node timing */ bool summary; /* print total planning and execution timing */ bool settings; /* print modified settings */ + bool prefetch; /* print prefetch statistics */ bool generic; /* generate a generic plan */ ExplainFormat format; /* output format */ /* state for output formatting --- not reset for each new plan tree */ diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index 87e5e2183bd..3164c04dacc 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -15,6 +15,14 @@ #include "portability/instr_time.h" +/* Prefetch statistics */ +typedef struct +{ + int64 hits; + int64 misses; + int64 expired; + int64 duplicates; +} PrefetchInfo; /* * BufferUsage and WalUsage counters keep being incremented infinitely, @@ -37,6 +45,7 @@ typedef struct BufferUsage instr_time blk_write_time; /* time spent writing blocks */ instr_time temp_blk_read_time; /* time spent reading temp blocks */ instr_time temp_blk_write_time; /* time spent writing temp blocks */ + PrefetchInfo prefetch; /* prefetch statistics */ } BufferUsage; /* diff --git a/src/include/fmgr.h b/src/include/fmgr.h index b120f5e7fef..be711d2fcfc 100644 --- a/src/include/fmgr.h +++ b/src/include/fmgr.h @@ -797,4 +797,10 @@ extern PGDLLIMPORT fmgr_hook_type fmgr_hook; #define FmgrHookIsNeeded(fn_oid) \ (!needs_fmgr_hook ? false : (*needs_fmgr_hook)(fn_oid)) + + +/* download_extension_file_hook (filename, is_library) */ +typedef bool (*download_extension_file_hook_type) (const char *, bool); +extern PGDLLIMPORT download_extension_file_hook_type download_extension_file_hook; + #endif /* FMGR_H */ diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 14bd574fc24..95d19a761fc 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -107,6 +107,10 @@ extern PGDLLIMPORT volatile uint32 CritSectionCount; /* in tcop/postgres.c */ extern void ProcessInterrupts(void); +/* Callback called by ProcessInterrupts in a loop as long as it returns true.
*/ +typedef bool (*process_interrupts_callback_t)(void); +extern process_interrupts_callback_t ProcessInterruptsCallback; + /* Test whether an interrupt is pending */ #ifndef WIN32 #define INTERRUPTS_PENDING_CONDITION() \ @@ -504,4 +508,7 @@ extern void RestoreClientConnectionInfo(char *conninfo); /* in executor/nodeHash.c */ extern size_t get_hash_memory_limit(void); +/* in storage/buffer/buf_init.c */ +extern bool am_wal_redo_postgres; + #endif /* MISCADMIN_H */ diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 869465d6f80..c467dbf8d70 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -38,6 +38,7 @@ #include "nodes/plannodes.h" #include "nodes/tidbitmap.h" #include "partitioning/partdefs.h" +#include "storage/bufmgr.h" #include "storage/condition_variable.h" #include "utils/hsearch.h" #include "utils/queryenvironment.h" @@ -1698,6 +1699,15 @@ typedef struct ParallelBitmapHeapState char phs_snapshot_data[FLEXIBLE_ARRAY_MEMBER]; } ParallelBitmapHeapState; +typedef struct TBMIteratePrefetchResult +{ + BlockNumber blockno; /* page number containing tuples */ + int ntuples; /* -1 indicates lossy result */ + bool recheck; /* should the tuples be rechecked? */ + /* Note: recheck is always true if ntuples < 0 */ + OffsetNumber offsets[MaxHeapTuplesPerPage]; +} TBMIteratePrefetchResult; + /* ---------------- * BitmapHeapScanState information * @@ -1718,7 +1728,6 @@ typedef struct ParallelBitmapHeapState * pscan_len size of the shared memory for parallel bitmap * initialized is node is ready to iterate * shared_tbmiterator shared iterator - * shared_prefetch_iterator shared iterator for prefetching * pstate shared state for parallel bitmap scan * ---------------- */ @@ -1742,7 +1751,10 @@ typedef struct BitmapHeapScanState Size pscan_len; bool initialized; TBMSharedIterator *shared_tbmiterator; - TBMSharedIterator *shared_prefetch_iterator; + /* parallel worker private ring buffer with prefetch requests: it allows to access prefetch result from the same worker */ + TBMIteratePrefetchResult prefetch_requests[MAX_IO_CONCURRENCY]; + TBMIteratePrefetchResult tbmres_copy; /* copy of current iterator result */ + int prefetch_head; /* head position in ring buffer */ ParallelBitmapHeapState *pstate; } BitmapHeapScanState; diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index 6cf49705d3a..7b17d328fec 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -70,6 +70,10 @@ extern PGDLLIMPORT bool enable_parallel_hash; extern PGDLLIMPORT bool enable_partition_pruning; extern PGDLLIMPORT bool enable_presorted_aggregate; extern PGDLLIMPORT bool enable_async_append; +extern PGDLLIMPORT bool enable_seqscan_prefetch; +extern PGDLLIMPORT bool enable_indexscan_prefetch; +extern PGDLLIMPORT bool enable_indexonlyscan_prefetch; + extern PGDLLIMPORT int constraint_exclusion; extern double index_pages_fetched(double tuples_fetched, BlockNumber pages, diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 6d572c38204..597e23a7963 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -277,6 +277,9 @@ /* Define if you have a function readline library */ #undef HAVE_LIBREADLINE +/* Define to 1 if you have the `seccomp' library (-lseccomp). */ +#undef HAVE_LIBSECCOMP + /* Define to 1 if you have the `selinux' library (-lselinux). 
*/ #undef HAVE_LIBSELINUX diff --git a/src/include/pg_config_manual.h b/src/include/pg_config_manual.h index a1a93ad706e..2870e31d087 100644 --- a/src/include/pg_config_manual.h +++ b/src/include/pg_config_manual.h @@ -57,7 +57,7 @@ * version. Example: "ACME Postgres/1.2". Note that the string will appear * in a user-facing error message if an ABI mismatch is detected. */ -#define FMGR_ABI_EXTRA "PostgreSQL" +#define FMGR_ABI_EXTRA "Neon Postgres" /* * Maximum number of columns in an index. There is little point in making diff --git a/src/include/replication/walsender.h b/src/include/replication/walsender.h index 9df7e50f943..87447c14afa 100644 --- a/src/include/replication/walsender.h +++ b/src/include/replication/walsender.h @@ -12,6 +12,7 @@ #ifndef _WALSENDER_H #define _WALSENDER_H +#include "access/xlog.h" #include <signal.h> /* @@ -48,6 +49,25 @@ extern void WalSndWaitStopping(void); extern void HandleWalSndInitStopping(void); extern void WalSndRqstFileReload(void); +/* + * Hook to check for WAL receiving backpressure. + * Return value is in microseconds. + */ +extern uint64 (*delay_backend_us)(void); + +/* expose these so that they can be reused by the neon walproposer extension */ +extern void LagTrackerWrite(XLogRecPtr lsn, TimestampTz local_flush_time); +extern TimeOffset LagTrackerRead(int head, XLogRecPtr lsn, TimestampTz now); +extern void ProcessStandbyReply(XLogRecPtr writePtr, XLogRecPtr flushPtr, + XLogRecPtr applyPtr, TimestampTz replyTime, + bool replyRequested); +extern void PhysicalConfirmReceivedLocation(XLogRecPtr lsn); +extern void ProcessStandbyHSFeedback(TimestampTz replyTime, + TransactionId feedbackXmin, + uint32 feedbackEpoch, + TransactionId feedbackCatalogXmin, + uint32 feedbackCatalogEpoch); + /* * Remember that we want to wakeup walsenders later * diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index bc79a329a1a..96d92de318c 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -310,6 +310,8 @@ extern PGDLLIMPORT BufferDescPadded *BufferDescriptors; extern PGDLLIMPORT ConditionVariableMinimallyPadded *BufferIOCVArray; extern PGDLLIMPORT WritebackContext BackendWritebackContext; +extern Buffer wal_redo_buffer; + /* in localbuf.c */ extern PGDLLIMPORT BufferDesc *LocalBufferDescriptors; diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index b379c76e273..6727de5d0a0 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -137,6 +137,8 @@ extern PGDLLIMPORT int checkpoint_flush_after; extern PGDLLIMPORT int backend_flush_after; extern PGDLLIMPORT int bgwriter_flush_after; +extern bool zenith_test_evict; + /* in buf_init.c */ extern PGDLLIMPORT char *BufferBlocks; @@ -145,8 +147,8 @@ extern PGDLLIMPORT int NLocBuffer; extern PGDLLIMPORT Block *LocalBufferBlockPointers; extern PGDLLIMPORT int32 *LocalRefCount; -/* upper limit for effective_io_concurrency */ -#define MAX_IO_CONCURRENCY 1000 +/* upper limit for effective_io_concurrency (better to be a power of 2) */ +#define MAX_IO_CONCURRENCY 1024 /* special block number for ReadBuffer() */ #define P_NEW InvalidBlockNumber /* grow the file to get a new page */ diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index a9a179aabac..d6158a0d067 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -18,6 +18,14 @@ #include "storage/block.h" #include "storage/relfilelocator.h" +struct f_smgr; + +/* + * Neon: extended SMGR API.
+ * This define can be used by extensions to determine that they are being built for Neon. + */ +#define NEON_SMGR 1 + /* * smgr.c maintains a table of SMgrRelation objects, which are essentially * cached file handles. An SMgrRelation is created (if not already present) @@ -41,6 +49,9 @@ typedef struct SMgrRelationData /* rlocator is the hashtable lookup key, so it must be first! */ RelFileLocatorBackend smgr_rlocator; /* relation physical identifier */ + /* copy of pg_class.relpersistence, or 0 if not known */ + char smgr_relpersistence; + /* pointer to owning pointer, or NULL if none */ struct SMgrRelationData **smgr_owner; @@ -59,7 +70,7 @@ typedef struct SMgrRelationData * Fields below here are intended to be private to smgr.c and its * submodules. Do not touch them from elsewhere. */ - int smgr_which; /* storage manager selector */ + const struct f_smgr *smgr; /* * for md.c; per-fork arrays of the number of open segments @@ -77,8 +88,70 @@ typedef SMgrRelationData *SMgrRelation; #define SmgrIsTemp(smgr) \ RelFileLocatorBackendIsTemp((smgr)->smgr_rlocator) + +/* + * This struct of function pointers defines the API between smgr.c and + * any individual storage manager module. Note that smgr subfunctions are + * generally expected to report problems via elog(ERROR). An exception is + * that smgr_unlink should use elog(WARNING), rather than erroring out, + * because we normally unlink relations during post-commit/abort cleanup, + * and so it's too late to raise an error. Also, various conditions that + * would normally be errors should be allowed during bootstrap and/or WAL + * recovery --- see comments in md.c for details. + */ +typedef struct f_smgr +{ + void (*smgr_init) (void); /* may be NULL */ + void (*smgr_shutdown) (void); /* may be NULL */ + void (*smgr_open) (SMgrRelation reln); + void (*smgr_close) (SMgrRelation reln, ForkNumber forknum); + void (*smgr_create) (SMgrRelation reln, ForkNumber forknum, + bool isRedo); + bool (*smgr_exists) (SMgrRelation reln, ForkNumber forknum); + void (*smgr_unlink) (RelFileLocatorBackend rlocator, ForkNumber forknum, + bool isRedo); + void (*smgr_extend) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, const void *buffer, bool skipFsync); + void (*smgr_zeroextend) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, int nblocks, + bool skipFsync); + bool (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum); + void (*smgr_read) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, void *buffer); + void (*smgr_write) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, const void *buffer, bool skipFsync); + void (*smgr_writeback) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks); + BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum); + void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum, + BlockNumber nblocks); + void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum); + + void (*smgr_start_unlogged_build) (SMgrRelation reln); + void (*smgr_finish_unlogged_build_phase_1) (SMgrRelation reln); + void (*smgr_end_unlogged_build) (SMgrRelation reln); +} f_smgr; + +typedef void (*smgr_init_hook_type) (void); +typedef void (*smgr_shutdown_hook_type) (void); +extern PGDLLIMPORT smgr_init_hook_type smgr_init_hook; +extern PGDLLIMPORT smgr_shutdown_hook_type smgr_shutdown_hook; +extern void smgr_init_standard(void); +extern void smgr_shutdown_standard(void); + +/* Alternative implementation of calculate_database_size() */
+typedef int64 (*dbsize_hook_type) (Oid dbOid); +extern PGDLLIMPORT dbsize_hook_type dbsize_hook; + +typedef const f_smgr *(*smgr_hook_type) (BackendId backend, RelFileLocator rlocator); +extern PGDLLIMPORT smgr_hook_type smgr_hook; +extern const f_smgr *smgr_standard(BackendId backend, RelFileLocator rlocator); + +extern const f_smgr *smgr(BackendId backend, RelFileLocator rlocator); + extern void smgrinit(void); -extern SMgrRelation smgropen(RelFileLocator rlocator, BackendId backend); +extern SMgrRelation smgropen(RelFileLocator rlocator, BackendId backend, char relpersistence); extern bool smgrexists(SMgrRelation reln, ForkNumber forknum); extern void smgrsetowner(SMgrRelation *owner, SMgrRelation reln); extern void smgrclearowner(SMgrRelation *owner, SMgrRelation reln); @@ -110,4 +183,9 @@ extern void smgrimmedsync(SMgrRelation reln, ForkNumber forknum); extern void AtEOXact_SMgr(void); extern bool ProcessBarrierSmgrRelease(void); +/* Neon: Change relpersistence for unlogged index builds */ +extern void smgr_start_unlogged_build(SMgrRelation reln); +extern void smgr_finish_unlogged_build_phase_1(SMgrRelation reln); +extern void smgr_end_unlogged_build(SMgrRelation reln); + #endif /* SMGR_H */ diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h index 1426a353cd0..c4174da1b1b 100644 --- a/src/include/utils/rel.h +++ b/src/include/utils/rel.h @@ -572,7 +572,7 @@ static inline SMgrRelation RelationGetSmgr(Relation rel) { if (unlikely(rel->rd_smgr == NULL)) - smgrsetowner(&(rel->rd_smgr), smgropen(rel->rd_locator, rel->rd_backend)); + smgrsetowner(&(rel->rd_smgr), smgropen(rel->rd_locator, rel->rd_backend, rel->rd_rel->relpersistence)); return rel->rd_smgr; } diff --git a/src/include/utils/wait_event.h b/src/include/utils/wait_event.h index 518d3b0a1f7..d1f50cffab0 100644 --- a/src/include/utils/wait_event.h +++ b/src/include/utils/wait_event.h @@ -150,7 +150,8 @@ typedef enum WAIT_EVENT_REGISTER_SYNC_REQUEST, WAIT_EVENT_SPIN_DELAY, WAIT_EVENT_VACUUM_DELAY, - WAIT_EVENT_VACUUM_TRUNCATE + WAIT_EVENT_VACUUM_TRUNCATE, + WAIT_EVENT_BACK_PRESSURE } WaitEventTimeout; /* ---------- diff --git a/src/test/regress/expected/sequence.out b/src/test/regress/expected/sequence.out index 7cb2f7cc02b..f02f0205427 100644 --- a/src/test/regress/expected/sequence.out +++ b/src/test/regress/expected/sequence.out @@ -260,7 +260,7 @@ SELECT nextval('foo_seq_new'); -- log_cnt can be higher if there is a checkpoint just at the right -- time, so just test for the expected range -SELECT last_value, log_cnt IN (31, 32) AS log_cnt_ok, is_called FROM foo_seq_new; +SELECT last_value, log_cnt IN (0, 31, 32) AS log_cnt_ok, is_called FROM foo_seq_new; last_value | log_cnt_ok | is_called ------------+------------+----------- 2 | t | t diff --git a/src/test/regress/expected/spgist.out b/src/test/regress/expected/spgist.out index 2e911285600..c371e04a795 100644 --- a/src/test/regress/expected/spgist.out +++ b/src/test/regress/expected/spgist.out @@ -94,3 +94,6 @@ select box(point(i,j)) from generate_series(1,100,5) i, generate_series(1,10,5) j; -- leave this table around, to help in testing dump/restore +-- NEON: In Neon unlogged tables are wiped away on node restart +-- so drop the table to keep Neon tests clean. 
+drop table spgist_unlogged_tbl; diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index 001c6e7eb9d..10bf2101f00 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -118,7 +118,9 @@ select name, setting from pg_settings where name like 'enable%'; enable_hashjoin | on enable_incremental_sort | on enable_indexonlyscan | on + enable_indexonlyscan_prefetch | on enable_indexscan | on + enable_indexscan_prefetch | on enable_material | on enable_memoize | on enable_mergejoin | on @@ -130,9 +132,10 @@ select name, setting from pg_settings where name like 'enable%'; enable_partitionwise_join | off enable_presorted_aggregate | on enable_seqscan | on + enable_seqscan_prefetch | on enable_sort | on enable_tidscan | on -(21 rows) +(24 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. We can't test their contents in any great detail diff --git a/src/test/regress/expected/tablespace_1.out b/src/test/regress/expected/tablespace_1.out new file mode 100644 index 00000000000..5b9cefa6057 --- /dev/null +++ b/src/test/regress/expected/tablespace_1.out @@ -0,0 +1,974 @@ +-- relative tablespace locations are not allowed +CREATE TABLESPACE regress_tblspace LOCATION 'relative'; -- fail +ERROR: tablespace location must be an absolute path +-- empty tablespace locations are not usually allowed +CREATE TABLESPACE regress_tblspace LOCATION ''; -- fail +ERROR: tablespace location must be an absolute path +-- as a special developer-only option to allow us to use tablespaces +-- with streaming replication on the same server, an empty location +-- can be allowed as a way to say that the tablespace should be created +-- as a directory in pg_tblspc, rather than being a symlink +SET allow_in_place_tablespaces = true; +-- create a tablespace using WITH clause +CREATE TABLESPACE regress_tblspacewith LOCATION '' WITH (some_nonexistent_parameter = true); -- fail +ERROR: unrecognized parameter "some_nonexistent_parameter" +CREATE TABLESPACE regress_tblspacewith LOCATION '' WITH (random_page_cost = 3.0); -- ok +-- check to see the parameter was used +SELECT spcoptions FROM pg_tablespace WHERE spcname = 'regress_tblspacewith'; + spcoptions +------------------------ + {random_page_cost=3.0} +(1 row) + +-- drop the tablespace so we can re-use the location +DROP TABLESPACE regress_tblspacewith; +-- This returns a relative path as of an effect of allow_in_place_tablespaces, +-- masking the tablespace OID used in the path name. 
+SELECT regexp_replace(pg_tablespace_location(oid), '(pg_tblspc)/(\d+)', '\1/NNN') + FROM pg_tablespace WHERE spcname = 'regress_tblspace'; + regexp_replace +---------------- + pg_tblspc/NNN +(1 row) + +-- try setting and resetting some properties for the new tablespace +ALTER TABLESPACE regress_tblspace SET (random_page_cost = 1.0, seq_page_cost = 1.1); +ALTER TABLESPACE regress_tblspace SET (some_nonexistent_parameter = true); -- fail +ERROR: unrecognized parameter "some_nonexistent_parameter" +ALTER TABLESPACE regress_tblspace RESET (random_page_cost = 2.0); -- fail +ERROR: RESET must not include values for parameters +ALTER TABLESPACE regress_tblspace RESET (random_page_cost, effective_io_concurrency); -- ok +-- REINDEX (TABLESPACE) +-- catalogs and system tablespaces +-- system catalog, fail +REINDEX (TABLESPACE regress_tblspace) TABLE pg_am; +ERROR: cannot move system relation "pg_am_name_index" +REINDEX (TABLESPACE regress_tblspace) TABLE CONCURRENTLY pg_am; +ERROR: cannot reindex system catalogs concurrently +-- shared catalog, fail +REINDEX (TABLESPACE regress_tblspace) TABLE pg_authid; +ERROR: cannot move system relation "pg_authid_rolname_index" +REINDEX (TABLESPACE regress_tblspace) TABLE CONCURRENTLY pg_authid; +ERROR: cannot reindex system catalogs concurrently +-- toast relations, fail +REINDEX (TABLESPACE regress_tblspace) INDEX pg_toast.pg_toast_1260_index; +ERROR: cannot move system relation "pg_toast_1260_index" +REINDEX (TABLESPACE regress_tblspace) INDEX CONCURRENTLY pg_toast.pg_toast_1260_index; +ERROR: cannot reindex system catalogs concurrently +REINDEX (TABLESPACE regress_tblspace) TABLE pg_toast.pg_toast_1260; +ERROR: cannot move system relation "pg_toast_1260_index" +REINDEX (TABLESPACE regress_tblspace) TABLE CONCURRENTLY pg_toast.pg_toast_1260; +ERROR: cannot reindex system catalogs concurrently +-- system catalog, fail +REINDEX (TABLESPACE pg_global) TABLE pg_authid; +ERROR: cannot move system relation "pg_authid_rolname_index" +REINDEX (TABLESPACE pg_global) TABLE CONCURRENTLY pg_authid; +ERROR: cannot reindex system catalogs concurrently +-- table with toast relation +CREATE TABLE regress_tblspace_test_tbl (num1 bigint, num2 double precision, t text); +INSERT INTO regress_tblspace_test_tbl (num1, num2, t) + SELECT round(random()*100), random(), 'text' + FROM generate_series(1, 10) s(i); +CREATE INDEX regress_tblspace_test_tbl_idx ON regress_tblspace_test_tbl (num1); +-- move to global tablespace, fail +REINDEX (TABLESPACE pg_global) INDEX regress_tblspace_test_tbl_idx; +ERROR: only shared relations can be placed in pg_global tablespace +REINDEX (TABLESPACE pg_global) INDEX CONCURRENTLY regress_tblspace_test_tbl_idx; +ERROR: cannot move non-shared relation to tablespace "pg_global" +-- check transactional behavior of REINDEX (TABLESPACE) +BEGIN; +REINDEX (TABLESPACE regress_tblspace) INDEX regress_tblspace_test_tbl_idx; +REINDEX (TABLESPACE regress_tblspace) TABLE regress_tblspace_test_tbl; +ROLLBACK; +-- no relation moved to the new tablespace +SELECT c.relname FROM pg_class c, pg_tablespace s + WHERE c.reltablespace = s.oid AND s.spcname = 'regress_tblspace'; + relname +--------- +(0 rows) + +-- check that all indexes are moved to a new tablespace with different +-- relfilenode. +-- Save first the existing relfilenode for the toast and main relations. 
+SELECT relfilenode as main_filenode FROM pg_class + WHERE relname = 'regress_tblspace_test_tbl_idx' \gset +SELECT relfilenode as toast_filenode FROM pg_class + WHERE oid = + (SELECT i.indexrelid + FROM pg_class c, + pg_index i + WHERE i.indrelid = c.reltoastrelid AND + c.relname = 'regress_tblspace_test_tbl') \gset +REINDEX (TABLESPACE regress_tblspace) TABLE regress_tblspace_test_tbl; +SELECT c.relname FROM pg_class c, pg_tablespace s + WHERE c.reltablespace = s.oid AND s.spcname = 'regress_tblspace' + ORDER BY c.relname; + relname +------------------------------- + regress_tblspace_test_tbl_idx +(1 row) + +ALTER TABLE regress_tblspace_test_tbl SET TABLESPACE regress_tblspace; +ALTER TABLE regress_tblspace_test_tbl SET TABLESPACE pg_default; +SELECT c.relname FROM pg_class c, pg_tablespace s + WHERE c.reltablespace = s.oid AND s.spcname = 'regress_tblspace' + ORDER BY c.relname; + relname +------------------------------- + regress_tblspace_test_tbl_idx +(1 row) + +-- Move back to the default tablespace. +ALTER INDEX regress_tblspace_test_tbl_idx SET TABLESPACE pg_default; +SELECT c.relname FROM pg_class c, pg_tablespace s + WHERE c.reltablespace = s.oid AND s.spcname = 'regress_tblspace' + ORDER BY c.relname; + relname +--------- +(0 rows) + +REINDEX (TABLESPACE regress_tblspace, CONCURRENTLY) TABLE regress_tblspace_test_tbl; +SELECT c.relname FROM pg_class c, pg_tablespace s + WHERE c.reltablespace = s.oid AND s.spcname = 'regress_tblspace' + ORDER BY c.relname; + relname +------------------------------- + regress_tblspace_test_tbl_idx +(1 row) + +SELECT relfilenode = :main_filenode AS main_same FROM pg_class + WHERE relname = 'regress_tblspace_test_tbl_idx'; + main_same +----------- + f +(1 row) + +SELECT relfilenode = :toast_filenode as toast_same FROM pg_class + WHERE oid = + (SELECT i.indexrelid + FROM pg_class c, + pg_index i + WHERE i.indrelid = c.reltoastrelid AND + c.relname = 'regress_tblspace_test_tbl'); + toast_same +------------ + f +(1 row) + +DROP TABLE regress_tblspace_test_tbl; +-- REINDEX (TABLESPACE) with partitions +-- Create a partition tree and check the set of relations reindexed +-- with their new tablespace. +CREATE TABLE tbspace_reindex_part (c1 int, c2 int) PARTITION BY RANGE (c1); +CREATE TABLE tbspace_reindex_part_0 PARTITION OF tbspace_reindex_part + FOR VALUES FROM (0) TO (10) PARTITION BY list (c2); +CREATE TABLE tbspace_reindex_part_0_1 PARTITION OF tbspace_reindex_part_0 + FOR VALUES IN (1); +CREATE TABLE tbspace_reindex_part_0_2 PARTITION OF tbspace_reindex_part_0 + FOR VALUES IN (2); +-- This partitioned table will have no partitions. +CREATE TABLE tbspace_reindex_part_10 PARTITION OF tbspace_reindex_part + FOR VALUES FROM (10) TO (20) PARTITION BY list (c2); +-- Create some partitioned indexes +CREATE INDEX tbspace_reindex_part_index ON ONLY tbspace_reindex_part (c1); +CREATE INDEX tbspace_reindex_part_index_0 ON ONLY tbspace_reindex_part_0 (c1); +ALTER INDEX tbspace_reindex_part_index ATTACH PARTITION tbspace_reindex_part_index_0; +-- This partitioned index will have no partitions. 
+CREATE INDEX tbspace_reindex_part_index_10 ON ONLY tbspace_reindex_part_10 (c1); +ALTER INDEX tbspace_reindex_part_index ATTACH PARTITION tbspace_reindex_part_index_10; +CREATE INDEX tbspace_reindex_part_index_0_1 ON ONLY tbspace_reindex_part_0_1 (c1); +ALTER INDEX tbspace_reindex_part_index_0 ATTACH PARTITION tbspace_reindex_part_index_0_1; +CREATE INDEX tbspace_reindex_part_index_0_2 ON ONLY tbspace_reindex_part_0_2 (c1); +ALTER INDEX tbspace_reindex_part_index_0 ATTACH PARTITION tbspace_reindex_part_index_0_2; +SELECT relid, parentrelid, level FROM pg_partition_tree('tbspace_reindex_part_index') + ORDER BY relid, level; + relid | parentrelid | level +--------------------------------+------------------------------+------- + tbspace_reindex_part_index | | 0 + tbspace_reindex_part_index_0 | tbspace_reindex_part_index | 1 + tbspace_reindex_part_index_10 | tbspace_reindex_part_index | 1 + tbspace_reindex_part_index_0_1 | tbspace_reindex_part_index_0 | 2 + tbspace_reindex_part_index_0_2 | tbspace_reindex_part_index_0 | 2 +(5 rows) + +-- Track the original tablespace, relfilenode and OID of each index +-- in the tree. +CREATE TEMP TABLE reindex_temp_before AS + SELECT oid, relname, relfilenode, reltablespace + FROM pg_class + WHERE relname ~ 'tbspace_reindex_part_index'; +REINDEX (TABLESPACE regress_tblspace, CONCURRENTLY) TABLE tbspace_reindex_part; +-- REINDEX CONCURRENTLY changes the OID of the old relation, hence a check +-- based on the relation name below. +SELECT b.relname, + CASE WHEN a.relfilenode = b.relfilenode THEN 'relfilenode is unchanged' + ELSE 'relfilenode has changed' END AS filenode, + CASE WHEN a.reltablespace = b.reltablespace THEN 'reltablespace is unchanged' + ELSE 'reltablespace has changed' END AS tbspace + FROM reindex_temp_before b JOIN pg_class a ON b.relname = a.relname + ORDER BY 1; + relname | filenode | tbspace +--------------------------------+--------------------------+---------------------------- + tbspace_reindex_part_index | relfilenode is unchanged | reltablespace is unchanged + tbspace_reindex_part_index_0 | relfilenode is unchanged | reltablespace is unchanged + tbspace_reindex_part_index_0_1 | relfilenode has changed | reltablespace has changed + tbspace_reindex_part_index_0_2 | relfilenode has changed | reltablespace has changed + tbspace_reindex_part_index_10 | relfilenode is unchanged | reltablespace is unchanged +(5 rows) + +DROP TABLE tbspace_reindex_part; +-- create a schema we can use +CREATE SCHEMA testschema; +-- try a table +CREATE TABLE testschema.foo (i int) TABLESPACE regress_tblspace; +SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c + where c.reltablespace = t.oid AND c.relname = 'foo'; + relname | spcname +---------+------------------ + foo | regress_tblspace +(1 row) + +INSERT INTO testschema.foo VALUES(1); +INSERT INTO testschema.foo VALUES(2); +-- tables from dynamic sources +CREATE TABLE testschema.asselect TABLESPACE regress_tblspace AS SELECT 1; +SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c + where c.reltablespace = t.oid AND c.relname = 'asselect'; + relname | spcname +----------+------------------ + asselect | regress_tblspace +(1 row) + +PREPARE selectsource(int) AS SELECT $1; +CREATE TABLE testschema.asexecute TABLESPACE regress_tblspace + AS EXECUTE selectsource(2); +SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c + where c.reltablespace = t.oid AND c.relname = 'asexecute'; + relname | spcname +-----------+------------------ + 
asexecute | regress_tblspace +(1 row) + +-- index +CREATE INDEX foo_idx on testschema.foo(i) TABLESPACE regress_tblspace; +SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c + where c.reltablespace = t.oid AND c.relname = 'foo_idx'; + relname | spcname +---------+------------------ + foo_idx | regress_tblspace +(1 row) + +-- check \d output +\d testschema.foo + Table "testschema.foo" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + i | integer | | | +Indexes: + "foo_idx" btree (i), tablespace "regress_tblspace" +Tablespace: "regress_tblspace" + +\d testschema.foo_idx + Index "testschema.foo_idx" + Column | Type | Key? | Definition +--------+---------+------+------------ + i | integer | yes | i +btree, for table "testschema.foo" +Tablespace: "regress_tblspace" + +-- +-- partitioned table +-- +CREATE TABLE testschema.part (a int) PARTITION BY LIST (a); +SET default_tablespace TO pg_global; +CREATE TABLE testschema.part_1 PARTITION OF testschema.part FOR VALUES IN (1); +ERROR: only shared relations can be placed in pg_global tablespace +RESET default_tablespace; +CREATE TABLE testschema.part_1 PARTITION OF testschema.part FOR VALUES IN (1); +SET default_tablespace TO regress_tblspace; +CREATE TABLE testschema.part_2 PARTITION OF testschema.part FOR VALUES IN (2); +SET default_tablespace TO pg_global; +CREATE TABLE testschema.part_3 PARTITION OF testschema.part FOR VALUES IN (3); +ERROR: only shared relations can be placed in pg_global tablespace +ALTER TABLE testschema.part SET TABLESPACE regress_tblspace; +CREATE TABLE testschema.part_3 PARTITION OF testschema.part FOR VALUES IN (3); +CREATE TABLE testschema.part_4 PARTITION OF testschema.part FOR VALUES IN (4) + TABLESPACE pg_default; +CREATE TABLE testschema.part_56 PARTITION OF testschema.part FOR VALUES IN (5, 6) + PARTITION BY LIST (a); +ALTER TABLE testschema.part SET TABLESPACE pg_default; +CREATE TABLE testschema.part_78 PARTITION OF testschema.part FOR VALUES IN (7, 8) + PARTITION BY LIST (a); +ERROR: only shared relations can be placed in pg_global tablespace +CREATE TABLE testschema.part_910 PARTITION OF testschema.part FOR VALUES IN (9, 10) + PARTITION BY LIST (a) TABLESPACE regress_tblspace; +RESET default_tablespace; +CREATE TABLE testschema.part_78 PARTITION OF testschema.part FOR VALUES IN (7, 8) + PARTITION BY LIST (a); +SELECT relname, spcname FROM pg_catalog.pg_class c + JOIN pg_catalog.pg_namespace n ON (c.relnamespace = n.oid) + LEFT JOIN pg_catalog.pg_tablespace t ON c.reltablespace = t.oid + where c.relname LIKE 'part%' AND n.nspname = 'testschema' order by relname; + relname | spcname +----------+------------------ + part | + part_1 | + part_2 | regress_tblspace + part_3 | regress_tblspace + part_4 | + part_56 | regress_tblspace + part_78 | + part_910 | regress_tblspace +(8 rows) + +RESET default_tablespace; +DROP TABLE testschema.part; +-- partitioned index +CREATE TABLE testschema.part (a int) PARTITION BY LIST (a); +CREATE TABLE testschema.part1 PARTITION OF testschema.part FOR VALUES IN (1); +CREATE INDEX part_a_idx ON testschema.part (a) TABLESPACE regress_tblspace; +CREATE TABLE testschema.part2 PARTITION OF testschema.part FOR VALUES IN (2); +SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c + where c.reltablespace = t.oid AND c.relname LIKE 'part%_idx' ORDER BY relname; + relname | spcname +-------------+------------------ + part1_a_idx | regress_tblspace + part2_a_idx | regress_tblspace + part_a_idx 
| regress_tblspace +(3 rows) + +\d testschema.part + Partitioned table "testschema.part" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | +Partition key: LIST (a) +Indexes: + "part_a_idx" btree (a), tablespace "regress_tblspace" +Number of partitions: 2 (Use \d+ to list them.) + +\d+ testschema.part + Partitioned table "testschema.part" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | integer | | | | plain | | +Partition key: LIST (a) +Indexes: + "part_a_idx" btree (a), tablespace "regress_tblspace" +Partitions: testschema.part1 FOR VALUES IN (1), + testschema.part2 FOR VALUES IN (2) + +\d testschema.part1 + Table "testschema.part1" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | +Partition of: testschema.part FOR VALUES IN (1) +Indexes: + "part1_a_idx" btree (a), tablespace "regress_tblspace" + +\d+ testschema.part1 + Table "testschema.part1" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | integer | | | | plain | | +Partition of: testschema.part FOR VALUES IN (1) +Partition constraint: ((a IS NOT NULL) AND (a = 1)) +Indexes: + "part1_a_idx" btree (a), tablespace "regress_tblspace" + +\d testschema.part_a_idx +Partitioned index "testschema.part_a_idx" + Column | Type | Key? | Definition +--------+---------+------+------------ + a | integer | yes | a +btree, for table "testschema.part" +Number of partitions: 2 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +\d+ testschema.part_a_idx + Partitioned index "testschema.part_a_idx" + Column | Type | Key? | Definition | Storage | Stats target +--------+---------+------+------------+---------+-------------- + a | integer | yes | a | plain | +btree, for table "testschema.part" +Partitions: testschema.part1_a_idx, + testschema.part2_a_idx +Tablespace: "regress_tblspace" + +-- partitioned rels cannot specify the default tablespace. 
These fail: +CREATE TABLE testschema.dflt (a int PRIMARY KEY) PARTITION BY LIST (a) TABLESPACE pg_default; +ERROR: cannot specify default tablespace for partitioned relations +CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX TABLESPACE pg_default) PARTITION BY LIST (a); +ERROR: cannot specify default tablespace for partitioned relations +SET default_tablespace TO 'pg_default'; +CREATE TABLE testschema.dflt (a int PRIMARY KEY) PARTITION BY LIST (a) TABLESPACE regress_tblspace; +ERROR: cannot specify default tablespace for partitioned relations +CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX TABLESPACE regress_tblspace) PARTITION BY LIST (a); +ERROR: cannot specify default tablespace for partitioned relations +-- but these work: +CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX TABLESPACE regress_tblspace) PARTITION BY LIST (a) TABLESPACE regress_tblspace; +SET default_tablespace TO ''; +CREATE TABLE testschema.dflt2 (a int PRIMARY KEY) PARTITION BY LIST (a); +DROP TABLE testschema.dflt, testschema.dflt2; +-- check that default_tablespace doesn't affect ALTER TABLE index rebuilds +CREATE TABLE testschema.test_default_tab(id bigint) TABLESPACE regress_tblspace; +INSERT INTO testschema.test_default_tab VALUES (1); +CREATE INDEX test_index1 on testschema.test_default_tab (id); +CREATE INDEX test_index2 on testschema.test_default_tab (id) TABLESPACE regress_tblspace; +ALTER TABLE testschema.test_default_tab ADD CONSTRAINT test_index3 PRIMARY KEY (id); +ALTER TABLE testschema.test_default_tab ADD CONSTRAINT test_index4 UNIQUE (id) USING INDEX TABLESPACE regress_tblspace; +\d testschema.test_index1 + Index "testschema.test_index1" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +btree, for table "testschema.test_default_tab" + +\d testschema.test_index2 + Index "testschema.test_index2" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_index3 + Index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab" + +\d testschema.test_index4 + Index "testschema.test_index4" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +-- use a custom tablespace for default_tablespace +SET default_tablespace TO regress_tblspace; +-- tablespace should not change if no rewrite +ALTER TABLE testschema.test_default_tab ALTER id TYPE bigint; +\d testschema.test_index1 + Index "testschema.test_index1" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +btree, for table "testschema.test_default_tab" + +\d testschema.test_index2 + Index "testschema.test_index2" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_index3 + Index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab" + +\d testschema.test_index4 + Index "testschema.test_index4" + Column | Type | Key? 
| Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +SELECT * FROM testschema.test_default_tab; + id +---- + 1 +(1 row) + +-- tablespace should not change even if there is an index rewrite +ALTER TABLE testschema.test_default_tab ALTER id TYPE int; +\d testschema.test_index1 + Index "testschema.test_index1" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +btree, for table "testschema.test_default_tab" + +\d testschema.test_index2 + Index "testschema.test_index2" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_index3 + Index "testschema.test_index3" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +primary key, btree, for table "testschema.test_default_tab" + +\d testschema.test_index4 + Index "testschema.test_index4" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +unique, btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +SELECT * FROM testschema.test_default_tab; + id +---- + 1 +(1 row) + +-- now use the default tablespace for default_tablespace +SET default_tablespace TO ''; +-- tablespace should not change if no rewrite +ALTER TABLE testschema.test_default_tab ALTER id TYPE int; +\d testschema.test_index1 + Index "testschema.test_index1" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +btree, for table "testschema.test_default_tab" + +\d testschema.test_index2 + Index "testschema.test_index2" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_index3 + Index "testschema.test_index3" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +primary key, btree, for table "testschema.test_default_tab" + +\d testschema.test_index4 + Index "testschema.test_index4" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +unique, btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +-- tablespace should not change even if there is an index rewrite +ALTER TABLE testschema.test_default_tab ALTER id TYPE bigint; +\d testschema.test_index1 + Index "testschema.test_index1" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +btree, for table "testschema.test_default_tab" + +\d testschema.test_index2 + Index "testschema.test_index2" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_index3 + Index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab" + +\d testschema.test_index4 + Index "testschema.test_index4" + Column | Type | Key? 
| Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +DROP TABLE testschema.test_default_tab; +-- check that default_tablespace doesn't affect ALTER TABLE index rebuilds +-- (this time with a partitioned table) +CREATE TABLE testschema.test_default_tab_p(id bigint, val bigint) + PARTITION BY LIST (id) TABLESPACE regress_tblspace; +CREATE TABLE testschema.test_default_tab_p1 PARTITION OF testschema.test_default_tab_p + FOR VALUES IN (1); +INSERT INTO testschema.test_default_tab_p VALUES (1); +CREATE INDEX test_index1 on testschema.test_default_tab_p (val); +CREATE INDEX test_index2 on testschema.test_default_tab_p (val) TABLESPACE regress_tblspace; +ALTER TABLE testschema.test_default_tab_p ADD CONSTRAINT test_index3 PRIMARY KEY (id); +ALTER TABLE testschema.test_default_tab_p ADD CONSTRAINT test_index4 UNIQUE (id) USING INDEX TABLESPACE regress_tblspace; +\d testschema.test_index1 +Partitioned index "testschema.test_index1" + Column | Type | Key? | Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index2 +Partitioned index "testschema.test_index2" + Column | Type | Key? | Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +\d testschema.test_index3 +Partitioned index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index4 +Partitioned index "testschema.test_index4" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +-- use a custom tablespace for default_tablespace +SET default_tablespace TO regress_tblspace; +-- tablespace should not change if no rewrite +ALTER TABLE testschema.test_default_tab_p ALTER val TYPE bigint; +\d testschema.test_index1 +Partitioned index "testschema.test_index1" + Column | Type | Key? | Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index2 +Partitioned index "testschema.test_index2" + Column | Type | Key? | Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +\d testschema.test_index3 +Partitioned index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index4 +Partitioned index "testschema.test_index4" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) 
+Tablespace: "regress_tblspace" + +SELECT * FROM testschema.test_default_tab_p; + id | val +----+----- + 1 | +(1 row) + +-- tablespace should not change even if there is an index rewrite +ALTER TABLE testschema.test_default_tab_p ALTER val TYPE int; +\d testschema.test_index1 +Partitioned index "testschema.test_index1" + Column | Type | Key? | Definition +--------+---------+------+------------ + val | integer | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index2 +Partitioned index "testschema.test_index2" + Column | Type | Key? | Definition +--------+---------+------+------------ + val | integer | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +\d testschema.test_index3 +Partitioned index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index4 +Partitioned index "testschema.test_index4" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +SELECT * FROM testschema.test_default_tab_p; + id | val +----+----- + 1 | +(1 row) + +-- now use the default tablespace for default_tablespace +SET default_tablespace TO ''; +-- tablespace should not change if no rewrite +ALTER TABLE testschema.test_default_tab_p ALTER val TYPE int; +\d testschema.test_index1 +Partitioned index "testschema.test_index1" + Column | Type | Key? | Definition +--------+---------+------+------------ + val | integer | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index2 +Partitioned index "testschema.test_index2" + Column | Type | Key? | Definition +--------+---------+------+------------ + val | integer | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +\d testschema.test_index3 +Partitioned index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index4 +Partitioned index "testschema.test_index4" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +-- tablespace should not change even if there is an index rewrite +ALTER TABLE testschema.test_default_tab_p ALTER val TYPE bigint; +\d testschema.test_index1 +Partitioned index "testschema.test_index1" + Column | Type | Key? | Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index2 +Partitioned index "testschema.test_index2" + Column | Type | Key? 
| Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +\d testschema.test_index3 +Partitioned index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index4 +Partitioned index "testschema.test_index4" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +DROP TABLE testschema.test_default_tab_p; +-- check that default_tablespace affects index additions in ALTER TABLE +CREATE TABLE testschema.test_tab(id int) TABLESPACE regress_tblspace; +INSERT INTO testschema.test_tab VALUES (1); +SET default_tablespace TO regress_tblspace; +ALTER TABLE testschema.test_tab ADD CONSTRAINT test_tab_unique UNIQUE (id); +SET default_tablespace TO ''; +ALTER TABLE testschema.test_tab ADD CONSTRAINT test_tab_pkey PRIMARY KEY (id); +\d testschema.test_tab_unique + Index "testschema.test_tab_unique" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +unique, btree, for table "testschema.test_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_tab_pkey + Index "testschema.test_tab_pkey" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +primary key, btree, for table "testschema.test_tab" + +SELECT * FROM testschema.test_tab; + id +---- + 1 +(1 row) + +DROP TABLE testschema.test_tab; +-- check that default_tablespace is handled correctly by multi-command +-- ALTER TABLE that includes a tablespace-preserving rewrite +CREATE TABLE testschema.test_tab(a int, b int, c int); +SET default_tablespace TO regress_tblspace; +ALTER TABLE testschema.test_tab ADD CONSTRAINT test_tab_unique UNIQUE (a); +CREATE INDEX test_tab_a_idx ON testschema.test_tab (a); +SET default_tablespace TO ''; +CREATE INDEX test_tab_b_idx ON testschema.test_tab (b); +\d testschema.test_tab_unique + Index "testschema.test_tab_unique" + Column | Type | Key? | Definition +--------+---------+------+------------ + a | integer | yes | a +unique, btree, for table "testschema.test_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_tab_a_idx + Index "testschema.test_tab_a_idx" + Column | Type | Key? | Definition +--------+---------+------+------------ + a | integer | yes | a +btree, for table "testschema.test_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_tab_b_idx + Index "testschema.test_tab_b_idx" + Column | Type | Key? | Definition +--------+---------+------+------------ + b | integer | yes | b +btree, for table "testschema.test_tab" + +ALTER TABLE testschema.test_tab ALTER b TYPE bigint, ADD UNIQUE (c); +\d testschema.test_tab_unique + Index "testschema.test_tab_unique" + Column | Type | Key? | Definition +--------+---------+------+------------ + a | integer | yes | a +unique, btree, for table "testschema.test_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_tab_a_idx + Index "testschema.test_tab_a_idx" + Column | Type | Key? 
| Definition +--------+---------+------+------------ + a | integer | yes | a +btree, for table "testschema.test_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_tab_b_idx + Index "testschema.test_tab_b_idx" + Column | Type | Key? | Definition +--------+--------+------+------------ + b | bigint | yes | b +btree, for table "testschema.test_tab" + +DROP TABLE testschema.test_tab; +-- let's try moving a table from one place to another +CREATE TABLE testschema.atable AS VALUES (1), (2); +CREATE UNIQUE INDEX anindex ON testschema.atable(column1); +ALTER TABLE testschema.atable SET TABLESPACE regress_tblspace; +ALTER INDEX testschema.anindex SET TABLESPACE regress_tblspace; +ALTER INDEX testschema.part_a_idx SET TABLESPACE pg_global; +ERROR: only shared relations can be placed in pg_global tablespace +ALTER INDEX testschema.part_a_idx SET TABLESPACE pg_default; +ALTER INDEX testschema.part_a_idx SET TABLESPACE regress_tblspace; +INSERT INTO testschema.atable VALUES(3); -- ok +INSERT INTO testschema.atable VALUES(1); -- fail (checks index) +ERROR: duplicate key value violates unique constraint "anindex" +DETAIL: Key (column1)=(1) already exists. +SELECT COUNT(*) FROM testschema.atable; -- checks heap + count +------- + 3 +(1 row) + +-- let's try moving a materialized view from one place to another +CREATE MATERIALIZED VIEW testschema.amv AS SELECT * FROM testschema.atable; +ALTER MATERIALIZED VIEW testschema.amv SET TABLESPACE regress_tblspace; +REFRESH MATERIALIZED VIEW testschema.amv; +SELECT COUNT(*) FROM testschema.amv; + count +------- + 3 +(1 row) + +-- Will fail with bad path +CREATE TABLESPACE regress_badspace LOCATION '/no/such/location'; +ERROR: directory "/no/such/location" does not exist +-- No such tablespace +CREATE TABLE bar (i int) TABLESPACE regress_nosuchspace; +ERROR: tablespace "regress_nosuchspace" does not exist +-- Fail, in use for some partitioned object +DROP TABLESPACE regress_tblspace; +ERROR: tablespace "regress_tblspace" cannot be dropped because some objects depend on it +DETAIL: tablespace for index testschema.part_a_idx +ALTER INDEX testschema.part_a_idx SET TABLESPACE pg_default; +-- Fail, not empty +DROP TABLESPACE regress_tblspace; +CREATE ROLE regress_tablespace_user1 login; +CREATE ROLE regress_tablespace_user2 login; +GRANT USAGE ON SCHEMA testschema TO regress_tablespace_user2; +ALTER TABLESPACE regress_tblspace OWNER TO regress_tablespace_user1; +ERROR: tablespace "regress_tblspace" does not exist +CREATE TABLE testschema.tablespace_acl (c int); +-- new owner lacks permission to create this index from scratch +CREATE INDEX k ON testschema.tablespace_acl (c) TABLESPACE regress_tblspace; +ERROR: tablespace "regress_tblspace" does not exist +ALTER TABLE testschema.tablespace_acl OWNER TO regress_tablespace_user2; +SET SESSION ROLE regress_tablespace_user2; +CREATE TABLE tablespace_table (i int) TABLESPACE regress_tblspace; -- fail +ERROR: tablespace "regress_tblspace" does not exist +ALTER TABLE testschema.tablespace_acl ALTER c TYPE bigint; +REINDEX (TABLESPACE regress_tblspace) TABLE tablespace_table; -- fail +ERROR: tablespace "regress_tblspace" does not exist +REINDEX (TABLESPACE regress_tblspace, CONCURRENTLY) TABLE tablespace_table; -- fail +ERROR: tablespace "regress_tblspace" does not exist +RESET ROLE; +ALTER TABLESPACE regress_tblspace RENAME TO regress_tblspace_renamed; +ERROR: tablespace "regress_tblspace" does not exist +ALTER TABLE ALL IN TABLESPACE regress_tblspace_renamed SET TABLESPACE pg_default; +ERROR: tablespace 
"regress_tblspace_renamed" does not exist +ALTER INDEX ALL IN TABLESPACE regress_tblspace_renamed SET TABLESPACE pg_default; +ERROR: tablespace "regress_tblspace_renamed" does not exist +ALTER MATERIALIZED VIEW ALL IN TABLESPACE regress_tblspace_renamed SET TABLESPACE pg_default; +ERROR: tablespace "regress_tblspace_renamed" does not exist +-- Should show notice that nothing was done +ALTER TABLE ALL IN TABLESPACE regress_tblspace_renamed SET TABLESPACE pg_default; +ERROR: tablespace "regress_tblspace_renamed" does not exist +ALTER MATERIALIZED VIEW ALL IN TABLESPACE regress_tblspace_renamed SET TABLESPACE pg_default; +ERROR: tablespace "regress_tblspace_renamed" does not exist +-- Should succeed +DROP TABLESPACE regress_tblspace_renamed; +ERROR: tablespace "regress_tblspace_renamed" does not exist +DROP SCHEMA testschema CASCADE; +NOTICE: drop cascades to 7 other objects +DETAIL: drop cascades to table testschema.foo +drop cascades to table testschema.asselect +drop cascades to table testschema.asexecute +drop cascades to table testschema.part +drop cascades to table testschema.atable +drop cascades to materialized view testschema.amv +drop cascades to table testschema.tablespace_acl +DROP ROLE regress_tablespace_user1; +DROP ROLE regress_tablespace_user2; diff --git a/src/test/regress/sql/sequence.sql b/src/test/regress/sql/sequence.sql index 674f5f1f668..793f1415f6b 100644 --- a/src/test/regress/sql/sequence.sql +++ b/src/test/regress/sql/sequence.sql @@ -137,7 +137,7 @@ SELECT nextval('foo_seq_new'); SELECT nextval('foo_seq_new'); -- log_cnt can be higher if there is a checkpoint just at the right -- time, so just test for the expected range -SELECT last_value, log_cnt IN (31, 32) AS log_cnt_ok, is_called FROM foo_seq_new; +SELECT last_value, log_cnt IN (0, 31, 32) AS log_cnt_ok, is_called FROM foo_seq_new; DROP SEQUENCE foo_seq_new; -- renaming serial sequences diff --git a/src/test/regress/sql/spgist.sql b/src/test/regress/sql/spgist.sql index 4828ede68c3..9d6394516a2 100644 --- a/src/test/regress/sql/spgist.sql +++ b/src/test/regress/sql/spgist.sql @@ -89,3 +89,6 @@ select box(point(i,j)) from generate_series(1,100,5) i, generate_series(1,10,5) j; -- leave this table around, to help in testing dump/restore +-- NEON: In Neon unlogged tables are wiped away on node restart +-- so drop the table to keep Neon tests clean. +drop table spgist_unlogged_tbl; From ed56101d5be891786229ecd27be6b6bc620cd9ec Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 12 Sep 2023 13:25:53 +0300 Subject: [PATCH 02/13] Fix WAL-logging XLOG_NEON_HEAP_LOCK records. --- src/backend/access/heap/heapam.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 813c199f300..fa3e5e96cb1 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -4937,7 +4937,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, tuple->t_data->t_infomask2); xlrec.flags = cleared_all_frozen ? 
XLH_LOCK_ALL_FROZEN_CLEARED : 0; xlrec.t_cid = HeapTupleHeaderGetRawCommandId(tuple->t_data); - XLogRegisterData((char *) &xlrec, SizeOfHeapLock); + XLogRegisterData((char *) &xlrec, SizeOfNeonHeapLock); /* we don't decode row locks atm, so no need to log the origin */ From bc5f0014323a59b1a88d62212bd3377881ce5d34 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 25 Sep 2023 22:47:28 +0300 Subject: [PATCH 03/13] Update last written LSN after wal-logging all createdb stuff (#308) Co-authored-by: Konstantin Knizhnik --- src/backend/commands/dbcommands.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 89f9a7a6250..4e9bfcfcb0c 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -487,8 +487,6 @@ CreateDirAndVersionFile(char *dbpath, Oid dbid, Oid tsid, bool isRedo) lsn = XLogInsert(RM_DBASE_ID, XLOG_DBASE_CREATE_WAL_LOG); - SetLastWrittenLSNForDatabase(lsn); - /* As always, WAL must hit the disk before the data update does. */ XLogFlush(lsn); } @@ -616,7 +614,6 @@ CreateDatabaseUsingFileCopy(Oid src_dboid, Oid dst_dboid, Oid src_tsid, /* Record the filesystem change in XLOG */ { xl_dbase_create_file_copy_rec xlrec; - XLogRecPtr lsn; xlrec.db_id = dst_dboid; xlrec.tablespace_id = dsttablespace; @@ -627,10 +624,8 @@ CreateDatabaseUsingFileCopy(Oid src_dboid, Oid dst_dboid, Oid src_tsid, XLogRegisterData((char *) &xlrec, sizeof(xl_dbase_create_file_copy_rec)); - lsn = XLogInsert(RM_DBASE_ID, - XLOG_DBASE_CREATE_FILE_COPY | XLR_SPECIAL_REL_UPDATE); - - SetLastWrittenLSNForDatabase(lsn); + (void) XLogInsert(RM_DBASE_ID, + XLOG_DBASE_CREATE_FILE_COPY | XLR_SPECIAL_REL_UPDATE); } pfree(srcpath); pfree(dstpath); @@ -1466,6 +1461,11 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) CreateDatabaseUsingFileCopy(src_dboid, dboid, src_deftablespace, dst_deftablespace); + /* + * Update global last written LSN after wal-logging create database command + */ + SetLastWrittenLSNForDatabase(XactLastRecEnd); + /* * Close pg_database, but keep lock till commit. */ From 17bc6506eae35d5dea199654ad4e0ca597324e9d Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 18 Oct 2023 15:31:58 +0300 Subject: [PATCH 04/13] Neon logical replication support for PG16 (#310) * Neon logical replication support for PG16 * Log heap rewrite file after creation.
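The changes that follow all rely on the same convention for making replication-related files (rewrite mapping files, replication origins, snapshot builds, slot state) survive a compute restart: the file is mirrored into the WAL as a non-transactional logical message whose prefix encodes the file path, and an empty payload means "delete this file". The condensed sketch below is illustrative only (neon_wallog_file is a made-up helper name), but LogLogicalMessage is the stock PostgreSQL call with its PG16 signature.

/*
 * Sketch: persist a small state file through the WAL as a logical message,
 * following the "neon-file:<path>" prefix convention used by this patch set.
 * A NULL/zero-length payload is interpreted as "delete the file".
 */
#include "postgres.h"

#include "replication/message.h"

static XLogRecPtr
neon_wallog_file(const char *path, const char *data, size_t size)
{
	char		prefix[MAXPGPATH];

	/* the prefix carries the file name; the payload carries its contents */
	snprintf(prefix, sizeof(prefix), "neon-file:%s", path);

	/* transactional = false: the message is emitted immediately */
	return LogLogicalMessage(prefix, data, size, false);
}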
--------- Co-authored-by: Konstantin Knizhnik Co-authored-by: Arseny Sher --- src/backend/access/heap/rewriteheap.c | 39 ++++++++++++++++++++- src/backend/access/transam/xlog.c | 35 +++++++++++++++--- src/backend/replication/logical/origin.c | 19 ++++++++++ src/backend/replication/logical/snapbuild.c | 11 ++++++ src/backend/replication/slot.c | 19 ++++++++++ 5 files changed, 117 insertions(+), 6 deletions(-) diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c index 424958912c7..d214f30bad1 100644 --- a/src/backend/access/heap/rewriteheap.c +++ b/src/backend/access/heap/rewriteheap.c @@ -117,6 +117,7 @@ #include "miscadmin.h" #include "pgstat.h" #include "replication/logical.h" +#include "replication/message.h" #include "replication/slot.h" #include "storage/bufmgr.h" #include "storage/fd.h" @@ -784,6 +785,36 @@ raw_heap_insert(RewriteState state, HeapTuple tup) * ------------------------------------------------------------------------ */ +/* + * NEON: we need to persist mapping file in WAL + */ +static void +wallog_mapping_file(char const* path, int fd) +{ + char prefix[MAXPGPATH]; + snprintf(prefix, sizeof(prefix), "neon-file:%s", path); + if (fd < 0) + { + elog(DEBUG1, "neon: deleting contents of rewrite file %s", path); + /* unlink file */ + LogLogicalMessage(prefix, NULL, 0, false); + } + else + { + off_t size = lseek(fd, 0, SEEK_END); + char* buf; + elog(DEBUG1, "neon: writing contents of rewrite file %s, size %ld", path, size); + if (size < 0) + elog(ERROR, "Failed to get size of mapping file: %m"); + buf = palloc((size_t)size); + lseek(fd, 0, SEEK_SET); + if (read(fd, buf, (size_t)size) != size) + elog(ERROR, "Failed to read mapping file: %m"); + LogLogicalMessage(prefix, buf, (size_t)size, false); + pfree(buf); + } +} + /* * Do preparations for logging logical mappings during a rewrite if * necessary. 
If we detect that we don't need to log anything we'll prevent @@ -919,6 +950,7 @@ logical_heap_rewrite_flush_mappings(RewriteState state) errmsg("could not write to file \"%s\", wrote %d of %d: %m", src->path, written, len))); src->off += len; + wallog_mapping_file(src->path, FileGetRawDesc(src->vfd)); XLogBeginInsert(); XLogRegisterData((char *) (&xlrec), sizeof(xlrec)); @@ -1004,7 +1036,7 @@ logical_rewrite_log_mapping(RewriteState state, TransactionId xid, src->off = 0; memcpy(src->path, path, sizeof(path)); src->vfd = PathNameOpenFile(path, - O_CREAT | O_EXCL | O_WRONLY | PG_BINARY); + O_CREAT | O_EXCL | O_RDWR | PG_BINARY); if (src->vfd < 0) ereport(ERROR, (errcode_for_file_access(), @@ -1169,6 +1201,8 @@ heap_xlog_logical_rewrite(XLogReaderState *r) errmsg("could not fsync file \"%s\": %m", path))); pgstat_report_wait_end(); + wallog_mapping_file(path, fd); + if (CloseTransientFile(fd) != 0) ereport(ERROR, (errcode_for_file_access(), @@ -1246,6 +1280,7 @@ CheckPointLogicalRewriteHeap(void) ereport(ERROR, (errcode_for_file_access(), errmsg("could not remove file \"%s\": %m", path))); + wallog_mapping_file(path, -1); } else { @@ -1274,6 +1309,8 @@ CheckPointLogicalRewriteHeap(void) errmsg("could not fsync file \"%s\": %m", path))); pgstat_report_wait_end(); + wallog_mapping_file(path, fd); + if (CloseTransientFile(fd) != 0) ereport(ERROR, (errcode_for_file_access(), diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index f25fe14118d..08de7356e6a 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -699,6 +699,7 @@ static void CreateEndOfRecoveryRecord(void); static XLogRecPtr CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn, XLogRecPtr pagePtr, TimeLineID newTLI); +static void PreCheckPointGuts(int flags); static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags); static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo); static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void); @@ -6845,6 +6846,11 @@ CreateCheckPoint(int flags) */ SyncPreCheckpoint(); + /* + * NEON: perform checkpiont action requiring write to the WAL before we determine the REDO pointer. + */ + PreCheckPointGuts(flags); + /* * Use a critical section to force system panic if we have trouble. */ @@ -7354,6 +7360,28 @@ CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn, XLogRecPtr pagePtr, return recptr; } +static void +CheckPointReplicationState(void) +{ + CheckPointRelationMap(); + CheckPointReplicationSlots(); + CheckPointSnapBuild(); + CheckPointLogicalRewriteHeap(); + CheckPointReplicationOrigin(); +} + +/* + * NEON: we use logical records to persist information of about slots, origins, relation map... + * If it is done inside shutdown checkpoint, then Postgres panics: "concurrent write-ahead log activity while database system is shutting down" + * So it before checkpoint REDO position is determined. 
+ */ +static void +PreCheckPointGuts(int flags) +{ + if (flags & CHECKPOINT_IS_SHUTDOWN) + CheckPointReplicationState(); +} + /* * Flush all data in shared memory to disk, and fsync * @@ -7363,11 +7391,8 @@ CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn, XLogRecPtr pagePtr, static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags) { - CheckPointRelationMap(); - CheckPointReplicationSlots(); - CheckPointSnapBuild(); - CheckPointLogicalRewriteHeap(); - CheckPointReplicationOrigin(); + if (!(flags & CHECKPOINT_IS_SHUTDOWN)) + CheckPointReplicationState(); /* Write out all dirty data in SLRUs and the main buffer pool */ TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags); diff --git a/src/backend/replication/logical/origin.c b/src/backend/replication/logical/origin.c index b0255ffd25a..c24f93b08c2 100644 --- a/src/backend/replication/logical/origin.c +++ b/src/backend/replication/logical/origin.c @@ -83,6 +83,7 @@ #include "nodes/execnodes.h" #include "pgstat.h" #include "replication/logical.h" +#include "replication/message.h" #include "replication/origin.h" #include "storage/condition_variable.h" #include "storage/copydir.h" @@ -578,10 +579,14 @@ CheckPointReplicationOrigin(void) int i; uint32 magic = REPLICATION_STATE_MAGIC; pg_crc32c crc; + char *buf; + size_t chkp_size; if (max_replication_slots == 0) return; + buf = palloc(sizeof(magic) + max_replication_slots*sizeof(ReplicationStateOnDisk) + sizeof(crc)); + INIT_CRC32C(crc); /* make sure no old temp file is remaining */ @@ -615,6 +620,9 @@ CheckPointReplicationOrigin(void) errmsg("could not write to file \"%s\": %m", tmppath))); } + memcpy(buf, &magic, sizeof magic); + chkp_size = sizeof(magic); + COMP_CRC32C(crc, &magic, sizeof(magic)); /* prevent concurrent creations/drops */ @@ -657,6 +665,8 @@ CheckPointReplicationOrigin(void) errmsg("could not write to file \"%s\": %m", tmppath))); } + memcpy(buf + chkp_size, &disk_state, sizeof(disk_state)); + chkp_size += sizeof(disk_state); COMP_CRC32C(crc, &disk_state, sizeof(disk_state)); } @@ -676,6 +686,15 @@ CheckPointReplicationOrigin(void) errmsg("could not write to file \"%s\": %m", tmppath))); } + if (chkp_size != sizeof(magic)) /* has some valid origins */ + { + memcpy(buf + chkp_size, &crc, sizeof crc); + chkp_size += sizeof(crc); + + /* NEON specific: persist snapshot in storage using logical message */ + LogLogicalMessage("neon-file:pg_logical/replorigin_checkpoint", buf, chkp_size, false); + } + pfree(buf); if (CloseTransientFile(tmpfd) != 0) ereport(PANIC, diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c index 7a7aba33e16..488e24b2b75 100644 --- a/src/backend/replication/logical/snapbuild.c +++ b/src/backend/replication/logical/snapbuild.c @@ -127,6 +127,7 @@ #include "miscadmin.h" #include "pgstat.h" #include "replication/logical.h" +#include "replication/message.h" #include "replication/reorderbuffer.h" #include "replication/snapbuild.h" #include "storage/block.h" /* debugging output */ @@ -1609,6 +1610,7 @@ SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn) int fd; char tmppath[MAXPGPATH]; char path[MAXPGPATH]; + char prefix[MAXPGPATH]; int ret; struct stat stat_buf; Size sz; @@ -1752,6 +1754,10 @@ SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn) (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", tmppath))); + /* NEON specific: persist snapshot in storage using logical message */ + snprintf(prefix, sizeof(prefix), "neon-file:%s", path); + LogLogicalMessage(prefix, (char*)ondisk, 
needed_length, false); + errno = 0; pgstat_report_wait_start(WAIT_EVENT_SNAPBUILD_WRITE); if ((write(fd, ondisk, needed_length)) != needed_length) @@ -2054,6 +2060,7 @@ CheckPointSnapBuild(void) DIR *snap_dir; struct dirent *snap_de; char path[MAXPGPATH + 21]; + char prefix[MAXPGPATH + 31]; /* * We start off with a minimum of the last redo pointer. No new @@ -2113,6 +2120,10 @@ CheckPointSnapBuild(void) { elog(DEBUG1, "removing snapbuild snapshot %s", path); + /* NEON specific: delete file from storage using logical message */ + snprintf(prefix, sizeof(prefix), "neon-file:%s", path); + LogLogicalMessage(prefix, NULL, 0, false); + /* * It's not particularly harmful, though strange, if we can't * remove the file here. Don't prevent the checkpoint from diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c index bb09c4010f8..e3424957303 100644 --- a/src/backend/replication/slot.c +++ b/src/backend/replication/slot.c @@ -47,6 +47,7 @@ #include "miscadmin.h" #include "pgstat.h" #include "replication/slot.h" +#include "replication/message.h" #include "storage/fd.h" #include "storage/ipc.h" #include "storage/proc.h" @@ -685,6 +686,15 @@ ReplicationSlotDropPtr(ReplicationSlot *slot) sprintf(path, "pg_replslot/%s", NameStr(slot->data.name)); sprintf(tmppath, "pg_replslot/%s.tmp", NameStr(slot->data.name)); + if (SlotIsLogical(slot)) + { + /* NEON specific: delete slot from storage using logical message */ + char prefix[MAXPGPATH]; + snprintf(prefix, sizeof(prefix), "neon-file:%s/state", path); + elog(LOG, "Drop replication slot %s", path); + LogLogicalMessage(prefix, NULL, 0, false); + } + /* * Rename the slot directory on disk, so that we'll no longer recognize * this as a valid slot. Note that if this fails, we've got to mark the @@ -1795,6 +1805,15 @@ SaveSlotToPath(ReplicationSlot *slot, const char *dir, int elevel) ReplicationSlotOnDiskChecksummedSize); FIN_CRC32C(cp.checksum); + if (SlotIsLogical(slot) && cp.slotdata.restart_lsn != InvalidXLogRecPtr) + { + /* NEON specific: persist slot in storage using logical message */ + char prefix[MAXPGPATH]; + snprintf(prefix, sizeof(prefix), "neon-file:%s", path); + elog(LOG, "Save replication slot at %s restart_lsn=%X/%X", path, LSN_FORMAT_ARGS(cp.slotdata.restart_lsn)); + LogLogicalMessage(prefix, (char*)&cp, sizeof cp, false); + } + errno = 0; pgstat_report_wait_start(WAIT_EVENT_REPLICATION_SLOT_WRITE); if ((write(fd, &cp, sizeof(cp))) != sizeof(cp)) From 7f8a2cbf050b1c8073dda94308cea4ca598b0e99 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 19 Oct 2023 14:57:41 +0300 Subject: [PATCH 05/13] Fix elog format error in wallog_mapping_file (#317) Co-authored-by: Konstantin Knizhnik --- src/backend/access/heap/rewriteheap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c index d214f30bad1..15cd48850b0 100644 --- a/src/backend/access/heap/rewriteheap.c +++ b/src/backend/access/heap/rewriteheap.c @@ -803,7 +803,7 @@ wallog_mapping_file(char const* path, int fd) { off_t size = lseek(fd, 0, SEEK_END); char* buf; - elog(DEBUG1, "neon: writing contents of rewrite file %s, size %ld", path, size); + elog(DEBUG1, "neon: writing contents of rewrite file %s, size %ld", path, (long)size); if (size < 0) elog(ERROR, "Failed to get size of mapping file: %m"); buf = palloc((size_t)size); From 812c5a206a7acabb981b7415aad8c473284f97bc Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 3 Nov 2023 15:59:23 +0200 Subject: [PATCH 
06/13] Update WAL buffers when restoring WAL at compute needed for LR (#323) * Update WAL buffers when restoring WAL at compute needed for LR * Fix copying data in WAL buffers --------- Co-authored-by: Konstantin Knizhnik --- src/backend/access/transam/xlog.c | 26 ++++++++++++++++++++++ src/include/access/xlogrecovery.h | 2 ++ 2 files changed, 28 insertions(+) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 08de7356e6a..0a6226892c9 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -9366,3 +9366,29 @@ SetWalWriterSleeping(bool sleeping) XLogCtl->WalWriterSleeping = sleeping; SpinLockRelease(&XLogCtl->info_lck); } + +void +XLogUpdateWalBuffers(char* data, XLogRecPtr start, size_t len) +{ + XLogRecPtr end; + int idx; + XLogRecPtr pagebegptr; + + LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE); + + end = start + len; + idx = XLogRecPtrToBufIdx(end); + pagebegptr = XLogCtl->xlblocks[idx] - XLOG_BLCKSZ; + + if (pagebegptr + XLOG_BLCKSZ >= end && pagebegptr < end) + { + /* Last page of the segment is present in WAL buffers */ + char* page = &XLogCtl->pages[idx * XLOG_BLCKSZ]; + size_t overlap = end - pagebegptr; + if (overlap <= len) + memcpy(page, data + len - overlap, overlap); + else + memcpy(page + overlap - len, data, len); + } + LWLockRelease(WALBufMappingLock); +} diff --git a/src/include/access/xlogrecovery.h b/src/include/access/xlogrecovery.h index 23080f8e087..becbf4a3735 100644 --- a/src/include/access/xlogrecovery.h +++ b/src/include/access/xlogrecovery.h @@ -156,4 +156,6 @@ extern void RecoveryRequiresIntParameter(const char *param_name, int currValue, extern void xlog_outdesc(StringInfo buf, XLogReaderState *record); +extern void XLogUpdateWalBuffers(char* data, XLogRecPtr start, size_t len); + #endif /* XLOGRECOVERY_H */ From 863b71572bc441581efb3bbee2ad18af037be1bb Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 24 Nov 2023 09:39:18 +0200 Subject: [PATCH 07/13] Optimize storing zero FPI in WAL (#327) PG16 adds a new function to SMGR: zeroextend. Its implementation in Neon actually wal-logs the zero pages of the extended relation. This zero page is wal-logged using XLOG_FPI. Since the page is all zeros, the hole optimization (excluding from the image everything between pd_lower and pd_upper) doesn't work. This PR allows setting the hole size to BLCKSZ in case of a zero page (PageIsNew() returns true).
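In other words, an all-zero page can be encoded as a full-page image whose "hole" spans the entire block, so no payload bytes are stored at all, and redo rebuilds the page by zero-filling the hole. The standalone sketch below illustrates that encoding under the usual BLCKSZ/hole semantics; it is not the patch code (the real logic lives in XLogRecordAssemble and DecodeXLogRecord).

/*
 * Illustration: a full-page image with a "hole".  An all-zero page becomes
 * hole_offset = 0, hole_length = BLCKSZ (zero payload bytes); redo rebuilds
 * the page by zero-filling the hole.
 */
#include <stdbool.h>
#include <stdint.h>
#include <string.h>

#define BLCKSZ 8192				/* default PostgreSQL block size */

typedef struct
{
	uint16_t	hole_offset;	/* start of the omitted region */
	uint16_t	hole_length;	/* length of the omitted region */
} fpi_header;

/* Encode: return the number of payload bytes that must be stored in WAL. */
static size_t
fpi_encode(bool page_is_new, int pd_lower, int pd_upper, fpi_header *hdr)
{
	if (page_is_new)
	{
		/* the whole page is the hole -> nothing to store */
		hdr->hole_offset = 0;
		hdr->hole_length = BLCKSZ;
	}
	else if (pd_lower >= 0 && pd_upper > pd_lower && pd_upper <= BLCKSZ)
	{
		/* usual case: omit the free space between pd_lower and pd_upper */
		hdr->hole_offset = (uint16_t) pd_lower;
		hdr->hole_length = (uint16_t) (pd_upper - pd_lower);
	}
	else
	{
		hdr->hole_offset = 0;
		hdr->hole_length = 0;
	}
	return BLCKSZ - hdr->hole_length;
}

/* Decode: rebuild the page from the stored payload plus a zeroed hole. */
static void
fpi_decode(const fpi_header *hdr, const char *payload, char *page)
{
	if (hdr->hole_length == BLCKSZ)
	{
		memset(page, 0, BLCKSZ);	/* the all-zero case added by this patch */
		return;
	}
	memcpy(page, payload, hdr->hole_offset);
	memset(page + hdr->hole_offset, 0, hdr->hole_length);
	memcpy(page + hdr->hole_offset + hdr->hole_length,
		   payload + hdr->hole_offset,
		   BLCKSZ - (hdr->hole_offset + hdr->hole_length));
}

For a page extended with zeros this means the WAL record carries only the small block-image header instead of an 8 kB image.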
--------- Co-authored-by: Konstantin Knizhnik --- src/backend/access/transam/xloginsert.c | 37 +++++++++++++++++-------- src/backend/access/transam/xlogreader.c | 4 ++- 2 files changed, 28 insertions(+), 13 deletions(-) diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index a8901e1402f..1fc2159b193 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -656,22 +656,30 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, */ if (regbuf->flags & REGBUF_STANDARD) { - /* Assume we can omit data between pd_lower and pd_upper */ - uint16 lower = ((PageHeader) page)->pd_lower; - uint16 upper = ((PageHeader) page)->pd_upper; - - if (lower >= SizeOfPageHeaderData && - upper > lower && - upper <= BLCKSZ) + if (PageIsNew(page)) { - bimg.hole_offset = lower; - cbimg.hole_length = upper - lower; + bimg.hole_offset = 0; + cbimg.hole_length = BLCKSZ; } else { - /* No "hole" to remove */ - bimg.hole_offset = 0; - cbimg.hole_length = 0; + /* Assume we can omit data between pd_lower and pd_upper */ + uint16 lower = ((PageHeader) page)->pd_lower; + uint16 upper = ((PageHeader) page)->pd_upper; + + if (lower >= SizeOfPageHeaderData && + upper > lower && + upper <= BLCKSZ) + { + bimg.hole_offset = lower; + cbimg.hole_length = upper - lower; + } + else + { + /* No "hole" to remove */ + bimg.hole_offset = 0; + cbimg.hole_length = 0; + } } } else @@ -765,6 +773,11 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, rdt_datas_last->data = page; rdt_datas_last->len = BLCKSZ; } + else if (bimg.length == 0) + { + rdt_datas_last->data = page; + rdt_datas_last->len = 0; + } else { /* must skip the hole */ diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c index 5f46b007acc..128041d0f40 100644 --- a/src/backend/access/transam/xlogreader.c +++ b/src/backend/access/transam/xlogreader.c @@ -1840,7 +1840,9 @@ DecodeXLogRecord(XLogReaderState *state, if ((blk->bimg_info & BKPIMAGE_HAS_HOLE) && (blk->hole_offset == 0 || blk->hole_length == 0 || - blk->bimg_len == BLCKSZ)) + blk->bimg_len == BLCKSZ) && + !(blk->hole_offset == 0 && + blk->hole_length == BLCKSZ)) /* null page */ { report_invalid_record(state, "BKPIMAGE_HAS_HOLE set, but hole offset %u length %u block image length %u at %X/%X", From 1127d0a4ad454a8b7cbbd7717ef348ace407a85e Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Thu, 14 Dec 2023 08:39:19 -0800 Subject: [PATCH 08/13] Prevent output callbacks from hearing about neon-file messages (#328) * Prevent output callbacks from hearing about neon-file messages --- src/backend/replication/logical/logical.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/backend/replication/logical/logical.c b/src/backend/replication/logical/logical.c index 41243d0187a..53563b85c8f 100644 --- a/src/backend/replication/logical/logical.c +++ b/src/backend/replication/logical/logical.c @@ -1236,6 +1236,8 @@ message_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, if (ctx->callbacks.message_cb == NULL) return; + if (strcmp(prefix, "neon-file") == 0) + return; /* Push callback + info on the error context stack */ state.ctx = ctx; @@ -1551,6 +1553,8 @@ stream_message_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, /* this callback is optional */ if (ctx->callbacks.stream_message_cb == NULL) return; + if (strcmp(prefix, "neon-file")) == 0) + return; /* Push callback + info on the error context stack */ state.ctx = ctx; From de8242c400f7870084861ac5796e0b5088b1898d Mon Sep 17 00:00:00 2001 
From: Sasha Krassovsky Date: Fri, 15 Dec 2023 09:36:35 -0800 Subject: [PATCH 09/13] Use strncmp instead of strcmp --- src/backend/replication/logical/logical.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend/replication/logical/logical.c b/src/backend/replication/logical/logical.c index 53563b85c8f..8f56bc804ef 100644 --- a/src/backend/replication/logical/logical.c +++ b/src/backend/replication/logical/logical.c @@ -1236,7 +1236,7 @@ message_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, if (ctx->callbacks.message_cb == NULL) return; - if (strcmp(prefix, "neon-file") == 0) + if (strncmp(prefix, "neon-file", strlen("neon-file")) == 0) return; /* Push callback + info on the error context stack */ @@ -1553,7 +1553,7 @@ stream_message_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, /* this callback is optional */ if (ctx->callbacks.stream_message_cb == NULL) return; - if (strcmp(prefix, "neon-file")) == 0) + if (strncmp(prefix, "neon-file", strlen("neon-file")) == 0) return; /* Push callback + info on the error context stack */ From 225071f482774943854c2eec4540757e01171557 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 20 Dec 2023 20:51:25 +0300 Subject: [PATCH 10/13] Flush slot creation/drop. --- src/backend/replication/slot.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c index e3424957303..7934d51f3c6 100644 --- a/src/backend/replication/slot.c +++ b/src/backend/replication/slot.c @@ -692,7 +692,7 @@ ReplicationSlotDropPtr(ReplicationSlot *slot) char prefix[MAXPGPATH]; snprintf(prefix, sizeof(prefix), "neon-file:%s/state", path); elog(LOG, "Drop replication slot %s", path); - LogLogicalMessage(prefix, NULL, 0, false); + XLogFlush(LogLogicalMessage(prefix, NULL, 0, false)); } /* @@ -1811,7 +1811,7 @@ SaveSlotToPath(ReplicationSlot *slot, const char *dir, int elevel) char prefix[MAXPGPATH]; snprintf(prefix, sizeof(prefix), "neon-file:%s", path); elog(LOG, "Save replication slot at %s restart_lsn=%X/%X", path, LSN_FORMAT_ARGS(cp.slotdata.restart_lsn)); - LogLogicalMessage(prefix, (char*)&cp, sizeof cp, false); + XLogFlush(LogLogicalMessage(prefix, (char*)&cp, sizeof cp, false)); } errno = 0; From 73eda3d05b8d4761525965c6a1ce73cc13a985d4 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 19 Oct 2023 12:11:00 +0300 Subject: [PATCH 11/13] Remove excessive walsender reply logging. 
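The XLogFlush() calls added in patch 10 above follow a simple pattern: LogLogicalMessage returns the LSN at the end of the record it inserted, and flushing WAL up to that LSN before returning makes the slot create/drop durable even if the compute crashes immediately afterwards. A minimal sketch (the helper name is made up; both calls are stock PostgreSQL):

/*
 * Sketch of the durability pattern from patch 10: emit the "neon-file"
 * logical message and flush WAL up to its end LSN before returning.
 */
#include "postgres.h"

#include "access/xlog.h"
#include "replication/message.h"

static void
neon_wallog_file_durable(const char *prefix, const char *data, size_t size)
{
	XLogRecPtr	endlsn = LogLogicalMessage(prefix, data, size, false);

	/* without the flush, a crash right after this call could lose the message */
	XLogFlush(endlsn);
}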
--- src/backend/replication/walsender.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 4ecfbd4a002..ebb9ddc9b9b 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -2087,13 +2087,6 @@ ProcessStandbyReplyMessage(void) applyPtr, replyTime, replyRequested); - - elog(LOG, "ProcessStandbyReplyMessage: writelsn %X/%X", - LSN_FORMAT_ARGS(writePtr)); - elog(LOG, "ProcessStandbyReplyMessage: flushlsn %X/%X", - LSN_FORMAT_ARGS(flushPtr)); - elog(LOG, "ProcessStandbyReplyMessage: applylsn %X/%X", - LSN_FORMAT_ARGS(applyPtr)); } void From 7be4a52d728459b79b59343c57d338c3073059c8 Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Mon, 8 Jan 2024 13:48:10 -0800 Subject: [PATCH 12/13] Allow publications FOR ALL TABLES to neon_superuser --- src/backend/commands/publicationcmds.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/backend/commands/publicationcmds.c b/src/backend/commands/publicationcmds.c index f4ba572697a..464edf166cd 100644 --- a/src/backend/commands/publicationcmds.c +++ b/src/backend/commands/publicationcmds.c @@ -728,6 +728,13 @@ CheckPubRelationColumnList(char *pubname, List *tables, } } +static bool +is_neon_superuser(void) +{ + Oid neon_superuser_oid = get_role_oid("neon_superuser", true /*missing_ok*/); + return neon_superuser_oid != InvalidOid && has_privs_of_role(GetCurrentRoleId(), neon_superuser_oid); +} + /* * Create new publication. */ @@ -755,7 +762,7 @@ CreatePublication(ParseState *pstate, CreatePublicationStmt *stmt) get_database_name(MyDatabaseId)); /* FOR ALL TABLES requires superuser */ - if (stmt->for_all_tables && !superuser()) + if (stmt->for_all_tables && !superuser() && !is_neon_superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("must be superuser to create FOR ALL TABLES publication"))); @@ -826,7 +833,7 @@ CreatePublication(ParseState *pstate, CreatePublicationStmt *stmt) &schemaidlist); /* FOR TABLES IN SCHEMA requires superuser */ - if (schemaidlist != NIL && !superuser()) + if (schemaidlist != NIL && !superuser() && !is_neon_superuser()) ereport(ERROR, errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("must be superuser to create FOR TABLES IN SCHEMA publication")); From e65f4897c79cf76aca5fb8633508b404e52c175b Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 17 Jan 2024 10:58:13 +0200 Subject: [PATCH 13/13] Integrate Tomas Vondra prefetch implementation of heap pages referenced by index --- src/backend/access/nbtree/README | 15 +- src/backend/access/nbtree/nbtsearch.c | 98 +-- src/backend/commands/explain.c | 7 +- src/backend/executor/Makefile | 1 + src/backend/executor/execMain.c | 12 + src/backend/executor/execPrefetch.c | 885 +++++++++++++++++++++++ src/backend/executor/instrument.c | 2 + src/backend/executor/nodeIndexonlyscan.c | 114 ++- src/backend/executor/nodeIndexscan.c | 69 +- src/backend/storage/smgr/md.c | 9 + src/backend/storage/smgr/smgr.c | 8 + src/include/executor/executor.h | 53 ++ src/include/executor/instrument.h | 1 + src/include/nodes/execnodes.h | 10 + src/include/pg_config_manual.h | 2 - src/include/storage/md.h | 2 + src/include/storage/smgr.h | 4 + src/tools/pgindent/typedefs.list | 3 + 18 files changed, 1194 insertions(+), 101 deletions(-) create mode 100644 src/backend/executor/execPrefetch.c diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index bf63b519924..b1aeb2514c4 100644 --- 
a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -1100,19 +1100,8 @@ We should prefetch not only leaf pages, but also the next parent page. The trick is to correctly calculate the moment when it will be needed: We should not issue the prefetch request when prefetch requests for all children from the current parent page have already been issued, but when there are only effective_io_concurrency line pointers left to prefetch from the page. -Currently there are two different prefetch implementations for -index-only scan and index scan. Index-only scan doesn't need to access heap tuples so it prefetches -only B-Tree leave pages (and their parents). Prefetch of index-only scan is performed only -if parallel plan is not used. Parallel index scan is using critical section for obtaining next -page by parallel worker. Leaf page is loaded in this critical section. -And if most of time is spent in loading the page, then it actually eliminates any concurrency -and makes prefetch useless. For relatively small tables Postgres will not choose parallel plan in -any case. And for large tables it can be enforced by setting max_parallel_workers_per_gather=0. - -Prefetch for normal (not index-only) index tries to prefetch heap tuples -referenced from leaf page. Average number of items per page -is about 100 which is comparable with default value of effective_io_concurrency. -So there is not so much sense trying to prefetch also next leaf page. +Prefetching of referenced heap pages (TIDs) is performed by executor. +Read explanation in execPrefetch.c As far as it is difficult to estimate number of entries traversed by index scan, we prefer not to prefetch large number of pages from the very beginning. diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index cefb363acfe..f2aceb5103a 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -51,6 +51,8 @@ static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir); static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir); #define INCREASE_PREFETCH_DISTANCE_STEP 1 +/* Do not prefetch too much leaves pages to avoid overflow of prefetch queue because it is also used to prefetch references heap paghes */ +#define MAX_IOS_PREFETCH_DISTANCE 8 /* * _bt_drop_lock_and_maybe_pin() @@ -1174,32 +1176,25 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) } } - /* Neon: initialize prefetch */ - so->n_prefetch_requests = 0; - so->n_prefetch_blocks = 0; - so->last_prefetch_index = 0; - so->next_parent = P_NONE; - so->prefetch_maximum = IsCatalogRelation(rel) - ? effective_io_concurrency - : get_tablespace_io_concurrency(rel->rd_rel->reltablespace); - - if (scan->xs_want_itup) /* index only scan */ + /* Neon: initialize prefetch of leaves pages for index-only scan. + * Prefetching of references HEAP pages is done on executor. + * We disable prefetch for parallel index-only scan. + * Neon prefetch is efficient only if prefetched blocks are accessed by the same worker + * which issued prefetch request. The logic of splitting pages between parallel workers in + * index scan doesn't allow to satisfy this requirement. + */ + if (scan->xs_want_itup && enable_indexonlyscan_prefetch && !scan->parallel_scan) { - if (enable_indexonlyscan_prefetch) - { - /* We disable prefetch for parallel index-only scan. - * Neon prefetch is efficient only if prefetched blocks are accessed by the same worker - * which issued prefetch request. 
The logic of splitting pages between parallel workers in - * index scan doesn't allow to satisfy this requirement. - * Also prefetch of leave pages will be useless if expected number of rows fits in one page. - */ - if (scan->parallel_scan) - so->prefetch_maximum = 0; /* disable prefetch */ - } - else - so->prefetch_maximum = 0; /* disable prefetch */ + so->n_prefetch_requests = 0; + so->n_prefetch_blocks = 0; + so->last_prefetch_index = 0; + so->next_parent = P_NONE; + so->prefetch_maximum = IsCatalogRelation(rel) + ? effective_io_concurrency + : get_tablespace_io_concurrency(rel->rd_rel->reltablespace); + so->prefetch_maximum = Min(so->prefetch_maximum, MAX_IOS_PREFETCH_DISTANCE); } - else if (!enable_indexscan_prefetch || !scan->heapRelation) + else so->prefetch_maximum = 0; /* disable prefetch */ /* If key bounds are not specified, then we will scan the whole relation and it make sense to start with the largest possible prefetch distance */ @@ -1631,62 +1626,7 @@ _bt_next(IndexScanDesc scan, ScanDirection dir) scan->xs_heaptid = currItem->heapTid; if (scan->xs_want_itup) /* index-only scan */ - { scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset); - } - else if (so->prefetch_maximum > 0) - { - int prefetchLimit, prefetchDistance; - - /* Neon: prefetch referenced heap pages. - * As far as it is difficult to predict how much items index scan will return - * we do not want to prefetch many heap pages from the very beginning because - * them may not be needed. So we are going to increase prefetch distance by INCREASE_PREFETCH_DISTANCE_STEP - * at each index scan iteration until it reaches prefetch_maximum. - */ - - /* Advance pefetch distance until it reaches prefetch_maximum */ - if (so->current_prefetch_distance + INCREASE_PREFETCH_DISTANCE_STEP <= so->prefetch_maximum) - so->current_prefetch_distance += INCREASE_PREFETCH_DISTANCE_STEP; - else - so->current_prefetch_distance = so->prefetch_maximum; - - /* How much we can prefetch */ - prefetchLimit = Min(so->current_prefetch_distance, so->currPos.lastItem - so->currPos.firstItem + 1); - - /* Active prefeth requests */ - prefetchDistance = so->n_prefetch_requests; - - /* - * Consume one prefetch request (if any) - */ - if (prefetchDistance != 0) - prefetchDistance -= 1; - - /* Keep number of active prefetch requests equal to the current prefetch distance. 
- * When prefetch distance reaches prefetch maximum, this loop performs at most one iteration, - * but at the beginning of index scan it performs up to INCREASE_PREFETCH_DISTANCE_STEP+1 iterations - */ - if (ScanDirectionIsForward(dir)) - { - while (prefetchDistance < prefetchLimit && so->currPos.itemIndex + prefetchDistance <= so->currPos.lastItem) - { - BlockNumber blkno = BlockIdGetBlockNumber(&so->currPos.items[so->currPos.itemIndex + prefetchDistance].heapTid.ip_blkid); - PrefetchBuffer(scan->heapRelation, MAIN_FORKNUM, blkno); - prefetchDistance += 1; - } - } - else - { - while (prefetchDistance < prefetchLimit && so->currPos.itemIndex - prefetchDistance >= so->currPos.firstItem) - { - BlockNumber blkno = BlockIdGetBlockNumber(&so->currPos.items[so->currPos.itemIndex - prefetchDistance].heapTid.ip_blkid); - PrefetchBuffer(scan->heapRelation, MAIN_FORKNUM, blkno); - prefetchDistance += 1; - } - } - so->n_prefetch_requests = prefetchDistance; /* update number of active prefetch requests */ - } return true; } diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 8a4f6886b91..3e56da1f2d7 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -3551,11 +3551,12 @@ show_prefetch_info(ExplainState *es, const PrefetchInfo* prefetch_info) if (es->format == EXPLAIN_FORMAT_TEXT) { ExplainIndentText(es); - appendStringInfo(es->str, "Prefetch: hits=%lld misses=%lld expired=%lld duplicates=%lld\n", + appendStringInfo(es->str, "Prefetch: hits=%lld misses=%lld expired=%lld duplicates=%lld tids=%lld\n", (long long) prefetch_info->hits, (long long) prefetch_info->misses, (long long) prefetch_info->expired, - (long long) prefetch_info->duplicates); + (long long) prefetch_info->duplicates, + (long long) prefetch_info->tids); } else { @@ -3567,6 +3568,8 @@ show_prefetch_info(ExplainState *es, const PrefetchInfo* prefetch_info) prefetch_info->expired, es); ExplainPropertyInteger("Prefetch Duplicated Requests", NULL, prefetch_info->duplicates, es); + ExplainPropertyInteger("Prefetch Index Tids", NULL, + prefetch_info->tids, es); } } diff --git a/src/backend/executor/Makefile b/src/backend/executor/Makefile index 11118d0ce02..840f5a6596a 100644 --- a/src/backend/executor/Makefile +++ b/src/backend/executor/Makefile @@ -24,6 +24,7 @@ OBJS = \ execMain.o \ execParallel.o \ execPartition.o \ + execPrefetch.o \ execProcnode.o \ execReplication.o \ execSRF.o \ diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 4c5a7bbf620..0d30c2e2f52 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -1645,6 +1645,18 @@ ExecutePlan(EState *estate, */ estate->es_direction = direction; + /* + * Enable prefetching only if the plan is executed exactly once. We need + * to disable prefetching for cases when the scan direction may change + * (e.g. for scrollable cursors). + * + * XXX It might be possible to improve the prefetching code to handle this + * by "walking back" the TID queue, but it's not clear if it's worth it. + * And if there pauses in between the fetches, the prefetched pages may + * get evicted, wasting the prefetch effort. + */ + estate->es_use_prefetching = execute_once; + /* * If the plan might potentially be executed multiple times, we must force * it to run without parallelism, because we might exit early. 
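The executor-side prefetching implemented by the new execPrefetch.c below boils down to the idea the README keeps for leaf pages: hold a small window of upcoming TIDs, issue PrefetchBuffer() for their heap blocks, and grow the window gradually so short scans do not pay for prefetches they never consume. A deliberately simplified sketch of that loop (variable and function names are illustrative, not the patch's API):

/*
 * Simplified sketch: prefetch heap pages for the next few TIDs of an index
 * scan, ramping the distance up by one step per call until it reaches the
 * effective_io_concurrency-derived maximum for the tablespace.
 */
#include "postgres.h"

#include "common/relpath.h"
#include "storage/bufmgr.h"
#include "storage/itemptr.h"
#include "utils/relcache.h"

#define PREFETCH_RAMP_STEP 1

static void
prefetch_heap_pages(Relation heapRel, ItemPointerData *items, int nitems,
					int cur, int *distance, int max_distance)
{
	int			limit;

	/* grow the prefetch distance gradually */
	*distance = Min(*distance + PREFETCH_RAMP_STEP, max_distance);

	limit = Min(cur + *distance, nitems - 1);
	for (int i = cur + 1; i <= limit; i++)
	{
		BlockNumber blkno = ItemPointerGetBlockNumber(&items[i]);

		PrefetchBuffer(heapRel, MAIN_FORKNUM, blkno);
	}
}

The real implementation additionally skips blocks that were prefetched recently and blocks that form a sequential pattern, which is what the cache and history structures described below are for.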
diff --git a/src/backend/executor/execPrefetch.c b/src/backend/executor/execPrefetch.c new file mode 100644 index 00000000000..3d1c58534cd --- /dev/null +++ b/src/backend/executor/execPrefetch.c @@ -0,0 +1,885 @@ +/*------------------------------------------------------------------------- + * + * execPrefetch.c + * routines for prefetching heap pages for index scans. + * + * The IndexPrefetch node represents an "index prefetcher" which reads TIDs + * from an index scan, and prefetches the referenced heap pages. The basic + * API consists of these methods: + * + * IndexPrefetchAlloc - allocate IndexPrefetch with custom callbacks + * IndexPrefetchNext - read next TID from the index scan, do prefetches + * IndexPrefetchReset - reset state of the prefetcher (for rescans) + * IndexPrefetchEnd - release resources held by the prefetcher + * + * When allocating a prefetcher, the caller can supply two custom callbacks: + * + * IndexPrefetchNextCB - reads the next TID from the index scan (required) + * IndexPrefetchCleanupCB - release private prefetch data (optional) + * + * These callbacks allow customizing the behavior for different types of + * index scans - for exampel index-only scans may inspect visibility map, + * and adjust prefetches based on that. + * + * + * TID queue + * --------- + * The prefetcher maintains a simple queue of TIDs fetched from the index. + * The length of the queue (number of TIDs) is determined by the prefetch + * target, i.e. effective_io_concurrency. Adding entries to the queue is + * the responsibility of IndexPrefetchFillQueue(), depending on the state + * of the scan etc. It also prefetches the pages, if appropriate. + * + * Note: This prefetching applies only to heap pages from the indexed + * relation, not the internal index pages. + * + * + * pattern detection + * ----------------- + * For certain access patterns, prefetching is inefficient. In particular, + * this applies to sequential access (where kernel read-ahead works fine) + * and for pages that are already in memory (prefetched recently). The + * prefetcher attempts to identify these two cases - sequential patterns + * are detected by IndexPrefetchBlockIsSequential, usign a tiny queue of + * recently prefetched blocks. Recently prefetched blocks are tracked in + * a "partitioned" LRU cache. + * + * Note: These are inherently best-effort heuristics. We don't know what + * the kernel algorithm/configuration is, or more precisely what already + * is in page cache. + * + * + * cache of recent prefetches + * -------------------------- + * Cache of recently prefetched blocks, organized as a hash table of LRU + * LRU caches. Doesn't need to be perfectly accurate, but we aim to make + * false positives/negatives reasonably low. For more details see the + * comments at IndexPrefetchIsCached. + * + * + * prefetch request number + * ----------------------- + * Prefetching works with the concept of "age" (e.g. "recently prefetched + * pages"). This relies on a simple prefetch counter, incremented every + * time a prefetch is issued. This is not exactly the same thing as time, + * as there may be arbitrary delays, it's good enough for this purpose. + * + * + * auto-tuning / self-adjustment + * ----------------------------- + * + * XXX Some ideas how to auto-tune the prefetching, so that unnecessary + * prefetching does not cause significant regressions (e.g. for nestloop + * with inner index scan). We could track number of rescans and number of + * items (TIDs) actually returned from the scan. 
Then we could calculate + * rows / rescan and adjust the prefetch target accordingly. That'd help + * with cases when a scan matches only very few rows, far less than the + * prefetchTarget, because the unnecessary prefetches are wasted I/O. + * Imagine a LIMIT on top of index scan, or something like that. + * + * XXX Could we tune the cache size based on execution statistics? We have + * a cache of limited size (PREFETCH_CACHE_SIZE = 1024 by default), but + * how do we know it's the right size? Ideally, we'd have a cache large + * enough to track actually cached blocks. If the OS caches 10240 pages, + * then we may do 90% of prefetch requests unnecessarily. Or maybe there's + * a lot of contention, blocks are evicted quickly, and 90% of the blocks + * in the cache are not actually cached anymore? But we do have a concept + * of sequential request ID (PrefetchCacheEntry->request), which gives us + * information about "age" of the last prefetch. Now it's used only when + * evicting entries (to keep the more recent one), but maybe we could also + * use it when deciding if the page is cached. Right now any block that's + * in the cache is considered cached and not prefetched, but maybe we could + * have "max age", and tune it based on feedback from reading the blocks + * later. For example, if we find the block in cache and decide not to + * prefetch it, but then later find we have to do I/O, it means our cache + * is too large. And we could "reduce" the maximum age (measured from the + * current prefetchRequest value), so that only more recent blocks would + * be considered cached. Not sure about the opposite direction, where we + * decide to prefetch a block - AFAIK we don't have a way to determine if + * I/O was needed or not in this case (so we can't increase the max age). + * But maybe we could di that somehow speculatively, i.e. increase the + * value once in a while, and see what happens. + * + * + * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/execPrefetch.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/genam.h" +#include "access/relscan.h" +#include "access/tableam.h" +#include "access/xact.h" +#include "catalog/index.h" +#include "common/hashfn.h" +#include "executor/executor.h" +#include "nodes/nodeFuncs.h" +#include "storage/bufmgr.h" +#include "utils/spccache.h" + + +/* + * An entry representing a recently prefetched block. For each block we know + * the request number, assigned sequentially, allowing us to decide how old + * the request is. + * + * XXX Is it enough to keep the request as uint32? This way we can prefetch + * 32TB of data, and this allows us to fit the whole entry into 64B, i.e. + * one cacheline. Which seems like a good thing. + * + * XXX If we're extra careful / paranoid about uint32, we could reset the + * cache once the request wraps around. + */ +typedef struct IndexPrefetchCacheEntry +{ + BlockNumber block; + uint32 request; +} IndexPrefetchCacheEntry; + +/* + * Size of the cache of recently prefetched blocks - shouldn't be too small or + * too large. 1024 entries seems about right, it covers ~8MB of data. This is + * rather arbitrary - there's no formula that'd tell us what the optimal size + * is, and we can't even tune it based on runtime (as it depends on what the + * other backends do too). 
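To make the "hash table of small LRUs" concrete: each prefetched block is remembered in one LRU bucket selected by hashing its block number, and within a bucket entries compete by request number (their "age"). A simplified sketch of the membership test follows, assuming request numbers start at 1 so that 0 marks an empty slot; LRU_SIZE and LRU_COUNT stand in for PREFETCH_LRU_SIZE and PREFETCH_LRU_COUNT, and the code is illustrative rather than the patch's own.

/*
 * Simplified sketch of the cache-membership test: map the block to one LRU
 * bucket, scan it linearly, and either refresh the entry or evict the oldest
 * one.  Returns true if the block was prefetched recently (skip prefetch).
 */
#include "postgres.h"

#include "common/hashfn.h"
#include "storage/block.h"

#define LRU_SIZE  8
#define LRU_COUNT 128

typedef struct
{
	BlockNumber block;
	uint32		request;		/* 0 means "empty slot" in this sketch */
} cache_entry;

static bool
cache_lookup_or_insert(cache_entry cache[LRU_COUNT][LRU_SIZE],
					   BlockNumber block, uint32 request)
{
	uint32		lru = hash_bytes((const unsigned char *) &block,
								 sizeof(BlockNumber)) % LRU_COUNT;
	int			oldest = 0;

	for (int i = 0; i < LRU_SIZE; i++)
	{
		if (cache[lru][i].request != 0 && cache[lru][i].block == block)
		{
			cache[lru][i].request = request;	/* refresh its age */
			return true;		/* recently prefetched, skip it */
		}
		if (cache[lru][i].request < cache[lru][oldest].request)
			oldest = i;
	}

	/* not found: remember it, evicting the oldest entry in this bucket */
	cache[lru][oldest].block = block;
	cache[lru][oldest].request = request;
	return false;
}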
+ * + * A value too small would mean we may issue unnecessary prefetches for pages + * that have already been prefetched recently (and are still in page cache), + * incurring costs for unnecessary fadvise() calls. + * + * A value too large would mean we do not issue prefetches for pages that have + * already been evicted from memory (both shared buffers and page cache). + * + * Note however that PrefetchBuffer() checks shared buffers before doing the + * fadvise call, which somewhat limits the risk of a small cache - the page + * would have to get evicted from shared buffers not yet from page cache. + * Also, the cost of not issuing a fadvise call (and doing synchronous I/O + * later) is much higher than the unnecessary fadvise call. For these reasons + * it's better to keep the cache fairly small. + * + * The cache is structured as an array of small LRU caches - you may also + * imagine it as a hash table of LRU caches. To remember a prefetched block, + * the block number mapped to a LRU using by hashing. And then in each LRU + * we organize the entries by age (per request number) - in particular, the + * age determines which entry gets evicted after the LRU gets full. + * + * The LRU needs to be small enough to be searched linearly. At the same + * time it needs to be sufficiently large to handle collisions when several + * hot blocks get mapped to the same LRU. For example, if the LRU was only + * a single entry, and there were two hot blocks mapped to it, that would + * often give incorrect answer. + * + * The 8 entries per LRU seems about right - it's small enough for linear + * search to work well, but large enough to be adaptive. It's not very + * likely for 9+ busy blocks (out of 1000 recent requests) to map to the + * same LRU. Assuming reasonable hash function. + * + * XXX Maybe we could consider effective_cache_size when sizing the cache? + * Not to size the cache for that, ofc, but maybe as a guidance of how many + * heap pages it might keep. Maybe just a fraction fraction of the value, + * say Max(8MB, effective_cache_size / max_connections) or something. + */ +#define PREFETCH_LRU_SIZE 8 /* slots in one LRU */ +#define PREFETCH_LRU_COUNT 128 /* number of LRUs */ +#define PREFETCH_CACHE_SIZE (PREFETCH_LRU_SIZE * PREFETCH_LRU_COUNT) + +/* + * Size of small sequential queue of most recently prefetched blocks, used + * to check if the block is exactly the same as the immediately preceding + * one (in which case prefetching is not needed), and if the blocks are a + * sequential pattern (in which case the kernel read-ahead is likely going + * to be more efficient, and we don't want to interfere with it). + */ +#define PREFETCH_QUEUE_HISTORY 8 + +/* + * An index prefetcher, which maintains a queue of TIDs from an index, and + * issues prefetches (if deemed beneficial and supported by the OS). + */ +typedef struct IndexPrefetch +{ + int prefetchTarget; /* how far we should be prefetching */ + int prefetchMaxTarget; /* maximum prefetching distance */ + int prefetchReset; /* reset to this distance on rescan */ + bool prefetchDone; /* did we get all TIDs from the index? */ + bool skipSequential; /* do nt prefetch for sequential access pattern */ + + /* runtime statistics, displayed in EXPLAIN etc. */ + uint32 countAll; /* all prefetch requests (including skipped) */ + uint32 countPrefetch; /* PrefetchBuffer calls */ + uint32 countSkipSequential; /* skipped as sequential pattern */ + uint32 countSkipCached; /* skipped as recently prefetched */ + + /* + * Queue of TIDs to prefetch. 
+ * + * XXX Sizing for MAX_IO_CONCURRENCY may be overkill, but it seems simpler + * than dynamically adjusting for custom values. However, 1000 entries + * means ~16kB, which means an oversized chunk, and thus always a malloc() + * call. However, we already have the prefetchCache, which is also large + * enough to cause this :-( + * + * XXX However what about the case without prefetching? In that case it + * would be nice to lower the malloc overhead, maybe? + */ + IndexPrefetchEntry queueItems[MAX_IO_CONCURRENCY]; + uint32 queueIndex; /* next TID to prefetch */ + uint32 queueStart; /* first valid TID in queue */ + uint32 queueEnd; /* first invalid (empty) TID in queue */ + + /* + * A couple of last prefetched blocks, used to check for certain access + * pattern and skip prefetching - e.g. for sequential access). + * + * XXX Separate from the main queue, because we only want to compare the + * block numbers, not the whole TID. In sequential access it's likely we + * read many items from each page, and we don't want to check many items + * (as that is much more expensive). + */ + BlockNumber blockItems[PREFETCH_QUEUE_HISTORY]; + uint32 blockIndex; /* index in the block (points to the first + * empty entry) */ + + /* + * Cache of recently prefetched blocks, organized as a hash table of small + * LRU caches. + */ + uint32 prefetchRequest; + IndexPrefetchCacheEntry prefetchCache[PREFETCH_CACHE_SIZE]; + + + /* + * Callback to customize the prefetch (decide which block need to be + * prefetched, etc.) + */ + IndexPrefetchNextCB next_cb; /* read next TID */ + IndexPrefetchCleanupCB cleanup_cb; /* cleanup data */ + + /* + * If a callback is specified, it may store global state (for all TIDs). + * For example VM buffer may be kept during IOS. This is similar to the + * data field in IndexPrefetchEntry, but that's per-TID. + */ + void *data; +} IndexPrefetch; + +/* small sequential queue of recent blocks */ +#define PREFETCH_BLOCK_INDEX(v) ((v) % PREFETCH_QUEUE_HISTORY) + +/* access to the main hybrid cache (hash of LRUs) */ +#define PREFETCH_LRU_ENTRY(p, lru, idx) \ + &((p)->prefetchCache[(lru) * PREFETCH_LRU_SIZE + (idx)]) + +/* access to queue of TIDs (up to MAX_IO_CONCURRENCY elements) */ +#define PREFETCH_QUEUE_INDEX(a) ((a) % (MAX_IO_CONCURRENCY)) +#define PREFETCH_QUEUE_EMPTY(p) ((p)->queueEnd == (p)->queueIndex) + +/* + * macros to deal with prefetcher state + * + * FIXME may need rethinking, easy to confuse PREFETCH_ENABLED/PREFETCH_ACTIVE + */ +#define PREFETCH_ENABLED(p) ((p) && ((p)->prefetchMaxTarget > 0)) +#define PREFETCH_QUEUE_FULL(p) ((p)->queueEnd - (p)->queueIndex == (p)->prefetchTarget) +#define PREFETCH_DONE(p) ((p) && ((p)->prefetchDone && PREFETCH_QUEUE_EMPTY(p))) +#define PREFETCH_ACTIVE(p) (PREFETCH_ENABLED(p) && !(p)->prefetchDone) + + +/* + * IndexPrefetchBlockIsSequential + * Track the block number and check if the I/O pattern is sequential, + * or if the block is the same as the immediately preceding one. + * + * This also updates the small sequential cache of blocks. + * + * The prefetching overhead is fairly low, but for some access patterns the + * benefits are small compared to the extra overhead, or the prefetching may + * even be harmful. In particular, for sequential access the read-ahead + * performed by the OS is very effective/efficient and our prefetching may + * be pointless or (worse) even interfere with it. + * + * This identifies simple sequential patterns, using a tiny queue of recently + * prefetched block numbers (PREFETCH_QUEUE_HISTORY blocks). 
It also checks
+ * if the block is exactly the same as any of the blocks in the queue (the
+ * main cache has the block too, but checking the tiny cache is likely cheaper).
+ *
+ * The main prefetch queue is not really useful for this, as it stores
+ * full TIDs, while we only care about block numbers. Consider a nicely
+ * clustered table, with a perfectly sequential pattern when accessed through
+ * an index. Each heap page may have dozens of TIDs, filling the prefetch
+ * queue. But we need to compare block numbers - those may either not be
+ * in the queue anymore, or we have to walk many TIDs (making it expensive,
+ * and we're in a hot path).
+ *
+ * So a tiny queue of just block numbers seems like a better option.
+ *
+ * Returns true if the block is in a sequential pattern or was prefetched
+ * recently (and so should not be prefetched this time), or false (in which
+ * case it should be prefetched).
+ */
+static bool
+IndexPrefetchBlockIsSequential(IndexPrefetch *prefetch, BlockNumber block)
+{
+	int			idx;
+
+	/*
+	 * If the block queue is empty, just store the block and we're done (it's
+	 * neither a sequential pattern nor a recently prefetched block).
+	 */
+	if (prefetch->blockIndex == 0)
+	{
+		prefetch->blockItems[PREFETCH_BLOCK_INDEX(prefetch->blockIndex)] = block;
+		prefetch->blockIndex++;
+		return false;
+	}
+
+	/*
+	 * Check if it's the same as the immediately preceding block. We don't
+	 * want to prefetch the same block over and over (which would happen for
+	 * well correlated indexes).
+	 *
+	 * In principle we could rely on IndexPrefetchIsCached doing this using
+	 * the full cache, but this check is much cheaper and we need to look at
+	 * the preceding block anyway, so we just do it.
+	 *
+	 * Notice we haven't added the block to the block queue yet, and there
+	 * is a preceding block (i.e. blockIndex-1 is valid).
+	 */
+	if (prefetch->blockItems[PREFETCH_BLOCK_INDEX(prefetch->blockIndex - 1)] == block)
+		return true;
+
+	/*
+	 * Add the block number to the small queue.
+	 *
+	 * Done before checking if the pattern is sequential, because we want to
+	 * know about the block later, even if we end up skipping the prefetch.
+	 * Otherwise we'd not be able to detect longer sequential patterns - we'd
+	 * skip one block and then fail to skip the next couple blocks even in a
+	 * perfectly sequential pattern. And this oscillation might even prevent
+	 * the OS read-ahead from kicking in.
+	 */
+	prefetch->blockItems[PREFETCH_BLOCK_INDEX(prefetch->blockIndex)] = block;
+	prefetch->blockIndex++;
+
+	/*
+	 * Are there enough requests to confirm a sequential pattern? We only
+	 * consider something to be sequential after finding a sequence of
+	 * PREFETCH_QUEUE_HISTORY blocks.
+	 */
+	if (prefetch->blockIndex < PREFETCH_QUEUE_HISTORY)
+		return false;
+
+	/*
+	 * Check if the last couple blocks are in a sequential pattern. We look
+	 * for a sequential pattern of PREFETCH_QUEUE_HISTORY (8 by default), so
+	 * we look for patterns of 8 pages (64kB) including the new block.
+	 *
+	 * XXX Could it be harmful that we read the queue backwards? Maybe memory
+	 * prefetching works better for the forward direction?
+	 */
+	for (int i = 1; i < PREFETCH_QUEUE_HISTORY; i++)
+	{
+		/*
+		 * Calculate the index of the earlier block (we need to do -1 as we
+		 * already incremented the index after adding the new block to the
+		 * queue). So (blockIndex-1) is the new block.
+		 */
+		idx = PREFETCH_BLOCK_INDEX(prefetch->blockIndex - i - 1);
+
+		/*
+		 * For a sequential pattern, the block "i" steps back needs to have a
+		 * block number smaller by "i" than the current block.
+		 */
+		if (prefetch->blockItems[idx] != (block - i))
+			return false;
+
+		/* Don't prefetch if the block happens to be the same. */
+		if (prefetch->blockItems[idx] == block)
+			return false;
+	}
+
+	/* looks like a sequential pattern (so skip the prefetch) */
+	return true;
+}
+
+/*
+ * IndexPrefetchIsCached
+ *		Check if the block was prefetched recently, and update the cache.
+ *
+ * We don't want to prefetch blocks that we already prefetched recently. It's
+ * cheap but not free, and the overhead may be quite significant.
+ *
+ * We want to remember which blocks were prefetched recently, so that we can
+ * skip repeated prefetches. We also need to eventually forget these blocks
+ * as they may get evicted from memory (particularly the page cache, which is
+ * outside our control).
+ *
+ * A simple queue is not a viable option - it would allow expiring requests
+ * based on age, but it's very expensive to check (as it requires a linear
+ * search, and we need a fairly large number of entries). A hash table does
+ * not work because it does not allow expiring entries by age.
+ *
+ * The cache does not need to be perfect - false positives/negatives are
+ * both acceptable, as long as the rate is reasonably low.
+ *
+ * We use a hybrid cache that is organized as many small LRU caches. Each
+ * block is mapped to a particular LRU by hashing (so it's a bit like a
+ * hash table of LRUs). The LRU caches are tiny (e.g. 8 entries), and the
+ * expiration happens at the level of a single LRU (using age determined
+ * by the sequential request number).
+ *
+ * This allows quick searches and expiration, with false negatives (when a
+ * particular LRU has too many collisions with hot blocks, we may end up
+ * evicting entries that are more recent than some other LRU).
+ *
+ * For example, imagine 128 LRU caches, each with 8 entries - that's 1024
+ * requests in total (these are the default parameters), representing about
+ * 8MB of data.
+ *
+ * If we want to check if a block was recently prefetched, we calculate
+ * (hash(blkno) % 128) and search only the LRU at this index, using a linear
+ * search. If we want to add the block to the cache, we find either an
+ * empty slot or the "oldest" entry in the LRU, and store the block in it.
+ * If the block is already in the LRU, we only update the request number.
+ *
+ * The request age is determined using a prefetch counter, incremented every
+ * time we end up prefetching a block. The counter is uint32, so it should
+ * not wrap (we'd have to prefetch 32TB).
+ *
+ * If the request is no more than PREFETCH_CACHE_SIZE requests old, the block
+ * is considered "recently prefetched". That is, the maximum age is the same
+ * as the total capacity of the cache.
+ *
+ * Returns true if the block was recently prefetched (and thus we don't
+ * need to prefetch it again), or false (should do a prefetch).
+ *
+ * XXX It's a bit confusing that these return values are inverted compared to
+ * what IndexPrefetchBlockIsSequential does.
+ *
+ * XXX Should we increase the prefetch counter even if we determine the
+ * entry was recently prefetched? Then we might skip some request numbers
+ * (there'd be no entry with them).
+ */ +static bool +IndexPrefetchIsCached(IndexPrefetch *prefetch, BlockNumber block) +{ + IndexPrefetchCacheEntry *entry; + + /* map the block number the the LRU */ + int lru; + + /* age/index of the oldest entry in the LRU, to maybe use */ + uint64 oldestRequest = PG_UINT64_MAX; + int oldestIndex = -1; + + /* + * First add the block to the (tiny) queue and see if it's part of a + * sequential pattern. In this case we just ignore the block and don't + * prefetch it - we expect OS read-ahead to do a better job. + * + * XXX Maybe we should still add the block to the main cache, in case we + * happen to access it later. That might help if we happen to scan a lot + * of the table sequentially, and then randomly. Not sure that's very + * likely with index access, though. + */ + if (prefetch->skipSequential && IndexPrefetchBlockIsSequential(prefetch, block)) + { + prefetch->countSkipSequential++; + return true; + } + + /* Which LRU does this block belong to? */ + lru = hash_uint32(block) % PREFETCH_LRU_COUNT; + + /* + * Did we prefetch this block recently? Scan the LRU linearly, and while + * doing that, track the oldest (or empty) entry, so that we know where to + * put the block if we don't find a match. + */ + for (int i = 0; i < PREFETCH_LRU_SIZE; i++) + { + entry = PREFETCH_LRU_ENTRY(prefetch, lru, i); + + /* + * Is this the oldest prefetch request in this LRU? + * + * Notice that request is uint32, so an empty entry (with request=0) + * is automatically oldest one. + */ + if (entry->request < oldestRequest) + { + oldestRequest = entry->request; + oldestIndex = i; + } + + /* Skip unused entries. */ + if (entry->request == 0) + continue; + + /* Is this entry for the same block as the current request? */ + if (entry->block == block) + { + bool prefetched; + + /* + * Is the old request sufficiently recent? If yes, we treat the + * block as already prefetched. We need to check before updating + * the prefetch request. + * + * XXX We do add the cache size to the request in order not to + * have issues with underflows. + */ + prefetched = ((entry->request + PREFETCH_CACHE_SIZE) >= prefetch->prefetchRequest); + + prefetch->countSkipCached += (prefetched) ? 1 : 0; + + /* Update the request number. */ + entry->request = ++prefetch->prefetchRequest; + + return prefetched; + } + } + + /* + * We didn't find the block in the LRU, so store it the "oldest" prefetch + * request in this LRU (which might be an empty entry). + */ + Assert((oldestIndex >= 0) && (oldestIndex < PREFETCH_LRU_SIZE)); + + entry = PREFETCH_LRU_ENTRY(prefetch, lru, oldestIndex); + + entry->block = block; + entry->request = ++prefetch->prefetchRequest; + + /* not in the prefetch cache */ + return false; +} + +/* + * IndexPrefetchHeapPage + * Prefetch a heap page for the TID, unless it's sequential or was + * recently prefetched. + */ +static void +IndexPrefetchHeapPage(IndexScanDesc scan, IndexPrefetch *prefetch, IndexPrefetchEntry *entry) +{ + BlockNumber block = ItemPointerGetBlockNumber(&entry->tid); + + prefetch->countAll++; + + /* + * Do not prefetch the same block over and over again, if it's probably + * still in memory (page cache). + * + * This happens e.g. for clustered or naturally correlated indexes (fkey + * to a sequence ID). It's not expensive (the block is in page cache + * already, so no I/O), but it's not free either. + * + * If we make a mistake and prefetch a buffer that's still in our shared + * buffers, PrefetchBuffer will take care of that. If it's in page cache, + * we'll issue an unnecessary prefetch. 
There's not much we can do about + * that, unfortunately. + * + * XXX Maybe we could check PrefetchBufferResult and adjust countPrefetch + * based on that? + */ + if (IndexPrefetchIsCached(prefetch, block)) + return; + + prefetch->countPrefetch++; + + PrefetchBuffer(scan->heapRelation, MAIN_FORKNUM, block); + pgBufferUsage.prefetch.tids += 1; +} + +/* + * IndexPrefetchFillQueue + * Fill the prefetch queue and issue necessary prefetch requests. + * + * If the prefetching is still active (enabled, not reached end of scan), read + * TIDs into the queue until we hit the current target. + * + * This also ramps-up the prefetch target from 0 to prefetch_max, determined + * when allocating the prefetcher. + */ +static void +IndexPrefetchFillQueue(IndexScanDesc scan, IndexPrefetch *prefetch, ScanDirection direction) +{ + /* When inactive (not enabled or end of scan reached), we're done. */ + if (!PREFETCH_ACTIVE(prefetch)) + return; + + /* + * Ramp up the prefetch distance incrementally. + * + * Intentionally done as first, before reading the TIDs into the queue, so + * that there's always at least one item. Otherwise we might get into a + * situation where we start with target=0 and no TIDs loaded. + */ + prefetch->prefetchTarget = Min(prefetch->prefetchTarget + 1, + prefetch->prefetchMaxTarget); + + /* + * Read TIDs from the index until the queue is full (with respect to the + * current prefetch target). + */ + while (!PREFETCH_QUEUE_FULL(prefetch)) + { + IndexPrefetchEntry *entry + = prefetch->next_cb(scan, direction, prefetch->data); + + /* no more entries in this index scan */ + if (entry == NULL) + { + prefetch->prefetchDone = true; + return; + } + + Assert(ItemPointerEquals(&entry->tid, &scan->xs_heaptid)); + + /* store the entry and then maybe issue the prefetch request */ + prefetch->queueItems[PREFETCH_QUEUE_INDEX(prefetch->queueEnd++)] = *entry; + + /* issue the prefetch request? */ + if (entry->prefetch) + IndexPrefetchHeapPage(scan, prefetch, entry); + } +} + +/* + * IndexPrefetchNextEntry + * Get the next entry from the prefetch queue (or from the index directly). + * + * If prefetching is enabled, get next entry from the prefetch queue (unless + * queue is empty). With prefetching disabled, read an entry directly from the + * index scan. + * + * XXX not sure this correctly handles xs_heap_continue - see index_getnext_slot, + * maybe nodeIndexscan needs to do something more to handle this? Although, that + * should be in the indexscan next_cb callback, probably. + * + * XXX If xs_heap_continue=true, we need to return the last TID. + */ +static IndexPrefetchEntry * +IndexPrefetchNextEntry(IndexScanDesc scan, IndexPrefetch *prefetch, ScanDirection direction) +{ + IndexPrefetchEntry *entry = NULL; + + /* + * With prefetching enabled (even if we already finished reading all TIDs + * from the index scan), we need to return a TID from the queue. + * Otherwise, we just get the next TID from the scan directly. + */ + if (PREFETCH_ENABLED(prefetch)) + { + /* Did we reach the end of the scan and the queue is empty? 
*/ + if (PREFETCH_DONE(prefetch)) + return NULL; + + entry = palloc(sizeof(IndexPrefetchEntry)); + + entry->tid = prefetch->queueItems[PREFETCH_QUEUE_INDEX(prefetch->queueIndex)].tid; + entry->data = prefetch->queueItems[PREFETCH_QUEUE_INDEX(prefetch->queueIndex)].data; + + prefetch->queueIndex++; + + scan->xs_heaptid = entry->tid; + } + else /* not prefetching, just do the regular work */ + { + ItemPointer tid; + + /* Time to fetch the next TID from the index */ + tid = index_getnext_tid(scan, direction); + + /* If we're out of index entries, we're done */ + if (tid == NULL) + return NULL; + + Assert(ItemPointerEquals(tid, &scan->xs_heaptid)); + + entry = palloc(sizeof(IndexPrefetchEntry)); + + entry->tid = scan->xs_heaptid; + entry->data = NULL; + } + + return entry; +} + +/* + * IndexPrefetchComputeTarget + * Calculate prefetch distance for the given heap relation. + * + * We disable prefetching when using direct I/O (when there's no page cache + * to prefetch into), and scans where the prefetch distance may change (e.g. + * for scrollable cursors). + * + * In regular cases we look at effective_io_concurrency for the tablepace + * (of the heap, not the index), and cap it with plan_rows. + * + * XXX We cap the target to plan_rows, becausse it's pointless to prefetch + * more than we expect to use. + * + * XXX Maybe we should reduce the value with parallel workers? + */ +int +IndexPrefetchComputeTarget(Relation heapRel, double plan_rows, bool prefetch) +{ + /* + * No prefetching for direct I/O. + * + * XXX Shouldn't we do prefetching even for direct I/O? We would only + * pretend doing it now, ofc, because we'd not do posix_fadvise(), but + * once the code starts loading into shared buffers, that'd work. + */ + if ((io_direct_flags & IO_DIRECT_DATA) != 0) + return 0; + + /* disable prefetching (for cursors etc.) */ + if (!prefetch) + return 0; + + /* regular case, look at tablespace effective_io_concurrency */ + return Min(get_tablespace_io_concurrency(heapRel->rd_rel->reltablespace), + plan_rows); +} + +/* + * IndexPrefetchAlloc + * Allocate the index prefetcher. + * + * The behavior is customized by two callbacks - next_cb, which generates TID + * values to put into the prefetch queue, and (optional) cleanup_cb which + * releases resources at the end. + * + * prefetch_max specifies the maximum prefetch distance, i.e. how many TIDs + * ahead to keep in the prefetch queue. prefetch_max=0 means prefetching is + * disabled. + * + * data may point to a custom data, associated with the prefetcher. + */ +IndexPrefetch * +IndexPrefetchAlloc(IndexPrefetchNextCB next_cb, IndexPrefetchCleanupCB cleanup_cb, + int prefetch_max, bool skip_sequential, void *data) +{ + IndexPrefetch *prefetch = palloc0(sizeof(IndexPrefetch)); + + /* the next_cb callback is required */ + Assert(next_cb); + + /* valid prefetch distance */ + Assert((prefetch_max >= 0) && (prefetch_max <= MAX_IO_CONCURRENCY)); + + prefetch->queueIndex = 0; + prefetch->queueStart = 0; + prefetch->queueEnd = 0; + + prefetch->prefetchTarget = 0; + prefetch->prefetchMaxTarget = prefetch_max; + prefetch->skipSequential = skip_sequential; + /* + * Customize the prefetch to also check visibility map and keep the result + * so that IOS does not need to repeat it. + */ + prefetch->next_cb = next_cb; + prefetch->cleanup_cb = cleanup_cb; + prefetch->data = data; + + return prefetch; +} + +/* + * IndexPrefetchNext + * Read the next entry from the prefetch queue. 
+ * + * Returns the next TID in the prefetch queue (which might have been prefetched + * sometime in the past). If needed, it adds more entries to the queue and does + * the prefetching for them. + * + * Returns IndexPrefetchEntry with the TID and optional data associated with + * the TID in the next_cb callback. + */ +IndexPrefetchEntry * +IndexPrefetchNext(IndexScanDesc scan, IndexPrefetch *prefetch, ScanDirection direction) +{ + /* Do prefetching (if requested/enabled). */ + IndexPrefetchFillQueue(scan, prefetch, direction); + + /* Read the TID from the queue (or directly from the index). */ + return IndexPrefetchNextEntry(scan, prefetch, direction); +} + +/* + * IndexPrefetchReset + * Reset the prefetch TID, restart the prefetching. + * + * Useful during rescans etc. This also resets the prefetch target, so that + * each rescan does the initial prefetch ramp-up from target=0 to maximum + * prefetch distance. + */ +void +IndexPrefetchReset(IndexScanDesc scan, IndexPrefetch *state) +{ + if (!state) + return; + + state->queueIndex = 0; + state->queueStart = 0; + state->queueEnd = 0; + + state->prefetchDone = false; + state->prefetchTarget = 0; +} + +/* + * IndexPrefetchStats + * Log basic runtime debug stats of the prefetcher. + * + * FIXME Should be only in debug builds, or something like that. + */ +void +IndexPrefetchStats(IndexScanDesc scan, IndexPrefetch *state) +{ + if (!state) + return; + + elog(LOG, "index prefetch stats: requests %u prefetches %u (%f) skip cached %u sequential %u", + state->countAll, + state->countPrefetch, + state->countPrefetch * 100.0 / state->countAll, + state->countSkipCached, + state->countSkipSequential); +} + +/* + * IndexPrefetchEnd + * Release resources associated with the prefetcher. + * + * This is primarily about the private data the caller might have allocated + * in the next_cb, and stored in the data field. We don't know what the + * data might contain (e.g. buffers etc.), requiring additional cleanup, so + * we call another custom callback. + * + * Needs to be called at the end of the executor node. + * + * XXX Maybe if there's no callback, we should just pfree the data? Does + * not seem very useful, though. 
+ */
+void
+IndexPrefetchEnd(IndexScanDesc scan, IndexPrefetch *state)
+{
+	if (!state)
+		return;
+
+	if (!state->cleanup_cb)
+		return;
+
+	state->cleanup_cb(scan, state->data);
+}
diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c
index 01978c79f5c..9f29e5bdf1a 100644
--- a/src/backend/executor/instrument.c
+++ b/src/backend/executor/instrument.c
@@ -239,6 +239,7 @@ BufferUsageAdd(BufferUsage *dst, const BufferUsage *add)
 	dst->prefetch.misses += add->prefetch.misses;
 	dst->prefetch.expired += add->prefetch.expired;
 	dst->prefetch.duplicates += add->prefetch.duplicates;
+	dst->prefetch.tids += add->prefetch.tids;
 	INSTR_TIME_ADD(dst->blk_read_time, add->blk_read_time);
 	INSTR_TIME_ADD(dst->blk_write_time, add->blk_write_time);
 	INSTR_TIME_ADD(dst->temp_blk_read_time, add->temp_blk_read_time);
@@ -265,6 +266,7 @@ BufferUsageAccumDiff(BufferUsage *dst,
 	dst->prefetch.misses += add->prefetch.misses - sub->prefetch.misses;
 	dst->prefetch.expired += add->prefetch.expired - sub->prefetch.expired;
 	dst->prefetch.duplicates += add->prefetch.duplicates - sub->prefetch.duplicates;
+	dst->prefetch.tids += add->prefetch.tids - sub->prefetch.tids;
 	INSTR_TIME_ACCUM_DIFF(dst->blk_read_time,
 						  add->blk_read_time, sub->blk_read_time);
 	INSTR_TIME_ACCUM_DIFF(dst->blk_write_time,
diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c
index 0b43a9b9699..9f03ceb130b 100644
--- a/src/backend/executor/nodeIndexonlyscan.c
+++ b/src/backend/executor/nodeIndexonlyscan.c
@@ -36,6 +36,7 @@
 #include "access/tupdesc.h"
 #include "access/visibilitymap.h"
 #include "executor/execdebug.h"
+#include "executor/executor.h"
 #include "executor/nodeIndexonlyscan.h"
 #include "executor/nodeIndexscan.h"
 #include "miscadmin.h"
@@ -44,11 +45,14 @@
 #include "utils/memutils.h"
 #include "utils/rel.h"
 
-
 static TupleTableSlot *IndexOnlyNext(IndexOnlyScanState *node);
 static void StoreIndexTuple(TupleTableSlot *slot, IndexTuple itup,
 							TupleDesc itupdesc);
-
+static IndexPrefetchEntry *IndexOnlyPrefetchNext(IndexScanDesc scan,
+												 ScanDirection direction,
+												 void *data);
+static void IndexOnlyPrefetchCleanup(IndexScanDesc scan,
+									 void *data);
 
 /* ----------------------------------------------------------------
  *		IndexOnlyNext
@@ -65,6 +69,8 @@ IndexOnlyNext(IndexOnlyScanState *node)
 	IndexScanDesc scandesc;
 	TupleTableSlot *slot;
 	ItemPointer tid;
+	IndexPrefetch *prefetch;
+	IndexPrefetchEntry *entry;
 
 	/*
 	 * extract necessary information from index scan node
@@ -78,11 +84,14 @@ IndexOnlyNext(IndexOnlyScanState *node)
 	direction = ScanDirectionCombine(estate->es_direction,
 									 ((IndexOnlyScan *) node->ss.ps.plan)->indexorderdir);
 	scandesc = node->ioss_ScanDesc;
+	prefetch = node->ioss_prefetch;
 	econtext = node->ss.ps.ps_ExprContext;
 	slot = node->ss.ss_ScanTupleSlot;
 
 	if (scandesc == NULL)
 	{
+		int			prefetch_max;
+
 		/*
 		 * We reach here if the index only scan is not parallel, or if we're
 		 * serially executing an index only scan that was planned to be
@@ -111,15 +120,40 @@ IndexOnlyNext(IndexOnlyScanState *node)
 						 node->ioss_NumScanKeys,
 						 node->ioss_OrderByKeys,
 						 node->ioss_NumOrderByKeys);
+
+		/*
+		 * Also initialize the index prefetcher. We do this even when prefetching
+		 * is not done (see IndexPrefetchComputeTarget), because the prefetcher
+		 * is used for all index reads.
+		 *
+		 * XXX Maybe we should reduce the target in case this is a parallel index
+		 * scan. We don't want to issue a multiple of effective_io_concurrency.
+		 *
+		 * XXX Maybe rename the object to "index reader" or something?
+ */ + prefetch_max = IndexPrefetchComputeTarget(node->ss.ss_currentRelation, + node->ss.ps.plan->plan_rows, + estate->es_use_prefetching); + + node->ioss_prefetch = IndexPrefetchAlloc(IndexOnlyPrefetchNext, + IndexOnlyPrefetchCleanup, + prefetch_max, + smgr_support_read_ahead(RelationGetSmgr(node->ioss_RelationDesc)), + palloc0(sizeof(Buffer))); } /* * OK, now that we have what we need, fetch the next tuple. */ - while ((tid = index_getnext_tid(scandesc, direction)) != NULL) + while ((entry = IndexPrefetchNext(scandesc, prefetch, direction)) != NULL) { + bool *all_visible = NULL; bool tuple_from_heap = false; + /* unpack the entry */ + tid = &entry->tid; + all_visible = (bool *) entry->data; /* result of visibility check */ + CHECK_FOR_INTERRUPTS(); /* @@ -155,8 +189,12 @@ IndexOnlyNext(IndexOnlyScanState *node) * * It's worth going through this complexity to avoid needing to lock * the VM buffer, which could cause significant contention. + * + * XXX Skip if we already know the page is all visible from + * prefetcher. */ - if (!VM_ALL_VISIBLE(scandesc->heapRelation, + if (!(all_visible && *all_visible) && + !VM_ALL_VISIBLE(scandesc->heapRelation, ItemPointerGetBlockNumber(tid), &node->ioss_VMBuffer)) { @@ -353,6 +391,9 @@ ExecReScanIndexOnlyScan(IndexOnlyScanState *node) node->ioss_ScanKeys, node->ioss_NumScanKeys, node->ioss_OrderByKeys, node->ioss_NumOrderByKeys); + /* also reset the prefetcher, so that we start from scratch */ + IndexPrefetchReset(node->ioss_ScanDesc, node->ioss_prefetch); + ExecScanReScan(&node->ss); } @@ -380,6 +421,12 @@ ExecEndIndexOnlyScan(IndexOnlyScanState *node) node->ioss_VMBuffer = InvalidBuffer; } + /* XXX Print some debug stats. Should be removed. */ + IndexPrefetchStats(indexScanDesc, node->ioss_prefetch); + + /* Release VM buffer pin from prefetcher, if any. */ + IndexPrefetchEnd(indexScanDesc, node->ioss_prefetch); + /* * Free the exprcontext(s) ... now dead code, see ExecFreeExprContext */ @@ -731,3 +778,62 @@ ExecIndexOnlyScanInitializeWorker(IndexOnlyScanState *node, node->ioss_ScanKeys, node->ioss_NumScanKeys, node->ioss_OrderByKeys, node->ioss_NumOrderByKeys); } + +/* + * When prefetching for IOS, we want to only prefetch pages that are not + * marked as all-visible (because not fetching all-visible pages is the + * point of IOS). + * + * The buffer used by the VM_ALL_VISIBLE() check is reused, similarly to + * ioss_VMBuffer (maybe we could/should use it here too?). We also keep + * the result of the all_visible flag, so that the main loop does not to + * do it again. + */ +static IndexPrefetchEntry * +IndexOnlyPrefetchNext(IndexScanDesc scan, ScanDirection direction, void *data) +{ + IndexPrefetchEntry *entry = NULL; + ItemPointer tid; + + Assert(data); + + if ((tid = index_getnext_tid(scan, direction)) != NULL) + { + BlockNumber blkno = ItemPointerGetBlockNumber(tid); + + bool all_visible = VM_ALL_VISIBLE(scan->heapRelation, + blkno, + (Buffer *) data); + + entry = palloc0(sizeof(IndexPrefetchEntry)); + + entry->tid = *tid; + + /* prefetch only if not all visible */ + entry->prefetch = !all_visible; + + /* store the all_visible flag in the private part of the entry */ + entry->data = palloc(sizeof(bool)); + *(bool *) entry->data = all_visible; + } + + return entry; +} + +/* + * For IOS, we may have a VM buffer in the private data, so make sure to + * release it properly. 
+ */ +static void +IndexOnlyPrefetchCleanup(IndexScanDesc scan, void *data) +{ + Buffer *buffer = (Buffer *) data; + + Assert(data); + + if (*buffer != InvalidBuffer) + { + ReleaseBuffer(*buffer); + *buffer = InvalidBuffer; + } +} diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index 4540c7781d2..9cc6a09bb9f 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -34,6 +34,7 @@ #include "access/tableam.h" #include "catalog/pg_am.h" #include "executor/execdebug.h" +#include "executor/executor.h" #include "executor/nodeIndexscan.h" #include "lib/pairingheap.h" #include "miscadmin.h" @@ -69,6 +70,9 @@ static void reorderqueue_push(IndexScanState *node, TupleTableSlot *slot, Datum *orderbyvals, bool *orderbynulls); static HeapTuple reorderqueue_pop(IndexScanState *node); +static IndexPrefetchEntry *IndexScanPrefetchNext(IndexScanDesc scan, + ScanDirection direction, + void *data); /* ---------------------------------------------------------------- * IndexNext @@ -85,6 +89,8 @@ IndexNext(IndexScanState *node) ScanDirection direction; IndexScanDesc scandesc; TupleTableSlot *slot; + IndexPrefetch *prefetch; + IndexPrefetchEntry *entry; /* * extract necessary information from index scan node @@ -98,11 +104,14 @@ IndexNext(IndexScanState *node) direction = ScanDirectionCombine(estate->es_direction, ((IndexScan *) node->ss.ps.plan)->indexorderdir); scandesc = node->iss_ScanDesc; + prefetch = node->iss_prefetch; econtext = node->ss.ps.ps_ExprContext; slot = node->ss.ss_ScanTupleSlot; if (scandesc == NULL) { + int prefetch_max; + /* * We reach here if the index scan is not parallel, or if we're * serially executing an index scan that was planned to be parallel. @@ -123,15 +132,44 @@ IndexNext(IndexScanState *node) index_rescan(scandesc, node->iss_ScanKeys, node->iss_NumScanKeys, node->iss_OrderByKeys, node->iss_NumOrderByKeys); + + /* + * Also initialize index prefetcher. We do this even when prefetching is + * not done (see IndexPrefetchComputeTarget), because the prefetcher is + * used for all index reads. + * + * XXX Maybe we should reduce the target in case this is a parallel index + * scan. We don't want to issue a multiple of effective_io_concurrency. + * + * XXX Maybe rename the object to "index reader" or something? + */ + prefetch_max = IndexPrefetchComputeTarget(node->ss.ss_currentRelation, + node->ss.ps.plan->plan_rows, + estate->es_use_prefetching); + + node->iss_prefetch = IndexPrefetchAlloc(IndexScanPrefetchNext, + NULL, /* no extra cleanup */ + prefetch_max, + smgr_support_read_ahead(RelationGetSmgr(node->iss_RelationDesc)), + NULL); } /* * ok, now that we have what we need, fetch the next tuple. */ - while (index_getnext_slot(scandesc, direction, slot)) + while ((entry = IndexPrefetchNext(scandesc, prefetch, direction)) != NULL) { CHECK_FOR_INTERRUPTS(); + /* + * Fetch the next (or only) visible heap tuple for this index entry. + * If we don't find anything, loop around and grab the next TID from + * the index. + */ + Assert(ItemPointerIsValid(&scandesc->xs_heaptid)); + if (!index_fetch_heap(scandesc, slot)) + continue; + /* * If the index was lossy, we have to recheck the index quals using * the fetched tuple. 
@@ -588,6 +626,9 @@ ExecReScanIndexScan(IndexScanState *node) node->iss_OrderByKeys, node->iss_NumOrderByKeys); node->iss_ReachedEnd = false; + /* also reset the prefetcher, so that we start from scratch */ + IndexPrefetchReset(node->iss_ScanDesc, node->iss_prefetch); + ExecScanReScan(&node->ss); } @@ -794,6 +835,9 @@ ExecEndIndexScan(IndexScanState *node) indexRelationDesc = node->iss_RelationDesc; indexScanDesc = node->iss_ScanDesc; + /* XXX Print some debug stats. Should be removed. */ + IndexPrefetchStats(indexScanDesc, node->iss_prefetch); + /* * Free the exprcontext(s) ... now dead code, see ExecFreeExprContext */ @@ -1744,3 +1788,26 @@ ExecIndexScanInitializeWorker(IndexScanState *node, node->iss_ScanKeys, node->iss_NumScanKeys, node->iss_OrderByKeys, node->iss_NumOrderByKeys); } + +/* + * XXX not sure this correctly handles xs_heap_continue - see index_getnext_slot, + * maybe nodeIndexscan needs to do something more to handle this? + */ +static IndexPrefetchEntry * +IndexScanPrefetchNext(IndexScanDesc scan, ScanDirection direction, void *data) +{ + IndexPrefetchEntry *entry = NULL; + ItemPointer tid; + + if ((tid = index_getnext_tid(scan, direction)) != NULL) + { + entry = palloc0(sizeof(IndexPrefetchEntry)); + + entry->tid = *tid; + + /* prefetch always */ + entry->prefetch = true; + } + + return entry; +} diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index cad698ec65d..7f0115bbb29 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -1621,3 +1621,12 @@ mdfiletagmatches(const FileTag *ftag, const FileTag *candidate) */ return ftag->rlocator.dbOid == candidate->rlocator.dbOid; } + +/* + * If underlying device support read ahead + */ +bool +mdreadahead(SMgrRelation reln) +{ + return true; /* Assume read-ahead is supported by default */ +} diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index 94a3e7b03b4..73556296405 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -47,6 +47,7 @@ static const f_smgr smgr_md = { .smgr_nblocks = mdnblocks, .smgr_truncate = mdtruncate, .smgr_immedsync = mdimmedsync, + .smgr_support_read_ahead = mdreadahead, }; /* @@ -748,6 +749,13 @@ smgr_end_unlogged_build(SMgrRelation reln) (*reln->smgr).smgr_end_unlogged_build(reln); } +bool +smgr_support_read_ahead(SMgrRelation reln) +{ + return (*reln->smgr).smgr_support_read_ahead + && (*reln->smgr).smgr_support_read_ahead(reln); +} + /* * AtEOXact_SMgr diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index ac02247947e..7be5f70f5f7 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -14,6 +14,7 @@ #ifndef EXECUTOR_H #define EXECUTOR_H +#include "access/genam.h" #include "executor/execdesc.h" #include "fmgr.h" #include "nodes/lockoptions.h" @@ -677,4 +678,56 @@ extern ResultRelInfo *ExecLookupResultRelByOid(ModifyTableState *node, bool missing_ok, bool update_cache); +/* + * prototypes from functions in execPrefetch.c + */ + +typedef struct IndexPrefetchEntry +{ + ItemPointerData tid; + + /* should we prefetch heap page for this TID? */ + bool prefetch; + + /* + * If a callback is specified, it may store per-tid information. The data + * has to be a single palloc-ed piece of data, so that it can be easily + * pfreed. + * + * XXX We could relax this by providing another cleanup callback, but that + * seems unnecessarily complex - we expect the information to be very + * simple, like bool flags or something. 
Easy to do in a simple struct, + * and perhaps even reuse without pfree/palloc. + */ + void *data; +} IndexPrefetchEntry; + +/* + * custom callback, allowing the user code to determine which TID to read + * + * If there is no TID to prefetch, the return value is expected to be NULL. + * + * Otherwise the "tid" field is expected to contain the TID to prefetch, and + * "data" may be set to custom information the callback needs to pass outside. + */ +typedef IndexPrefetchEntry *(*IndexPrefetchNextCB) (IndexScanDesc scan, + ScanDirection direction, + void *data); + +typedef void (*IndexPrefetchCleanupCB) (IndexScanDesc scan, + void *data); + +IndexPrefetch *IndexPrefetchAlloc(IndexPrefetchNextCB next_cb, + IndexPrefetchCleanupCB cleanup_cb, + int prefetch_max, bool skip_sequential, void *data); + +IndexPrefetchEntry *IndexPrefetchNext(IndexScanDesc scan, IndexPrefetch *state, + ScanDirection direction); + +extern void IndexPrefetchReset(IndexScanDesc scan, IndexPrefetch *state); +extern void IndexPrefetchStats(IndexScanDesc scan, IndexPrefetch *state); +extern void IndexPrefetchEnd(IndexScanDesc scan, IndexPrefetch *state); + +extern int IndexPrefetchComputeTarget(Relation heapRel, double plan_rows, bool prefetch); + #endif /* EXECUTOR_H */ diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index 3164c04dacc..23b615cb437 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -22,6 +22,7 @@ typedef struct int64 misses; int64 expired; int64 duplicates; + int64 tids; } PrefetchInfo; /* diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index c467dbf8d70..a6f082e60be 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -692,6 +692,7 @@ typedef struct EState struct EPQState *es_epq_active; bool es_use_parallel_mode; /* can we use parallel workers? */ + bool es_use_prefetching; /* can we use prefetching? */ /* The per-query shared memory area to use for parallel execution. */ struct dsa_area *es_query_dsa; @@ -1531,6 +1532,9 @@ typedef struct bool *elem_nulls; /* array of num_elems is-null flags */ } IndexArrayKeyInfo; +/* needs to be before IndexPrefetchCallback typedef */ +typedef struct IndexPrefetch IndexPrefetch; + /* ---------------- * IndexScanState information * @@ -1582,6 +1586,9 @@ typedef struct IndexScanState bool *iss_OrderByTypByVals; int16 *iss_OrderByTypLens; Size iss_PscanLen; + + /* prefetching */ + IndexPrefetch *iss_prefetch; } IndexScanState; /* ---------------- @@ -1620,6 +1627,9 @@ typedef struct IndexOnlyScanState TupleTableSlot *ioss_TableSlot; Buffer ioss_VMBuffer; Size ioss_PscanLen; + + /* prefetching */ + IndexPrefetch *ioss_prefetch; } IndexOnlyScanState; /* ---------------- diff --git a/src/include/pg_config_manual.h b/src/include/pg_config_manual.h index 2870e31d087..e61e67197b9 100644 --- a/src/include/pg_config_manual.h +++ b/src/include/pg_config_manual.h @@ -158,9 +158,7 @@ * If you change this, you probably need to adjust the error message in * check_effective_io_concurrency.) 
*/ -#ifdef USE_POSIX_FADVISE #define USE_PREFETCH -#endif /* * Default and maximum values for backend_flush_after, bgwriter_flush_after diff --git a/src/include/storage/md.h b/src/include/storage/md.h index 941879ee6a8..8c6ca1dda45 100644 --- a/src/include/storage/md.h +++ b/src/include/storage/md.h @@ -51,4 +51,6 @@ extern int mdsyncfiletag(const FileTag *ftag, char *path); extern int mdunlinkfiletag(const FileTag *ftag, char *path); extern bool mdfiletagmatches(const FileTag *ftag, const FileTag *candidate); +extern bool mdreadahead(SMgrRelation reln); + #endif /* MD_H */ diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index d6158a0d067..abbda105f47 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -131,6 +131,8 @@ typedef struct f_smgr void (*smgr_start_unlogged_build) (SMgrRelation reln); void (*smgr_finish_unlogged_build_phase_1) (SMgrRelation reln); void (*smgr_end_unlogged_build) (SMgrRelation reln); + + bool (*smgr_support_read_ahead) (SMgrRelation reln); } f_smgr; typedef void (*smgr_init_hook_type) (void); @@ -188,4 +190,6 @@ extern void smgr_start_unlogged_build(SMgrRelation reln); extern void smgr_finish_unlogged_build_phase_1(SMgrRelation reln); extern void smgr_end_unlogged_build(SMgrRelation reln); +extern bool smgr_support_read_ahead(SMgrRelation reln); + #endif /* SMGR_H */ diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 85a3be4b98b..e5226d72f3e 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1173,6 +1173,9 @@ IndexOnlyScanState IndexOptInfo IndexOrderByDistance IndexPath +IndexPrefetch +IndexPrefetchCacheEntry +IndexPrefetchEntry IndexRuntimeKeyInfo IndexScan IndexScanDesc
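
Illustrative usage sketch (not part of the patch): the prefetcher API added in
execPrefetch.c is driven by the executor nodes roughly as shown below. This is
a minimal, simplified example; MyScanNext, MyScanRun and their parameters are
hypothetical placeholders, not symbols introduced by the patch. The real
callers are IndexNext() and IndexOnlyNext() above.

/* Callback producing the next TID for the prefetcher (required). */
static IndexPrefetchEntry *
MyScanNext(IndexScanDesc scan, ScanDirection direction, void *data)
{
	ItemPointer tid = index_getnext_tid(scan, direction);
	IndexPrefetchEntry *entry = NULL;

	if (tid != NULL)
	{
		entry = palloc0(sizeof(IndexPrefetchEntry));
		entry->tid = *tid;
		entry->prefetch = true;	/* always request a heap-page prefetch */
	}

	return entry;				/* NULL means end of scan */
}

/* Sketch of a scan node driving the prefetcher (simplified). */
static void
MyScanRun(IndexScanDesc scan, Relation heapRel, double plan_rows,
		  ScanDirection direction)
{
	IndexPrefetchEntry *entry;
	IndexPrefetch *prefetch;
	int			prefetch_max;

	/* Once per scan: pick a prefetch distance and allocate the prefetcher. */
	prefetch_max = IndexPrefetchComputeTarget(heapRel, plan_rows, true);
	prefetch = IndexPrefetchAlloc(MyScanNext,
								  NULL,		/* no cleanup callback */
								  prefetch_max,
								  true,		/* skip sequential patterns */
								  NULL);	/* no private per-scan data */

	/* Per tuple: returns TIDs in index order, prefetching heap pages ahead. */
	while ((entry = IndexPrefetchNext(scan, prefetch, direction)) != NULL)
	{
		/* ... fetch and process the heap tuple for entry->tid ... */
	}

	/* At node shutdown; on a rescan, IndexPrefetchReset() would be used. */
	IndexPrefetchEnd(scan, prefetch);
}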